package main

import (
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

// Icon represents a discovered favicon link.
type Icon struct {
	URL      string
	Source   string // "favicon_ico" or "link_rel"
	RelType  string // type attribute from <link> (e.g., "image/png")
	RelSizes string // sizes attribute from <link> (e.g., "32x32")
}

// ParseResult holds extracted data from HTML parsing.
type ParseResult struct {
	Title string
	Icons []Icon
}

// ParseHTML extracts the document title and the <link> icon tags from HTML.
//
// It walks the input with the streaming tokenizer rather than building a tree,
// so malformed markup simply ends the scan early and whatever was collected up
// to that point is returned. Scanning also stops at the first <body> start
// tag.
//
// protocol and hostname are forwarded to parseLinkTag, which uses them to
// resolve relative icon URLs.
//
// Only the text of the first <title> element is used for Title; later <title>
// elements (for example inside inline SVG when the page has no explicit <body>
// tag) are ignored instead of being appended to the page title.
func ParseHTML(body []byte, protocol, hostname string) ParseResult {
	// Upper bound on collected icons, so a pathological page cannot grow the
	// result without limit.
	const maxIcons = 50

	var (
		result       ParseResult
		titleBuilder strings.Builder
		inTitle      bool // currently between <title> and </title>
		titleDone    bool // the first <title> element has been closed
	)

	tokenizer := html.NewTokenizer(strings.NewReader(string(body)))
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			// End of document or parse error — return what we have.
			result.Title = cleanTitle(titleBuilder.String())
			return result

		case html.StartTagToken, html.SelfClosingTagToken:
			tn, hasAttr := tokenizer.TagName()
			switch string(tn) {
			case "title":
				// A self-closing <title/> carries no text, so only a real
				// start tag opens the title, and only the first one counts.
				if tt == html.StartTagToken && !titleDone {
					inTitle = true
				}
			case "link":
				if hasAttr && len(result.Icons) < maxIcons {
					if icon := parseLinkTag(tokenizer, protocol, hostname); icon != nil {
						result.Icons = append(result.Icons, *icon)
					}
				}
			case "body":
				// Stop parsing once <body> starts to save time — icons and
				// title are expected before it.
				result.Title = cleanTitle(titleBuilder.String())
				return result
			}

		case html.EndTagToken:
			tn, _ := tokenizer.TagName()
			if string(tn) == "title" && inTitle {
				inTitle = false
				titleDone = true
			}

		case html.TextToken:
			if inTitle {
				titleBuilder.Write(tokenizer.Text())
			}
		}
	}
}

// parseLinkTag extracts icon info from a <link> tag if it's a favicon.
func parseLinkTag(tokenizer *html.Tokenizer, protocol, hostname string) *Icon { var rel, href, typ, sizes string for { key, val, more := tokenizer.TagAttr() k := string(key) v := string(val) switch k { case "rel": rel = strings.ToLower(v) case "href": href = v case "type": typ = strings.ToLower(v) case "sizes": sizes = strings.ToLower(v) } if !more { break } } // Only interested in icon links if !strings.Contains(rel, "icon") { return nil } if href == "" { return nil } // Resolve relative URLs resolvedURL := resolveURL(href, protocol, hostname) if resolvedURL == "" { return nil } return &Icon{ URL: resolvedURL, Source: "link_rel", RelType: typ, RelSizes: sizes, } } // resolveURL resolves a potentially relative icon URL against the host's base URL. func resolveURL(href, protocol, hostname string) string { href = strings.TrimSpace(href) if href == "" { return "" } // Skip data: URIs if strings.HasPrefix(href, "data:") { return "" } // Already absolute if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") { return href } // Protocol-relative if strings.HasPrefix(href, "//") { return protocol + ":" + href } // Relative to root base := protocol + "://" + hostname if strings.HasPrefix(href, "/") { return base + href } // Relative path — resolve against root parsed, err := url.Parse(href) if err != nil { return "" } baseParsed, err := url.Parse(base + "/") if err != nil { return "" } return baseParsed.ResolveReference(parsed).String() } // cleanTitle trims whitespace and truncates to 512 chars. func cleanTitle(s string) string { s = strings.TrimSpace(s) // Collapse internal whitespace fields := strings.Fields(s) s = strings.Join(fields, " ") if len(s) > 512 { s = s[:512] } return s }