175 lines
3.6 KiB
Go
175 lines
3.6 KiB
Go
package main
|
|
|
|
import (
|
|
"net/url"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// Icon describes one favicon candidate discovered for a site.
type Icon struct {
	// URL is the location of the icon.
	URL string
	// Source records how the icon was found: "favicon_ico" or "link_rel".
	Source string
	// RelType is the type attribute of the <link> tag (e.g., "image/png").
	RelType string
	// RelSizes is the sizes attribute of the <link> tag (e.g., "32x32").
	RelSizes string
}
|
|
|
|
// ParseResult holds extracted data from HTML parsing.
|
|
type ParseResult struct {
|
|
Title string
|
|
Icons []Icon
|
|
}
|
|
|
|
// ParseHTML extracts the title and link rel="icon" tags from HTML.
|
|
// Uses a lenient tokenizer approach that handles malformed HTML.
|
|
func ParseHTML(body []byte, protocol, hostname string) ParseResult {
|
|
result := ParseResult{}
|
|
tokenizer := html.NewTokenizer(strings.NewReader(string(body)))
|
|
|
|
inTitle := false
|
|
var titleBuilder strings.Builder
|
|
|
|
for {
|
|
tt := tokenizer.Next()
|
|
switch tt {
|
|
case html.ErrorToken:
|
|
// End of document or parse error — return what we have
|
|
result.Title = cleanTitle(titleBuilder.String())
|
|
return result
|
|
|
|
case html.StartTagToken, html.SelfClosingTagToken:
|
|
tn, hasAttr := tokenizer.TagName()
|
|
tagName := string(tn)
|
|
|
|
if tagName == "title" && tt == html.StartTagToken {
|
|
inTitle = true
|
|
continue
|
|
}
|
|
|
|
if tagName == "link" && hasAttr {
|
|
icon := parseLinkTag(tokenizer, protocol, hostname)
|
|
if icon != nil {
|
|
result.Icons = append(result.Icons, *icon)
|
|
}
|
|
}
|
|
|
|
// Stop parsing after </head> to save time — icons and title are in <head>
|
|
if tagName == "body" {
|
|
result.Title = cleanTitle(titleBuilder.String())
|
|
return result
|
|
}
|
|
|
|
case html.EndTagToken:
|
|
tn, _ := tokenizer.TagName()
|
|
if string(tn) == "title" {
|
|
inTitle = false
|
|
}
|
|
|
|
case html.TextToken:
|
|
if inTitle {
|
|
titleBuilder.Write(tokenizer.Text())
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// parseLinkTag extracts icon info from a <link> tag if it's a favicon.
|
|
func parseLinkTag(tokenizer *html.Tokenizer, protocol, hostname string) *Icon {
|
|
var rel, href, typ, sizes string
|
|
|
|
for {
|
|
key, val, more := tokenizer.TagAttr()
|
|
k := string(key)
|
|
v := string(val)
|
|
|
|
switch k {
|
|
case "rel":
|
|
rel = strings.ToLower(v)
|
|
case "href":
|
|
href = v
|
|
case "type":
|
|
typ = strings.ToLower(v)
|
|
case "sizes":
|
|
sizes = strings.ToLower(v)
|
|
}
|
|
|
|
if !more {
|
|
break
|
|
}
|
|
}
|
|
|
|
// Only interested in icon links
|
|
if !strings.Contains(rel, "icon") {
|
|
return nil
|
|
}
|
|
|
|
if href == "" {
|
|
return nil
|
|
}
|
|
|
|
// Resolve relative URLs
|
|
resolvedURL := resolveURL(href, protocol, hostname)
|
|
if resolvedURL == "" {
|
|
return nil
|
|
}
|
|
|
|
return &Icon{
|
|
URL: resolvedURL,
|
|
Source: "link_rel",
|
|
RelType: typ,
|
|
RelSizes: sizes,
|
|
}
|
|
}
|
|
|
|
// resolveURL turns the href of an icon link into an absolute URL for the
// site identified by protocol (the scheme without "://") and hostname.
//
// It returns "" when href is blank, is a data: URI, cannot be parsed, or
// carries a scheme other than http/https (javascript:, mailto:, file:, ...),
// because none of those name a remotely fetchable icon. Relative paths are
// resolved against the site root, since the page's own path is not available
// here.
func resolveURL(href, protocol, hostname string) string {
	href = strings.TrimSpace(href)
	if href == "" {
		return ""
	}

	// Skip inline data: URIs. URL schemes are case-insensitive, so "DATA:"
	// must be rejected as well.
	const dataPrefix = "data:"
	if len(href) >= len(dataPrefix) && strings.EqualFold(href[:len(dataPrefix)], dataPrefix) {
		return ""
	}

	switch {
	case strings.HasPrefix(href, "http://"), strings.HasPrefix(href, "https://"):
		// Already absolute.
		return href
	case strings.HasPrefix(href, "//"):
		// Protocol-relative: borrow the site's scheme.
		return protocol + ":" + href
	}

	base := protocol + "://" + hostname
	if strings.HasPrefix(href, "/") {
		// Relative to the site root.
		return base + href
	}

	parsed, err := url.Parse(href)
	if err != nil {
		return ""
	}
	// url.Parse lower-cases the scheme. A reference that brings its own
	// non-http(s) scheme would otherwise pass through ResolveReference
	// untouched and be reported as an icon URL.
	switch parsed.Scheme {
	case "", "http", "https":
	default:
		return ""
	}

	baseParsed, err := url.Parse(base + "/")
	if err != nil {
		return ""
	}
	return baseParsed.ResolveReference(parsed).String()
}
|
|
|
|
// cleanTitle normalizes a raw <title> string: leading and trailing
// whitespace is removed, every internal run of whitespace becomes a single
// space, and the result is capped at 512 bytes.
//
// The cap is applied on a rune boundary, so a multi-byte UTF-8 character is
// never split; the returned string may therefore be a few bytes shorter than
// 512.
func cleanTitle(s string) string {
	const maxBytes = 512

	// strings.Fields already ignores leading/trailing whitespace, so no
	// separate trim is needed before collapsing.
	s = strings.Join(strings.Fields(s), " ")
	if len(s) <= maxBytes {
		return s
	}

	// Ranging over a string yields the byte offset at which each rune
	// starts; keep the last such offset that does not exceed the cap.
	cut := 0
	for i := range s {
		if i > maxBytes {
			break
		}
		cut = i
	}
	return s[:cut]
}
|