added warc parser

This commit is contained in:
Joe Lothan 2026-05-17 20:25:59 -04:00
parent db81015e0b
commit f45e4a6034
8 changed files with 954 additions and 0 deletions

View file

@ -0,0 +1,175 @@
package main
import (
"net/url"
"strings"
"golang.org/x/net/html"
)
// Icon describes one favicon candidate discovered for a site.
//
// Icons produced by parseLinkTag carry an absolute URL (the <link> href after
// resolveURL) and lower-cased copies of the tag's type and sizes attributes.
type Icon struct {
URL string // location of the icon; parseLinkTag stores the resolved, absolute form
Source string // how the icon was found: "favicon_ico" or "link_rel" (parseLinkTag always sets "link_rel"; "favicon_ico" is presumably set by a caller outside this view)
RelType string // type attribute from <link>, lower-cased (e.g., "image/png"); empty if absent
RelSizes string // sizes attribute from <link>, lower-cased (e.g., "32x32"); empty if absent
}
// ParseResult holds the data ParseHTML extracts from a single HTML document.
type ParseResult struct {
// Title is the document title after cleanTitle has normalized it;
// empty when no title text was found.
Title string
// Icons lists the favicon <link> tags accepted by parseLinkTag, in the
// order they were encountered. Nil when none were found.
Icons []Icon
}
// ParseHTML extracts the page title and favicon <link> tags from HTML.
//
// body is the raw document. protocol (e.g. "https") and hostname
// (e.g. "example.com") are handed to parseLinkTag so that relative icon hrefs
// can be turned into absolute URLs.
//
// The document is tokenized rather than parsed into a tree, so malformed
// markup simply yields whatever was collected before the tokenizer stopped.
// Scanning ends at the first <body> start tag, since the title and icon links
// are expected before it.
//
// Only the first <title> element contributes to the result. <body> is an
// optional tag, so without this rule a page that omits it would have every
// later <title> (for example those inside inline SVG) appended to the page
// title.
func ParseHTML(body []byte, protocol, hostname string) ParseResult {
	var (
		result       ParseResult
		titleBuilder strings.Builder
		inTitle      bool // between the first <title> and its </title>
		titleSeen    bool // a <title> start tag has already been consumed
	)
	tokenizer := html.NewTokenizer(strings.NewReader(string(body)))

	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			// End of document or parse error — return what we have.
			result.Title = cleanTitle(titleBuilder.String())
			return result

		case html.StartTagToken, html.SelfClosingTagToken:
			tn, hasAttr := tokenizer.TagName()
			switch string(tn) {
			case "title":
				// A self-closing <title/> carries no text, so only a real
				// start tag opens the title; later titles are ignored.
				if tt == html.StartTagToken && !titleSeen {
					titleSeen = true
					inTitle = true
				}
			case "link":
				if hasAttr {
					if icon := parseLinkTag(tokenizer, protocol, hostname); icon != nil {
						result.Icons = append(result.Icons, *icon)
					}
				}
			case "body":
				// Stop at the start of <body> to save time — the title and
				// icon links live in <head>.
				result.Title = cleanTitle(titleBuilder.String())
				return result
			}

		case html.EndTagToken:
			if tn, _ := tokenizer.TagName(); string(tn) == "title" {
				inTitle = false
			}

		case html.TextToken:
			if inTitle {
				titleBuilder.Write(tokenizer.Text())
			}
		}
	}
}
// parseLinkTag inspects the attributes of the <link> tag the tokenizer has
// just produced and returns an Icon if the tag declares a favicon.
//
// A tag qualifies when its lower-cased rel value contains "icon" and its href
// is non-empty and resolvable by resolveURL (protocol and hostname are passed
// through for that purpose). Otherwise nil is returned. When an attribute is
// repeated, the last occurrence wins.
func parseLinkTag(tokenizer *html.Tokenizer, protocol, hostname string) *Icon {
	var relAttr, hrefAttr, typeAttr, sizesAttr string

	// TagAttr is called at least once and then for as long as it reports
	// that more attributes remain.
	for more := true; more; {
		var name, value []byte
		name, value, more = tokenizer.TagAttr()

		switch string(name) {
		case "rel":
			relAttr = strings.ToLower(string(value))
		case "href":
			hrefAttr = string(value)
		case "type":
			typeAttr = strings.ToLower(string(value))
		case "sizes":
			sizesAttr = strings.ToLower(string(value))
		}
	}

	// Not an icon link, or nothing to point at.
	if hrefAttr == "" || !strings.Contains(relAttr, "icon") {
		return nil
	}

	// Make the href absolute; an empty result means it is unusable.
	target := resolveURL(hrefAttr, protocol, hostname)
	if target == "" {
		return nil
	}

	found := Icon{
		Source:   "link_rel",
		URL:      target,
		RelType:  typeAttr,
		RelSizes: sizesAttr,
	}
	return &found
}
// resolveURL turns an icon href into an absolute http(s) URL for the given host.
//
// protocol is the scheme without "://" (e.g. "https") and hostname is the host
// the page belongs to. Handling, in order:
//   - "http://..." / "https://..." are returned unchanged;
//   - protocol-relative "//host/path" gets protocol prepended;
//   - root-relative "/path" is appended to protocol://hostname;
//   - anything else is resolved as a relative reference against the site
//     root, because the page's own path is not available here.
//
// It returns "" when href is blank, cannot be parsed, or uses a scheme other
// than http/https (data:, javascript:, mailto:, ...) in any letter case, since
// such values cannot be fetched as an icon.
func resolveURL(href, protocol, hostname string) string {
	href = strings.TrimSpace(href)
	if href == "" {
		return ""
	}

	switch {
	case len(href) >= 5 && strings.EqualFold(href[:5], "data:"):
		// Inline image: reject before parsing, the payload can be large.
		return ""
	case strings.HasPrefix(href, "http://"), strings.HasPrefix(href, "https://"):
		// Already absolute.
		return href
	case strings.HasPrefix(href, "//"):
		// Protocol-relative.
		return protocol + ":" + href
	}

	base := protocol + "://" + hostname
	if strings.HasPrefix(href, "/") {
		// Relative to root.
		return base + href
	}

	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	// url.Parse lower-cases the scheme, so a scheme seen here is either a
	// mixed-case http(s) URL (kept) or something that is not fetchable.
	if ref.Scheme != "" && ref.Scheme != "http" && ref.Scheme != "https" {
		return ""
	}

	root, err := url.Parse(base + "/")
	if err != nil {
		return ""
	}
	return root.ResolveReference(ref).String()
}
// cleanTitle normalizes a raw title string.
//
// Leading and trailing whitespace is removed and every internal run of
// whitespace is collapsed to a single space. The result is then limited to at
// most 512 bytes; the cut is made on a rune boundary so a multi-byte UTF-8
// character is never split (the result may therefore be slightly shorter than
// 512 bytes).
func cleanTitle(s string) string {
	const maxLen = 512

	// Fields already discards leading/trailing whitespace, so no separate
	// trim is needed.
	s = strings.Join(strings.Fields(s), " ")
	if len(s) <= maxLen {
		return s
	}

	// Ranging over a string yields the byte offset of each rune start; keep
	// the last one that does not exceed the limit.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	return s[:cut]
}