added warc parser
This commit is contained in:
parent
db81015e0b
commit
f45e4a6034
8 changed files with 954 additions and 0 deletions
175
pipeline/02_warc_parse/parser.go
Normal file
175
pipeline/02_warc_parse/parser.go
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// Icon describes a single favicon candidate found for a page.
type Icon struct {
	// URL is the icon's address. parseLinkTag stores the <link> href here
	// after passing it through resolveURL.
	URL string
	// Source records how the icon was discovered: "favicon_ico" or
	// "link_rel".
	Source string
	// RelType is the type attribute of the <link> tag (e.g. "image/png").
	RelType string
	// RelSizes is the sizes attribute of the <link> tag (e.g. "32x32").
	RelSizes string
}
|
||||
|
||||
// ParseResult holds extracted data from HTML parsing.
|
||||
type ParseResult struct {
|
||||
Title string
|
||||
Icons []Icon
|
||||
}
|
||||
|
||||
// ParseHTML extracts the title and link rel="icon" tags from HTML.
|
||||
// Uses a lenient tokenizer approach that handles malformed HTML.
|
||||
func ParseHTML(body []byte, protocol, hostname string) ParseResult {
|
||||
result := ParseResult{}
|
||||
tokenizer := html.NewTokenizer(strings.NewReader(string(body)))
|
||||
|
||||
inTitle := false
|
||||
var titleBuilder strings.Builder
|
||||
|
||||
for {
|
||||
tt := tokenizer.Next()
|
||||
switch tt {
|
||||
case html.ErrorToken:
|
||||
// End of document or parse error — return what we have
|
||||
result.Title = cleanTitle(titleBuilder.String())
|
||||
return result
|
||||
|
||||
case html.StartTagToken, html.SelfClosingTagToken:
|
||||
tn, hasAttr := tokenizer.TagName()
|
||||
tagName := string(tn)
|
||||
|
||||
if tagName == "title" && tt == html.StartTagToken {
|
||||
inTitle = true
|
||||
continue
|
||||
}
|
||||
|
||||
if tagName == "link" && hasAttr {
|
||||
icon := parseLinkTag(tokenizer, protocol, hostname)
|
||||
if icon != nil {
|
||||
result.Icons = append(result.Icons, *icon)
|
||||
}
|
||||
}
|
||||
|
||||
// Stop parsing after </head> to save time — icons and title are in <head>
|
||||
if tagName == "body" {
|
||||
result.Title = cleanTitle(titleBuilder.String())
|
||||
return result
|
||||
}
|
||||
|
||||
case html.EndTagToken:
|
||||
tn, _ := tokenizer.TagName()
|
||||
if string(tn) == "title" {
|
||||
inTitle = false
|
||||
}
|
||||
|
||||
case html.TextToken:
|
||||
if inTitle {
|
||||
titleBuilder.Write(tokenizer.Text())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseLinkTag extracts icon info from a <link> tag if it's a favicon.
|
||||
func parseLinkTag(tokenizer *html.Tokenizer, protocol, hostname string) *Icon {
|
||||
var rel, href, typ, sizes string
|
||||
|
||||
for {
|
||||
key, val, more := tokenizer.TagAttr()
|
||||
k := string(key)
|
||||
v := string(val)
|
||||
|
||||
switch k {
|
||||
case "rel":
|
||||
rel = strings.ToLower(v)
|
||||
case "href":
|
||||
href = v
|
||||
case "type":
|
||||
typ = strings.ToLower(v)
|
||||
case "sizes":
|
||||
sizes = strings.ToLower(v)
|
||||
}
|
||||
|
||||
if !more {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Only interested in icon links
|
||||
if !strings.Contains(rel, "icon") {
|
||||
return nil
|
||||
}
|
||||
|
||||
if href == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Resolve relative URLs
|
||||
resolvedURL := resolveURL(href, protocol, hostname)
|
||||
if resolvedURL == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &Icon{
|
||||
URL: resolvedURL,
|
||||
Source: "link_rel",
|
||||
RelType: typ,
|
||||
RelSizes: sizes,
|
||||
}
|
||||
}
|
||||
|
||||
// resolveURL turns the href of an icon <link> into an absolute http(s) URL.
//
// protocol is the page scheme without "://" (e.g. "https") and hostname is
// the page host; together they form the base that relative hrefs are resolved
// against. Only the host root is known here, so a path-relative href such as
// "img/a.png" is resolved against "/" rather than the page's own directory.
//
// It returns "" when href is empty, cannot be parsed, or uses a scheme other
// than http/https (data:, javascript:, mailto:, ...), because such values are
// not fetchable icon locations.
func resolveURL(href, protocol, hostname string) string {
	href = strings.TrimSpace(href)
	if href == "" {
		return ""
	}

	switch {
	case len(href) >= 5 && strings.EqualFold(href[:5], "data:"):
		// Inline data URIs can be large; reject them (in any letter case)
		// before handing them to the URL parser.
		return ""
	case strings.HasPrefix(href, "http://"), strings.HasPrefix(href, "https://"):
		// Already absolute.
		return href
	case strings.HasPrefix(href, "//"):
		// Protocol-relative: borrow the page's scheme. Must be tested before
		// the single-slash case below.
		return protocol + ":" + href
	case strings.HasPrefix(href, "/"):
		// Relative to the host root.
		return protocol + "://" + hostname + href
	}

	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	// url.Parse lowercases the scheme, so this rejects "javascript:",
	// "MAILTO:", "blob:" and friends while still letting "HTTP://..." through
	// to be normalized by ResolveReference below.
	if ref.Scheme != "" && ref.Scheme != "http" && ref.Scheme != "https" {
		return ""
	}

	base, err := url.Parse(protocol + "://" + hostname + "/")
	if err != nil {
		return ""
	}
	return base.ResolveReference(ref).String()
}
|
||||
|
||||
// cleanTitle normalizes a raw <title> string: leading and trailing whitespace
// is removed, every internal run of whitespace becomes a single space, and
// the result is capped at 512 bytes.
//
// The cap is applied on a rune boundary, so a multi-byte UTF-8 character that
// straddles the limit is dropped whole rather than cut in half (which would
// leave invalid UTF-8 at the end of the title).
func cleanTitle(s string) string {
	const maxTitleBytes = 512

	// strings.Fields discards leading and trailing whitespace as well as
	// splitting on internal runs, so joining its output trims and collapses
	// in one step.
	s = strings.Join(strings.Fields(s), " ")
	if len(s) <= maxTitleBytes {
		return s
	}

	// Ranging over a string yields the byte offset at which each rune starts;
	// keep the largest such offset that does not exceed the limit.
	cut := 0
	for i := range s {
		if i > maxTitleBytes {
			break
		}
		cut = i
	}
	return s[:cut]
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue