Added WARC parser

This commit is contained in:
Joe Lothan 2026-05-17 20:25:59 -04:00
parent db81015e0b
commit f45e4a6034
8 changed files with 954 additions and 0 deletions

View file

@@ -0,0 +1,54 @@
package main
import (
"bytes"
"io"
"strings"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)
// processHost builds the ProcessResult for one host.
//
// It loads the host's WARC record through FetchAndParseWARC (using the
// filename, offset and length stored on host), evaluates the response headers
// with CheckIframeAllowed, transcodes the body to UTF-8 and hands it to
// ParseHTML to obtain the page title and icons.
//
// When the WARC record cannot be fetched or parsed, the returned result
// carries the error in Err with FetchErr set to true and no other fields.
func processHost(host Host) ProcessResult {
	record, err := FetchAndParseWARC(
		host.WarcFilename,
		host.WarcRecordOffset,
		int64(host.WarcRecordLength),
	)
	if err != nil {
		return ProcessResult{Err: err, FetchErr: true}
	}

	var result ProcessResult

	// The iframe decision is derived from the HTTP response headers alone.
	result.IframeAllowed = CheckIframeAllowed(record.HTTPHeaders)

	// Transcode the body to UTF-8 before parsing; the declared Content-Type is
	// passed along as a hint for charset detection.
	utf8Body := toUTF8(record.Body, record.HTTPHeaders.Get("Content-Type"))

	// Extract the title and icons from the HTML.
	page := ParseHTML(utf8Body, host.Protocol, host.Hostname)
	result.Icons = page.Icons

	// Drop any bytes in the title that are still not valid UTF-8, e.g. when
	// the page misreports its encoding or a multi-byte sequence is cut off.
	result.Title = strings.ToValidUTF8(page.Title, "")

	return result
}
// toUTF8 returns body converted to UTF-8.
//
// The source encoding is chosen by charset.DetermineEncoding from the body
// bytes and the supplied contentType value; the encoding name and certainty
// flag it also returns are ignored. If no encoding is returned, or decoding
// fails, the original body is returned unchanged.
func toUTF8(body []byte, contentType string) []byte {
	enc, _, _ := charset.DetermineEncoding(body, contentType)
	if enc == nil {
		return body
	}

	// Stream the raw bytes through the decoder and collect the UTF-8 output.
	decoded, err := io.ReadAll(transform.NewReader(bytes.NewReader(body), enc.NewDecoder()))
	if err == nil {
		return decoded
	}

	// Best effort: on a decode error fall back to the untouched input.
	return body
}