package main

import (
	"bytes"
	"io"
	"strings"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/transform"
)

// processHost loads the WARC record referenced by host and derives the
// per-host result from it: the page title, whether iframe embedding is
// allowed, and the icons found in the HTML.
//
// If FetchAndParseWARC fails, the returned ProcessResult carries only the
// error, with FetchErr set to true.
func processHost(host Host) ProcessResult {
	record, err := FetchAndParseWARC(
		host.WarcFilename,
		host.WarcRecordOffset,
		int64(host.WarcRecordLength),
	)
	if err != nil {
		return ProcessResult{Err: err, FetchErr: true}
	}

	var result ProcessResult

	// The iframe decision is made from the HTTP response headers alone.
	result.IframeAllowed = CheckIframeAllowed(record.HTTPHeaders)

	// Transcode the body to UTF-8 before parsing, using the Content-Type
	// header (and the HTML meta declaration) to pick the source encoding.
	utf8Body := toUTF8(record.Body, record.HTTPHeaders.Get("Content-Type"))

	// Pull the title and icons out of the decoded document.
	page := ParseHTML(utf8Body, host.Protocol, host.Hostname)

	// Drop any invalid UTF-8 bytes that are still present in the title,
	// e.g. from pages that declare the wrong encoding or contain truncated
	// multi-byte sequences.
	result.Title = strings.ToValidUTF8(page.Title, "")
	result.Icons = page.Icons

	return result
}

// toUTF8 returns body transcoded to UTF-8.
//
// The source encoding is chosen by charset.DetermineEncoding, which looks at
// the Content-Type header value and at the <meta> tags in the document. The
// original bytes are returned unchanged when no encoding is reported or when
// decoding fails.
func toUTF8(body []byte, contentType string) []byte {
	enc, _, _ := charset.DetermineEncoding(body, contentType)
	if enc == nil {
		return body
	}

	src := transform.NewReader(bytes.NewReader(body), enc.NewDecoder())
	if decoded, err := io.ReadAll(src); err == nil {
		return decoded
	}

	// Best effort: fall back to the raw bytes rather than failing the host.
	return body
}