54 lines
1.5 KiB
Go
54 lines
1.5 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html/charset"
|
|
"golang.org/x/text/transform"
|
|
)
|
|
|
|
// processHost fetches and parses a single host's WARC record.
|
|
func processHost(host Host) ProcessResult {
|
|
warcResult, err := FetchAndParseWARC(host.WarcFilename, host.WarcRecordOffset, int64(host.WarcRecordLength))
|
|
if err != nil {
|
|
return ProcessResult{Err: err, FetchErr: true}
|
|
}
|
|
|
|
// Check iframe headers
|
|
iframeAllowed := CheckIframeAllowed(warcResult.HTTPHeaders)
|
|
|
|
// Convert body to UTF-8 based on Content-Type header and HTML meta
|
|
contentType := warcResult.HTTPHeaders.Get("Content-Type")
|
|
body := toUTF8(warcResult.Body, contentType)
|
|
|
|
// Parse HTML for title and icons
|
|
parsed := ParseHTML(body, host.Protocol, host.Hostname)
|
|
|
|
// Sanitize title — strip any remaining invalid UTF-8 bytes
|
|
// (handles pages that lie about encoding or have truncated sequences)
|
|
title := strings.ToValidUTF8(parsed.Title, "")
|
|
|
|
return ProcessResult{
|
|
Title: title,
|
|
IframeAllowed: iframeAllowed,
|
|
Icons: parsed.Icons,
|
|
}
|
|
}
|
|
|
|
// toUTF8 detects the encoding of the HTML body and converts to UTF-8.
|
|
func toUTF8(body []byte, contentType string) []byte {
|
|
// DetermineEncoding checks Content-Type header and <meta> tags
|
|
encoding, _, _ := charset.DetermineEncoding(body, contentType)
|
|
if encoding == nil {
|
|
return body
|
|
}
|
|
|
|
reader := transform.NewReader(bytes.NewReader(body), encoding.NewDecoder())
|
|
utf8Body, err := io.ReadAll(reader)
|
|
if err != nil {
|
|
return body
|
|
}
|
|
return utf8Body
|
|
}
|