added warc parser
This commit is contained in:
parent
db81015e0b
commit
f45e4a6034
8 changed files with 954 additions and 0 deletions
54
pipeline/02_warc_parse/process.go
Normal file
54
pipeline/02_warc_parse/process.go
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html/charset"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// processHost fetches and parses a single host's WARC record.
|
||||
func processHost(host Host) ProcessResult {
|
||||
warcResult, err := FetchAndParseWARC(host.WarcFilename, host.WarcRecordOffset, int64(host.WarcRecordLength))
|
||||
if err != nil {
|
||||
return ProcessResult{Err: err, FetchErr: true}
|
||||
}
|
||||
|
||||
// Check iframe headers
|
||||
iframeAllowed := CheckIframeAllowed(warcResult.HTTPHeaders)
|
||||
|
||||
// Convert body to UTF-8 based on Content-Type header and HTML meta
|
||||
contentType := warcResult.HTTPHeaders.Get("Content-Type")
|
||||
body := toUTF8(warcResult.Body, contentType)
|
||||
|
||||
// Parse HTML for title and icons
|
||||
parsed := ParseHTML(body, host.Protocol, host.Hostname)
|
||||
|
||||
// Sanitize title — strip any remaining invalid UTF-8 bytes
|
||||
// (handles pages that lie about encoding or have truncated sequences)
|
||||
title := strings.ToValidUTF8(parsed.Title, "")
|
||||
|
||||
return ProcessResult{
|
||||
Title: title,
|
||||
IframeAllowed: iframeAllowed,
|
||||
Icons: parsed.Icons,
|
||||
}
|
||||
}
|
||||
|
||||
// toUTF8 detects the encoding of the HTML body and converts to UTF-8.
|
||||
func toUTF8(body []byte, contentType string) []byte {
|
||||
// DetermineEncoding checks Content-Type header and <meta> tags
|
||||
encoding, _, _ := charset.DetermineEncoding(body, contentType)
|
||||
if encoding == nil {
|
||||
return body
|
||||
}
|
||||
|
||||
reader := transform.NewReader(bytes.NewReader(body), encoding.NewDecoder())
|
||||
utf8Body, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
return body
|
||||
}
|
||||
return utf8Body
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue