added warc parser
This commit is contained in:
parent
db81015e0b
commit
f45e4a6034
8 changed files with 954 additions and 0 deletions
117
pipeline/02_warc_parse/db.go
Normal file
117
pipeline/02_warc_parse/db.go
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
)
|
||||
|
||||
// Host is one row of the hosts table, restricted to the columns that
// fetchBatch selects.
type Host struct {
	// ID is the hosts.id value; fetchBatch pages on it.
	ID int64
	// Hostname and Protocol are combined as "<protocol>://<hostname>" when
	// icon URLs are built.
	Hostname string
	Protocol string
	// WarcFilename is the object key of the WARC file holding this host's record.
	WarcFilename string
	// WarcRecordOffset and WarcRecordLength describe the byte range of the
	// record inside that WARC file.
	WarcRecordOffset int64
	WarcRecordLength int
}
|
||||
|
||||
// ProcessResult holds everything extracted from one host's WARC record.
|
||||
type ProcessResult struct {
|
||||
Title string
|
||||
IframeAllowed bool
|
||||
Icons []Icon
|
||||
Err error
|
||||
FetchErr bool // true if error was during fetch (vs parse)
|
||||
}
|
||||
|
||||
// WriteErrors counts the database write failures seen by one writeResult call.
type WriteErrors struct {
	// HostUpdate counts failed UPDATEs of the hosts row.
	HostUpdate int
	// IconInsert counts failed INSERTs into the icons table.
	IconInsert int
}
|
||||
|
||||
// fetchBatch gets the next batch of unparsed hosts after lastID.
|
||||
func fetchBatch(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit int) ([]Host, error) {
|
||||
rows, err := pool.Query(ctx,
|
||||
`SELECT id, hostname, protocol, warc_filename, warc_record_offset, warc_record_length
|
||||
FROM hosts
|
||||
WHERE parsed = FALSE AND id > $1
|
||||
ORDER BY id
|
||||
LIMIT $2`,
|
||||
lastID, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var hosts []Host
|
||||
for rows.Next() {
|
||||
var h Host
|
||||
err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.WarcFilename, &h.WarcRecordOffset, &h.WarcRecordLength)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
hosts = append(hosts, h)
|
||||
}
|
||||
return hosts, rows.Err()
|
||||
}
|
||||
|
||||
// writeResult writes parsed results back to the database.
|
||||
// Returns counts of DB write errors encountered.
|
||||
func writeResult(ctx context.Context, pool *pgxpool.Pool, host Host, result ProcessResult, logWriter *LogWriter) WriteErrors {
|
||||
var errs WriteErrors
|
||||
|
||||
// Update hosts table
|
||||
_, err := pool.Exec(ctx,
|
||||
`UPDATE hosts SET html_title = $1, iframe_allowed = $2, parsed = TRUE WHERE id = $3`,
|
||||
nilIfEmpty(result.Title), result.IframeAllowed, host.ID)
|
||||
if err != nil {
|
||||
errs.HostUpdate++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s hosts_update: %v", host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
return errs
|
||||
}
|
||||
|
||||
// Insert /favicon.ico entry
|
||||
faviconURL := fmt.Sprintf("%s://%s/favicon.ico", host.Protocol, host.Hostname)
|
||||
_, err = pool.Exec(ctx,
|
||||
`INSERT INTO icons (host_id, url, source) VALUES ($1, $2, 'favicon_ico')`,
|
||||
host.ID, faviconURL)
|
||||
if err != nil {
|
||||
errs.IconInsert++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
|
||||
// Insert link rel="icon" entries
|
||||
for _, icon := range result.Icons {
|
||||
_, err = pool.Exec(ctx,
|
||||
`INSERT INTO icons (host_id, url, source, rel_type, rel_sizes) VALUES ($1, $2, $3, $4, $5)`,
|
||||
host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes))
|
||||
if err != nil {
|
||||
errs.IconInsert++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
// nilIfEmpty maps the empty string to a nil pointer so it is passed to the
// database as a nil argument; any other string is returned by pointer.
func nilIfEmpty(s string) *string {
	if len(s) > 0 {
		return &s
	}
	return nil
}
|
||||
94
pipeline/02_warc_parse/log.go
Normal file
94
pipeline/02_warc_parse/log.go
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// LogWriter appends log lines to a file and may be shared between goroutines.
type LogWriter struct {
	file *os.File
	// mu serialises writes so that lines from different goroutines do not interleave.
	mu sync.Mutex
	// errorsOnly makes Write drop every line that is not flagged as an error.
	errorsOnly bool
}

// NewLogWriter opens path for appending, creating it with mode 0644 if it
// does not exist, and returns a LogWriter on top of it.
func NewLogWriter(path string, errorsOnly bool) (*LogWriter, error) {
	const openFlags = os.O_CREATE | os.O_WRONLY | os.O_APPEND
	f, err := os.OpenFile(path, openFlags, 0644)
	if err != nil {
		return nil, err
	}
	lw := &LogWriter{file: f, errorsOnly: errorsOnly}
	return lw, nil
}

// Write appends line plus a trailing newline to the log file. When the
// writer was created with errorsOnly, lines with isError == false are
// discarded. Write errors are not reported.
func (lw *LogWriter) Write(line string, isError bool) {
	if !isError && lw.errorsOnly {
		return
	}
	lw.mu.Lock()
	defer lw.mu.Unlock()
	lw.file.WriteString(line + "\n")
}

// Close closes the underlying file.
func (lw *LogWriter) Close() error {
	return lw.file.Close()
}
|
||||
|
||||
// formatLogLine creates a concise one-line log for a processed host.
|
||||
func formatLogLine(host Host, result ProcessResult) string {
|
||||
title := result.Title
|
||||
if len(title) > 20 {
|
||||
title = title[:20] + "..."
|
||||
}
|
||||
|
||||
if result.Err != nil {
|
||||
errType := "parse"
|
||||
if result.FetchErr {
|
||||
errType = "fetch"
|
||||
}
|
||||
return fmt.Sprintf("parsed: %s err:%s %v", host.Hostname, errType, result.Err)
|
||||
}
|
||||
|
||||
iconCount := len(result.Icons) + 1 // +1 for /favicon.ico
|
||||
iframe := "iframe:ok"
|
||||
if !result.IframeAllowed {
|
||||
iframe = "iframe:no"
|
||||
}
|
||||
|
||||
return fmt.Sprintf("parsed: %s \"%s\" icons:%d %s", host.Hostname, title, iconCount, iframe)
|
||||
}
|
||||
|
||||
// writeStats writes the stage stats to a JSON file.
|
||||
func writeStats(stats *Stats, cfg Config) {
|
||||
finishedAt := time.Now()
|
||||
duration := finishedAt.Sub(stats.StartedAt)
|
||||
|
||||
data := map[string]interface{}{
|
||||
"started_at": stats.StartedAt.Format(time.RFC3339),
|
||||
"finished_at": finishedAt.Format(time.RFC3339),
|
||||
"duration_seconds": int(duration.Seconds()),
|
||||
"processed": stats.Processed.Load(),
|
||||
"titles_found": stats.TitlesFound.Load(),
|
||||
"icons_found": stats.IconsFound.Load(),
|
||||
"iframe_blocked": stats.IframeBlocked.Load(),
|
||||
"fetch_errors": stats.FetchErrors.Load(),
|
||||
"parse_errors": stats.ParseErrors.Load(),
|
||||
"db_errors": stats.DBErrors.Load(),
|
||||
"panics": stats.Panics.Load(),
|
||||
}
|
||||
|
||||
os.MkdirAll("stats", 0755)
|
||||
f, err := os.Create("stats/02_warc_parse.json")
|
||||
if err != nil {
|
||||
fmt.Printf("Failed to write stats: %v\n", err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
enc := json.NewEncoder(f)
|
||||
enc.SetIndent("", " ")
|
||||
enc.Encode(data)
|
||||
fmt.Println("Stats written to stats/02_warc_parse.json")
|
||||
}
|
||||
207
pipeline/02_warc_parse/main.go
Normal file
207
pipeline/02_warc_parse/main.go
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
)
|
||||
|
||||
// Config holds the command-line options of the WARC parse stage.
type Config struct {
	// DBUrl is the Postgres connection string (--db, required).
	DBUrl string
	// BatchSize is the number of rows fetched per batch (--batch-size).
	BatchSize int
	// Concurrency is the number of concurrent goroutines (--concurrency).
	Concurrency int
	// Limit caps the number of rows processed; 0 means all (--limit).
	Limit int
	// DryRun prints results without writing to the database (--dry-run).
	DryRun bool
	// LogFile, when set, mirrors log lines to this file (--log-file).
	LogFile string
	// LogErrors restricts the log file to error lines (--log-errors-only).
	LogErrors bool
}
|
||||
|
||||
// Stats accumulates run-wide counters. The counters are atomics because
// worker goroutines update them concurrently.
type Stats struct {
	// Processed counts hosts that finished processing, including failures and panics.
	Processed atomic.Int64
	// TitlesFound counts hosts with a non-empty title.
	TitlesFound atomic.Int64
	// IconsFound sums the <link> icons found across hosts.
	IconsFound atomic.Int64
	// IframeBlocked counts hosts whose result disallows framing.
	IframeBlocked atomic.Int64
	// ParseErrors and FetchErrors split failed hosts by where the error occurred.
	ParseErrors atomic.Int64
	FetchErrors atomic.Int64
	// DBErrors sums the write errors reported by writeResult.
	DBErrors atomic.Int64
	// Panics counts worker goroutines that recovered from a panic.
	Panics atomic.Int64
	// StartedAt is when the run began; durations are measured from it.
	StartedAt time.Time
}
|
||||
|
||||
func main() {
|
||||
cfg := Config{}
|
||||
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
|
||||
flag.IntVar(&cfg.BatchSize, "batch-size", 500, "Rows to fetch per batch")
|
||||
flag.IntVar(&cfg.Concurrency, "concurrency", 100, "Number of concurrent goroutines")
|
||||
flag.IntVar(&cfg.Limit, "limit", 0, "Max rows to process (0 = all)")
|
||||
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Print results without writing to DB")
|
||||
flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file")
|
||||
flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file")
|
||||
flag.Parse()
|
||||
|
||||
if cfg.DBUrl == "" {
|
||||
fmt.Println("Usage: warc_parse --db DATABASE_URL [OPTIONS]")
|
||||
flag.PrintDefaults()
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Init S3 client
|
||||
if err := initS3(); err != nil {
|
||||
log.Fatalf("Failed to init S3: %v", err)
|
||||
}
|
||||
|
||||
pool, err := pgxpool.New(ctx, cfg.DBUrl)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to connect to database: %v", err)
|
||||
}
|
||||
defer pool.Close()
|
||||
|
||||
// Get total count
|
||||
var total int64
|
||||
if cfg.Limit > 0 {
|
||||
total = int64(cfg.Limit)
|
||||
} else {
|
||||
err = pool.QueryRow(ctx, "SELECT COUNT(*) FROM hosts WHERE parsed = FALSE").Scan(&total)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to count unparsed hosts: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if total == 0 {
|
||||
fmt.Println("No unparsed hosts found.")
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Printf("=== WARC Parser ===\n")
|
||||
fmt.Printf("Unparsed hosts: %d\n", total)
|
||||
fmt.Printf("Concurrency: %d\n", cfg.Concurrency)
|
||||
fmt.Printf("Batch size: %d\n", cfg.BatchSize)
|
||||
fmt.Printf("Dry run: %v\n\n", cfg.DryRun)
|
||||
|
||||
// Setup log file
|
||||
var logWriter *LogWriter
|
||||
if cfg.LogFile != "" {
|
||||
logWriter, err = NewLogWriter(cfg.LogFile, cfg.LogErrors)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to open log file: %v", err)
|
||||
}
|
||||
defer logWriter.Close()
|
||||
}
|
||||
|
||||
stats := &Stats{StartedAt: time.Now()}
|
||||
|
||||
|
||||
// Worker pool
|
||||
sem := make(chan struct{}, cfg.Concurrency)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// Process in batches
|
||||
var lastID int64
|
||||
processed := 0
|
||||
|
||||
for {
|
||||
if cfg.Limit > 0 && processed >= cfg.Limit {
|
||||
break
|
||||
}
|
||||
|
||||
batchLimit := cfg.BatchSize
|
||||
if cfg.Limit > 0 && processed+batchLimit > cfg.Limit {
|
||||
batchLimit = cfg.Limit - processed
|
||||
}
|
||||
|
||||
hosts, err := fetchBatch(ctx, pool, lastID, batchLimit)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to fetch batch: %v", err)
|
||||
}
|
||||
if len(hosts) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
lastID = hosts[len(hosts)-1].ID
|
||||
|
||||
for i := range hosts {
|
||||
host := hosts[i]
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer func() { <-sem }()
|
||||
|
||||
// Recover from panics — log them, don't mark row as parsed
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
stats.Panics.Add(1)
|
||||
stats.Processed.Add(1)
|
||||
logLine := fmt.Sprintf("PANIC: %s %v", host.Hostname, r)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
result := processHost(host)
|
||||
|
||||
// Log line
|
||||
logLine := formatLogLine(host, result)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, result.Err != nil)
|
||||
}
|
||||
|
||||
// Write to DB
|
||||
if !cfg.DryRun && result.Err == nil {
|
||||
errs := writeResult(ctx, pool, host, result, logWriter)
|
||||
stats.DBErrors.Add(int64(errs.HostUpdate + errs.IconInsert))
|
||||
}
|
||||
|
||||
// Update stats
|
||||
stats.Processed.Add(1)
|
||||
if result.Title != "" {
|
||||
stats.TitlesFound.Add(1)
|
||||
}
|
||||
stats.IconsFound.Add(int64(len(result.Icons)))
|
||||
if !result.IframeAllowed {
|
||||
stats.IframeBlocked.Add(1)
|
||||
}
|
||||
if result.Err != nil {
|
||||
if result.FetchErr {
|
||||
stats.FetchErrors.Add(1)
|
||||
} else {
|
||||
stats.ParseErrors.Add(1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
processed += len(hosts)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
// Print summary
|
||||
duration := time.Since(stats.StartedAt)
|
||||
fmt.Printf("\n=== Summary ===\n")
|
||||
fmt.Printf("Duration: %s\n", duration.Round(time.Second))
|
||||
fmt.Printf("Processed: %d\n", stats.Processed.Load())
|
||||
fmt.Printf("Titles found: %d\n", stats.TitlesFound.Load())
|
||||
fmt.Printf("Icons found: %d\n", stats.IconsFound.Load())
|
||||
fmt.Printf("Iframe blocked: %d\n", stats.IframeBlocked.Load())
|
||||
fmt.Printf("Fetch errors: %d\n", stats.FetchErrors.Load())
|
||||
fmt.Printf("Parse errors: %d\n", stats.ParseErrors.Load())
|
||||
fmt.Printf("DB errors: %d\n", stats.DBErrors.Load())
|
||||
fmt.Printf("Panics: %d\n", stats.Panics.Load())
|
||||
|
||||
// Write stats JSON
|
||||
writeStats(stats, cfg)
|
||||
}
|
||||
175
pipeline/02_warc_parse/parser.go
Normal file
175
pipeline/02_warc_parse/parser.go
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// Icon describes one favicon candidate for a host.
type Icon struct {
	// URL is the absolute location of the icon.
	URL string
	// Source records where the icon came from: "favicon_ico" or "link_rel".
	Source string
	// RelType is the type attribute of the <link> tag (e.g., "image/png").
	RelType string
	// RelSizes is the sizes attribute of the <link> tag (e.g., "32x32").
	RelSizes string
}
|
||||
|
||||
// ParseResult holds extracted data from HTML parsing.
|
||||
type ParseResult struct {
|
||||
Title string
|
||||
Icons []Icon
|
||||
}
|
||||
|
||||
// ParseHTML extracts the title and link rel="icon" tags from HTML.
|
||||
// Uses a lenient tokenizer approach that handles malformed HTML.
|
||||
func ParseHTML(body []byte, protocol, hostname string) ParseResult {
|
||||
result := ParseResult{}
|
||||
tokenizer := html.NewTokenizer(strings.NewReader(string(body)))
|
||||
|
||||
inTitle := false
|
||||
var titleBuilder strings.Builder
|
||||
|
||||
for {
|
||||
tt := tokenizer.Next()
|
||||
switch tt {
|
||||
case html.ErrorToken:
|
||||
// End of document or parse error — return what we have
|
||||
result.Title = cleanTitle(titleBuilder.String())
|
||||
return result
|
||||
|
||||
case html.StartTagToken, html.SelfClosingTagToken:
|
||||
tn, hasAttr := tokenizer.TagName()
|
||||
tagName := string(tn)
|
||||
|
||||
if tagName == "title" && tt == html.StartTagToken {
|
||||
inTitle = true
|
||||
continue
|
||||
}
|
||||
|
||||
if tagName == "link" && hasAttr {
|
||||
icon := parseLinkTag(tokenizer, protocol, hostname)
|
||||
if icon != nil {
|
||||
result.Icons = append(result.Icons, *icon)
|
||||
}
|
||||
}
|
||||
|
||||
// Stop parsing after </head> to save time — icons and title are in <head>
|
||||
if tagName == "body" {
|
||||
result.Title = cleanTitle(titleBuilder.String())
|
||||
return result
|
||||
}
|
||||
|
||||
case html.EndTagToken:
|
||||
tn, _ := tokenizer.TagName()
|
||||
if string(tn) == "title" {
|
||||
inTitle = false
|
||||
}
|
||||
|
||||
case html.TextToken:
|
||||
if inTitle {
|
||||
titleBuilder.Write(tokenizer.Text())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseLinkTag extracts icon info from a <link> tag if it's a favicon.
|
||||
func parseLinkTag(tokenizer *html.Tokenizer, protocol, hostname string) *Icon {
|
||||
var rel, href, typ, sizes string
|
||||
|
||||
for {
|
||||
key, val, more := tokenizer.TagAttr()
|
||||
k := string(key)
|
||||
v := string(val)
|
||||
|
||||
switch k {
|
||||
case "rel":
|
||||
rel = strings.ToLower(v)
|
||||
case "href":
|
||||
href = v
|
||||
case "type":
|
||||
typ = strings.ToLower(v)
|
||||
case "sizes":
|
||||
sizes = strings.ToLower(v)
|
||||
}
|
||||
|
||||
if !more {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Only interested in icon links
|
||||
if !strings.Contains(rel, "icon") {
|
||||
return nil
|
||||
}
|
||||
|
||||
if href == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Resolve relative URLs
|
||||
resolvedURL := resolveURL(href, protocol, hostname)
|
||||
if resolvedURL == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &Icon{
|
||||
URL: resolvedURL,
|
||||
Source: "link_rel",
|
||||
RelType: typ,
|
||||
RelSizes: sizes,
|
||||
}
|
||||
}
|
||||
|
||||
// resolveURL resolves a potentially relative icon URL against the host's base URL.
//
// href is trimmed first. The result is "" for an empty href, for data: URIs
// and for hrefs that fail to parse. Hrefs starting with http:// or https://
// are returned unchanged, protocol-relative hrefs ("//host/path") get
// protocol prepended, and everything else is resolved against
// "<protocol>://<hostname>/".
func resolveURL(href, protocol, hostname string) string {
	href = strings.TrimSpace(href)
	if href == "" {
		return ""
	}

	// Skip data: URIs. URL schemes are case-insensitive, so "DATA:" must be
	// rejected as well; otherwise the inline payload is returned as an icon URL.
	const dataScheme = "data:"
	if len(href) >= len(dataScheme) && strings.EqualFold(href[:len(dataScheme)], dataScheme) {
		return ""
	}

	// Already absolute
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	// Protocol-relative
	if strings.HasPrefix(href, "//") {
		return protocol + ":" + href
	}

	// Relative to root
	base := protocol + "://" + hostname
	if strings.HasPrefix(href, "/") {
		return base + href
	}

	// Relative path — resolve against root
	parsed, err := url.Parse(href)
	if err != nil {
		return ""
	}
	baseParsed, err := url.Parse(base + "/")
	if err != nil {
		return ""
	}
	return baseParsed.ResolveReference(parsed).String()
}
|
||||
|
||||
// cleanTitle trims surrounding whitespace, collapses every internal run of
// whitespace to a single space, and limits the result to 512 bytes.
//
// The cut never lands inside a multi-byte UTF-8 character: when byte 512
// falls mid-character, that character is dropped, so the result may be up to
// three bytes shorter than the limit.
func cleanTitle(s string) string {
	const maxTitleBytes = 512

	// Fields already discards leading and trailing whitespace.
	s = strings.Join(strings.Fields(s), " ")
	if len(s) <= maxTitleBytes {
		return s
	}

	// Back off past UTF-8 continuation bytes (10xxxxxx) so that the cut sits
	// on a character boundary. A character has at most three continuation
	// bytes, which also bounds the loop on malformed input.
	cut := maxTitleBytes
	for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return s[:cut]
}
|
||||
54
pipeline/02_warc_parse/process.go
Normal file
54
pipeline/02_warc_parse/process.go
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html/charset"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// processHost fetches and parses a single host's WARC record.
|
||||
func processHost(host Host) ProcessResult {
|
||||
warcResult, err := FetchAndParseWARC(host.WarcFilename, host.WarcRecordOffset, int64(host.WarcRecordLength))
|
||||
if err != nil {
|
||||
return ProcessResult{Err: err, FetchErr: true}
|
||||
}
|
||||
|
||||
// Check iframe headers
|
||||
iframeAllowed := CheckIframeAllowed(warcResult.HTTPHeaders)
|
||||
|
||||
// Convert body to UTF-8 based on Content-Type header and HTML meta
|
||||
contentType := warcResult.HTTPHeaders.Get("Content-Type")
|
||||
body := toUTF8(warcResult.Body, contentType)
|
||||
|
||||
// Parse HTML for title and icons
|
||||
parsed := ParseHTML(body, host.Protocol, host.Hostname)
|
||||
|
||||
// Sanitize title — strip any remaining invalid UTF-8 bytes
|
||||
// (handles pages that lie about encoding or have truncated sequences)
|
||||
title := strings.ToValidUTF8(parsed.Title, "")
|
||||
|
||||
return ProcessResult{
|
||||
Title: title,
|
||||
IframeAllowed: iframeAllowed,
|
||||
Icons: parsed.Icons,
|
||||
}
|
||||
}
|
||||
|
||||
// toUTF8 detects the encoding of the HTML body and converts to UTF-8.
|
||||
func toUTF8(body []byte, contentType string) []byte {
|
||||
// DetermineEncoding checks Content-Type header and <meta> tags
|
||||
encoding, _, _ := charset.DetermineEncoding(body, contentType)
|
||||
if encoding == nil {
|
||||
return body
|
||||
}
|
||||
|
||||
reader := transform.NewReader(bytes.NewReader(body), encoding.NewDecoder())
|
||||
utf8Body, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
return body
|
||||
}
|
||||
return utf8Body
|
||||
}
|
||||
126
pipeline/02_warc_parse/warc.go
Normal file
126
pipeline/02_warc_parse/warc.go
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/aws/aws-sdk-go-v2/aws"
|
||||
"github.com/aws/aws-sdk-go-v2/config"
|
||||
"github.com/aws/aws-sdk-go-v2/service/s3"
|
||||
"github.com/nlnwa/gowarc/v3"
|
||||
)
|
||||
|
||||
const ccBucket = "commoncrawl"
|
||||
|
||||
var s3Client *s3.Client
|
||||
|
||||
func initS3() error {
|
||||
cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1"))
|
||||
if err != nil {
|
||||
return fmt.Errorf("load AWS config: %w", err)
|
||||
}
|
||||
s3Client = s3.NewFromConfig(cfg)
|
||||
return nil
|
||||
}
|
||||
|
||||
// WARCResult is the HTTP response recovered from one WARC response record.
type WARCResult struct {
	// HTTPHeaders are the headers of the archived HTTP response.
	HTTPHeaders http.Header
	// Body is the HTTP payload (the HTML).
	Body []byte
}
|
||||
|
||||
// FetchAndParseWARC fetches a WARC record via S3 byte-range request and parses it.
|
||||
func FetchAndParseWARC(warcFilename string, offset, length int64) (*WARCResult, error) {
|
||||
rangeHeader := fmt.Sprintf("bytes=%d-%d", offset, offset+length-1)
|
||||
|
||||
resp, err := s3Client.GetObject(context.Background(), &s3.GetObjectInput{
|
||||
Bucket: aws.String(ccBucket),
|
||||
Key: aws.String(warcFilename),
|
||||
Range: aws.String(rangeHeader),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("s3 get: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Each WARC record is individually gzipped
|
||||
gzReader, err := gzip.NewReader(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gzip: %w", err)
|
||||
}
|
||||
defer gzReader.Close()
|
||||
|
||||
// Read all decompressed data into memory for gowarc
|
||||
decompressed, err := io.ReadAll(gzReader)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("decompress: %w", err)
|
||||
}
|
||||
|
||||
// Parse WARC record using gowarc
|
||||
warcReader, err := gowarc.NewWarcFileReaderFromStream(bytes.NewReader(decompressed), 0)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("warc reader: %w", err)
|
||||
}
|
||||
defer warcReader.Close()
|
||||
|
||||
rec, err := warcReader.Next()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read record: %w", err)
|
||||
}
|
||||
defer rec.Close()
|
||||
|
||||
if rec.WarcRecord.Type() != gowarc.Response {
|
||||
return nil, fmt.Errorf("unexpected record type: %s", rec.WarcRecord.Type())
|
||||
}
|
||||
|
||||
block := rec.WarcRecord.Block()
|
||||
httpBlock, ok := block.(gowarc.HttpResponseBlock)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("block is not HTTP response")
|
||||
}
|
||||
|
||||
// Get HTTP response headers
|
||||
var httpHeaders http.Header
|
||||
headers := httpBlock.HttpHeader()
|
||||
if headers != nil {
|
||||
httpHeaders = *headers
|
||||
} else {
|
||||
httpHeaders = make(http.Header)
|
||||
}
|
||||
|
||||
// Get HTTP body (the HTML)
|
||||
bodyReader, err := httpBlock.PayloadBytes()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("payload: %w", err)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(bodyReader)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read body: %w", err)
|
||||
}
|
||||
|
||||
return &WARCResult{
|
||||
HTTPHeaders: httpHeaders,
|
||||
Body: body,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// CheckIframeAllowed checks HTTP response headers for X-Frame-Options and CSP frame-ancestors.
//
// It returns false when X-Frame-Options is "deny" or "sameorigin"
// (case-insensitive), or when any Content-Security-Policy policy carries a
// frame-ancestors directive whose source list does not contain the bare
// wildcard "*". Every Content-Security-Policy header value is inspected, and
// comma-separated policies within one value are checked individually.
func CheckIframeAllowed(headers http.Header) bool {
	switch strings.ToLower(strings.TrimSpace(headers.Get("X-Frame-Options"))) {
	case "deny", "sameorigin":
		return false
	}

	for _, value := range headers.Values("Content-Security-Policy") {
		for _, policy := range strings.Split(value, ",") {
			if !cspPolicyAllowsFraming(policy) {
				return false
			}
		}
	}

	return true
}

// cspPolicyAllowsFraming reports whether a single CSP policy lets arbitrary
// sites frame the page: either it has no frame-ancestors directive, or that
// directive lists "*" as one of its sources.
func cspPolicyAllowsFraming(policy string) bool {
	for _, directive := range strings.Split(policy, ";") {
		fields := strings.Fields(strings.ToLower(directive))
		if len(fields) == 0 || fields[0] != "frame-ancestors" {
			continue
		}
		// Match whole tokens: "*.example.com" is a host restriction, not the
		// wildcard, so a substring test on "frame-ancestors *" is wrong.
		for _, source := range fields[1:] {
			if source == "*" {
				return true
			}
		}
		// Only the first frame-ancestors directive of a policy is considered.
		return false
	}
	return true
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue