added warc parser

This commit is contained in:
Joe Lothan 2026-05-17 20:25:59 -04:00
parent db81015e0b
commit f45e4a6034
8 changed files with 954 additions and 0 deletions

View file

@ -0,0 +1,117 @@
package main
import (
"context"
"fmt"
"github.com/jackc/pgx/v5/pgxpool"
)
// Host is one row of the hosts table, limited to the columns that
// fetchBatch selects.
type Host struct {
	// ID is the hosts.id value; fetchBatch pages on it.
	ID int64

	// Hostname is the hosts.hostname value.
	Hostname string

	// Protocol is the hosts.protocol value. writeResult uses it as the URL
	// scheme when building the /favicon.ico address.
	Protocol string

	// WarcFilename is the hosts.warc_filename value.
	WarcFilename string

	// WarcRecordOffset is the hosts.warc_record_offset value.
	// NOTE(review): presumably a byte offset into WarcFilename — confirm.
	WarcRecordOffset int64

	// WarcRecordLength is the hosts.warc_record_length value.
	// NOTE(review): presumably a byte count — confirm.
	WarcRecordLength int
}
// ProcessResult bundles what was extracted from a single host's WARC
// record, together with any error hit along the way.
type ProcessResult struct {
	// Title is stored in hosts.html_title by writeResult; an empty title is
	// written as NULL.
	Title string

	// IframeAllowed is stored in hosts.iframe_allowed by writeResult.
	IframeAllowed bool

	// Icons are the icon entries writeResult inserts into the icons table
	// in addition to the default /favicon.ico row.
	Icons []Icon

	// Err is the error recorded for this host, if any.
	Err error

	// FetchErr is true if the error was during fetch (vs parse).
	FetchErr bool
}
// WriteErrors counts the database write failures seen while persisting one
// host's results.
type WriteErrors struct {
	// HostUpdate counts failed UPDATE statements against the hosts table.
	HostUpdate int

	// IconInsert counts failed INSERT statements against the icons table.
	IconInsert int
}
// fetchBatch loads the next page of hosts that have not been parsed yet.
//
// It selects rows from hosts where parsed is FALSE and id is greater than
// lastID, ordered by id, and returns at most limit of them.
//
// A query or scan failure yields a nil slice and the error. An error
// reported by the row iterator after the loop is returned together with
// whatever rows were collected up to that point.
func fetchBatch(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit int) ([]Host, error) {
	const query = `SELECT id, hostname, protocol, warc_filename, warc_record_offset, warc_record_length
FROM hosts
WHERE parsed = FALSE AND id > $1
ORDER BY id
LIMIT $2`

	cursor, err := pool.Query(ctx, query, lastID, limit)
	if err != nil {
		return nil, err
	}
	defer cursor.Close()

	var batch []Host
	for cursor.Next() {
		var rec Host
		// Destination order must match the SELECT column order above.
		if scanErr := cursor.Scan(
			&rec.ID,
			&rec.Hostname,
			&rec.Protocol,
			&rec.WarcFilename,
			&rec.WarcRecordOffset,
			&rec.WarcRecordLength,
		); scanErr != nil {
			return nil, scanErr
		}
		batch = append(batch, rec)
	}
	return batch, cursor.Err()
}
// writeResult persists one host's parse result and reports how many
// database writes failed.
//
// Steps, in order:
//  1. UPDATE the hosts row with the title and iframe flag and set
//     parsed = TRUE. If this fails, the failure is counted in HostUpdate and
//     the function returns without touching the icons table.
//  2. INSERT a 'favicon_ico' icons row for <protocol>://<hostname>/favicon.ico.
//  3. INSERT one icons row per entry in result.Icons.
//
// Insert failures in steps 2 and 3 are counted in IconInsert and do not stop
// the remaining inserts. Every failure is printed to stdout and, when
// logWriter is non-nil, also passed to logWriter.Write.
//
// NOTE(review): each statement is a separate pool.Exec call with no explicit
// transaction, and parsed = TRUE is written before the icon inserts. Because
// fetchBatch only selects rows with parsed = FALSE, a host whose icon inserts
// fail is not selected again — confirm this best-effort behavior is intended.
func writeResult(ctx context.Context, pool *pgxpool.Pool, host Host, result ProcessResult, logWriter *LogWriter) WriteErrors {
	// report echoes a failure line to stdout and forwards it to the optional
	// log writer.
	report := func(line string) {
		fmt.Println(line)
		if logWriter != nil {
			logWriter.Write(line, true)
		}
	}

	var counts WriteErrors

	// Step 1: mark the host as parsed and store its extracted attributes.
	if _, err := pool.Exec(ctx,
		`UPDATE hosts SET html_title = $1, iframe_allowed = $2, parsed = TRUE WHERE id = $3`,
		nilIfEmpty(result.Title), result.IframeAllowed, host.ID,
	); err != nil {
		counts.HostUpdate++
		report(fmt.Sprintf("DB_ERROR: %s hosts_update: %v", host.Hostname, err))
		return counts
	}

	// Step 2: every host gets a row for the conventional /favicon.ico path.
	defaultIcon := fmt.Sprintf("%s://%s/favicon.ico", host.Protocol, host.Hostname)
	if _, err := pool.Exec(ctx,
		`INSERT INTO icons (host_id, url, source) VALUES ($1, $2, 'favicon_ico')`,
		host.ID, defaultIcon,
	); err != nil {
		counts.IconInsert++
		report(fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err))
	}

	// Step 3: icons carried in the result; empty rel fields are stored as NULL.
	for i := range result.Icons {
		entry := result.Icons[i]
		if _, err := pool.Exec(ctx,
			`INSERT INTO icons (host_id, url, source, rel_type, rel_sizes) VALUES ($1, $2, $3, $4, $5)`,
			host.ID, entry.URL, entry.Source, nilIfEmpty(entry.RelType), nilIfEmpty(entry.RelSizes),
		); err != nil {
			counts.IconInsert++
			report(fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err))
		}
	}

	return counts
}
// nilIfEmpty converts a string into an optional value: it returns nil for
// the empty string and otherwise a pointer to a copy of s. writeResult uses
// it so that empty fields are sent to the database as NULL.
func nilIfEmpty(s string) *string {
	if len(s) > 0 {
		return &s
	}
	return nil
}