package main

import (
	"context"
	"fmt"

	"github.com/jackc/pgx/v5/pgxpool"
)

// Host represents a row from the hosts table.
//
// fetchBatch fills the fields from the id, hostname, protocol, warc_filename,
// warc_record_offset and warc_record_length columns, in that order.
type Host struct {
	ID               int64
	Hostname         string
	Protocol         string
	WarcFilename     string
	WarcRecordOffset int64
	WarcRecordLength int
}

// ProcessResult holds everything extracted from one host's WARC record.
type ProcessResult struct {
	Title         string // written to hosts.html_title; empty is stored as NULL
	IframeAllowed bool   // written to hosts.iframe_allowed
	Icons         []Icon // each entry becomes one row in the icons table
	Err           error  // presumably the processing failure, if any — set by code outside this view
	FetchErr      bool   // true if error was during fetch (vs parse)
}

// WriteErrors tracks errors encountered during DB writes.
type WriteErrors struct {
	HostUpdate int // number of failed UPDATEs on the hosts table
	IconInsert int // number of failed INSERTs into the icons table
}

// fetchBatch gets the next batch of unparsed hosts after lastID.
//
// It selects rows with parsed = FALSE and id > lastID, ordered by id and
// capped at limit rows, so a caller can page through the table by passing the
// ID of the last host it received. A nil slice with a nil error means no
// matching rows were found.
//
// On failure the returned slice is always nil and the error is wrapped with
// the step that failed (query, scan, or row iteration); the underlying error
// stays reachable through errors.Is / errors.As.
func fetchBatch(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit int) ([]Host, error) {
	const query = `SELECT id, hostname, protocol, warc_filename, warc_record_offset, warc_record_length` +
		` FROM hosts` +
		` WHERE parsed = FALSE AND id > $1` +
		` ORDER BY id` +
		` LIMIT $2`

	rows, err := pool.Query(ctx, query, lastID, limit)
	if err != nil {
		return nil, fmt.Errorf("querying unparsed hosts after id %d: %w", lastID, err)
	}
	defer rows.Close()

	var hosts []Host
	for rows.Next() {
		var h Host
		if err := rows.Scan(
			&h.ID,
			&h.Hostname,
			&h.Protocol,
			&h.WarcFilename,
			&h.WarcRecordOffset,
			&h.WarcRecordLength,
		); err != nil {
			return nil, fmt.Errorf("scanning host row: %w", err)
		}
		hosts = append(hosts, h)
	}
	// rows.Next returning false can mean "no more rows" or "iteration failed";
	// only rows.Err distinguishes the two. Do not hand back a partial batch
	// alongside an error.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("reading unparsed hosts after id %d: %w", lastID, err)
	}
	return hosts, nil
}

// writeResult writes parsed results back to the database.
// Returns counts of DB write errors encountered.
func writeResult(ctx context.Context, pool *pgxpool.Pool, host Host, result ProcessResult, logWriter *LogWriter) WriteErrors { var errs WriteErrors // Update hosts table _, err := pool.Exec(ctx, `UPDATE hosts SET html_title = $1, iframe_allowed = $2, parsed = TRUE WHERE id = $3`, nilIfEmpty(result.Title), result.IframeAllowed, host.ID) if err != nil { errs.HostUpdate++ logLine := fmt.Sprintf("DB_ERROR: %s hosts_update: %v", host.Hostname, err) fmt.Println(logLine) if logWriter != nil { logWriter.Write(logLine, true) } return errs } // Insert /favicon.ico entry faviconURL := fmt.Sprintf("%s://%s/favicon.ico", host.Protocol, host.Hostname) _, err = pool.Exec(ctx, `INSERT INTO icons (host_id, url, source) VALUES ($1, $2, 'favicon_ico')`, host.ID, faviconURL) if err != nil { errs.IconInsert++ logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err) fmt.Println(logLine) if logWriter != nil { logWriter.Write(logLine, true) } } // Insert link rel="icon" entries for _, icon := range result.Icons { _, err = pool.Exec(ctx, `INSERT INTO icons (host_id, url, source, rel_type, rel_sizes) VALUES ($1, $2, $3, $4, $5)`, host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes)) if err != nil { errs.IconInsert++ logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err) fmt.Println(logLine) if logWriter != nil { logWriter.Write(logLine, true) } } } return errs } func nilIfEmpty(s string) *string { if s == "" { return nil } return &s }