diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 332a451..8ebae4c 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -96,7 +96,7 @@ Icons are stored on local disk during scanning, not S3. The EBS volume holds the | warc_record_length | INT NOT NULL | Length of WARC record | | html_title | TEXT | Extracted from `<title>` tag | | iframe_allowed | BOOLEAN | True if site allows framing | -| best_icon_s3_key | TEXT | SHA-256 hash of the chosen icon file (denormalized for fast bundle gen) | +| best_icon_hash | TEXT | SHA-256 hash of the chosen icon file (denormalized for fast bundle gen) | | parsed | BOOLEAN DEFAULT FALSE | Whether WARC has been parsed | | random_order | DOUBLE PRECISION DEFAULT random() | Random value for shuffled bundle generation pagination | @@ -114,7 +114,7 @@ Icons are stored on local disk during scanning, not S3. The EBS volume holds the | width | INT | Best usable pixel width (for ICO: largest standard size ≤64; for SVG: NULL) | | height | INT | Best usable pixel height (for ICO: largest standard size ≤64; for SVG: NULL) | | file_size | INT | Size in bytes | -| s3_key | TEXT | SHA-256 hash of content (used as local file path, legacy column name) | +| icon_hash | TEXT | SHA-256 hash of content (used as local file path: `ab/cd/ef/{hash}`) | | scan_state | TEXT DEFAULT 'unscanned' | `unscanned`, `in_progress`, `completed`, `failed` | | error | TEXT | Error message if failed | | downloaded_at | TIMESTAMPTZ | When the icon was fetched (NULL if not yet downloaded) | @@ -251,7 +251,7 @@ WHERE url_path = '/' - SVG: store width=NULL, height=NULL (vector, no pixel size) - Compute SHA-256 of content - Write to local disk at `{icons_dir}/ab/cd/ef/{sha256}` (skip if file already exists — dedup) - - Update icons row: s3_key (the SHA-256 hash), content_type (from actual data, not HTTP header), width, height, file_size, scan_state = 'completed' + - Update icons row: icon_hash (the SHA-256 hash), content_type (from actual data, not HTTP header), width, height, 
file_size, scan_state = 'completed' - On failure: scan_state = 'failed', error = reason **Concurrency:** Channel-based worker pool (default 2500 workers, configurable). Producer goroutine feeds a buffered channel (buffer = batch size), shuffles each batch to avoid hitting the same host back-to-back. N workers consume from the channel. @@ -366,7 +366,7 @@ Each pipeline stage has different bottlenecks. Understanding these explains the - **Memory is the concurrency limit** — each goroutine holds a TCP connection + TLS session + icon data buffer. At 5000 workers on c5.2xlarge (16GB), ~2-3GB for connection overhead — comfortable. - **Disk I/O is negligible** — icons are small (median ~5KB), writes are sharded across directories. - **DNS is cached** — Unbound's aggressive caching (1.7GB cache, 3600s min-TTL) means repeat TLD/nameserver lookups are instant. First-seen domains incur recursive resolution (~50-100ms) but this is pipelined with the HTTP request. -- **Measured: 439 icons/sec** at concurrency 1000 on c5.xlarge. Expected to improve significantly at 5000 concurrency on c5.2xlarge. +- **Measured: 2,136 icons/sec** at concurrency 5000 on c5.2xlarge (up from 439/sec at 1000 concurrency on c5.xlarge). CPU-bound at 90%. ### Stage 4: Best Icon Selection - **CPU-bound (Postgres).** Single SQL query with `DISTINCT ON` and multi-column sort. Runs in seconds even at 30M — Postgres handles this efficiently with the `idx_icons_host_id` index. @@ -591,4 +591,4 @@ If the site gets significant traffic beyond CloudFront free tier, costs scale wi 9. **Per-millisecond random seed** — Every visitor sees a unique arrangement. No shared state, no server needed for randomization. 10. **Viewport-sized bundles** — ~100-150 tabs per bundle, tuned to fill a screen. Faster loads, smaller memory footprint than 1MB bundles. 11. **Include no-icon hosts** — Any host with a title is included. Firefox-style rendering (title only) for hosts without favicons. -12. 
**Denormalized best_icon_s3_key in hosts** — Stores the SHA-256 hash of the chosen icon. Avoids joins during bundle generation. Written once during icon selection, read once during bundling. +12. **Denormalized best_icon_hash in hosts** — Stores the SHA-256 hash of the chosen icon. Avoids joins during bundle generation. Written once during icon selection, read once during bundling. diff --git a/pipeline/01_cc_index/schema.sql b/pipeline/01_cc_index/schema.sql index 154e521..0387c05 100644 --- a/pipeline/01_cc_index/schema.sql +++ b/pipeline/01_cc_index/schema.sql @@ -12,7 +12,7 @@ CREATE TABLE IF NOT EXISTS hosts ( warc_record_length INT NOT NULL, html_title TEXT, iframe_allowed BOOLEAN, - best_icon_s3_key TEXT, + best_icon_hash TEXT, parsed BOOLEAN DEFAULT FALSE, random_order DOUBLE PRECISION DEFAULT random() ); @@ -28,7 +28,7 @@ CREATE TABLE IF NOT EXISTS icons ( width INT, height INT, file_size INT, - s3_key TEXT, + icon_hash TEXT, scan_state TEXT DEFAULT 'unscanned', error TEXT, downloaded_at TIMESTAMPTZ diff --git a/pipeline/03_icon_download/db.go b/pipeline/03_icon_download/db.go index d5ec91d..91960ab 100644 --- a/pipeline/03_icon_download/db.go +++ b/pipeline/03_icon_download/db.go @@ -43,7 +43,7 @@ func claimBatch(ctx context.Context, pool *pgxpool.Pool, limit int) ([]IconRow, // DownloadResult holds the outcome of downloading one icon. 
type DownloadResult struct { - S3Key string + IconHash string ContentType string Width int Height int @@ -65,14 +65,14 @@ func updateIcon(ctx context.Context, pool *pgxpool.Pool, iconID int64, result Do _, err := pool.Exec(ctx, ` UPDATE icons SET scan_state = 'completed', - s3_key = $1, + icon_hash = $1, content_type = $2, width = $3, height = $4, file_size = $5, downloaded_at = now() WHERE id = $6`, - result.S3Key, result.ContentType, + result.IconHash, result.ContentType, nilIntIf(result.Width, 0), nilIntIf(result.Height, 0), result.FileSize, iconID) return err diff --git a/pipeline/03_icon_download/download.go b/pipeline/03_icon_download/download.go index 4e5b8e7..8734dff 100644 --- a/pipeline/03_icon_download/download.go +++ b/pipeline/03_icon_download/download.go @@ -37,22 +37,22 @@ func processIcon(icon IconRow, cfg Config) DownloadResult { // Compute SHA-256 for content-addressed storage hash := sha256.Sum256(data) - s3Key := hex.EncodeToString(hash[:]) + iconHash := hex.EncodeToString(hash[:]) // Write to disk (skip if already exists — dedup) dedup := false if !cfg.DryRun { - if iconExists(s3Key) { + if iconExists(iconHash) { dedup = true } else { - if err := iconWrite(s3Key, data); err != nil { + if err := iconWrite(iconHash, data); err != nil { return DownloadResult{Err: fmt.Sprintf("disk write: %v", err), ErrType: "other"} } } } return DownloadResult{ - S3Key: s3Key, + IconHash: iconHash, ContentType: contentType, Width: width, Height: height, diff --git a/pipeline/04_best_icon/select.sql b/pipeline/04_best_icon/select.sql index 2e7dbea..ac1dda6 100644 --- a/pipeline/04_best_icon/select.sql +++ b/pipeline/04_best_icon/select.sql @@ -1,5 +1,5 @@ -- Best Icon Selection --- Picks the best completed icon for each host and stores its s3_key in hosts.best_icon_s3_key. +-- Picks the best completed icon for each host and stores its icon_hash in hosts.best_icon_hash. -- -- Target: 32x32 source icon (displayed at 16x16 CSS, crisp on 2x Retina). 
-- @@ -11,7 +11,7 @@ -- SVGs excluded (not supported in bundle generation). Icons ≤2x2 excluded (tracking pixels). -- -- Two-step: SELECT into temp table (index-only scan, no heap), then single bulk UPDATE. --- Requires idx_icons_best covering index on (host_id) INCLUDE (s3_key, content_type, width, height, file_size). +-- Requires idx_icons_best covering index on (host_id) INCLUDE (icon_hash, content_type, width, height, file_size). -- -- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql @@ -21,10 +21,10 @@ SET work_mem = '2GB'; \timing on CREATE TEMP TABLE best_icons AS -SELECT DISTINCT ON (host_id) host_id, s3_key +SELECT DISTINCT ON (host_id) host_id, icon_hash FROM icons WHERE scan_state = 'completed' - AND s3_key IS NOT NULL + AND icon_hash IS NOT NULL AND content_type != 'image/svg+xml' AND (width IS NULL OR width > 2) AND (height IS NULL OR height > 2) @@ -50,7 +50,7 @@ ORDER BY host_id, \echo 'Step 2: Updating hosts...' -UPDATE hosts h SET best_icon_s3_key = b.s3_key +UPDATE hosts h SET best_icon_hash = b.icon_hash FROM best_icons b WHERE h.id = b.host_id; \timing off @@ -61,10 +61,10 @@ DROP TABLE best_icons; \echo '--- Best Icon Selection Stats ---' SELECT - COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL) AS hosts_with_icon, - COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL) AS hosts_without_icon, + COUNT(*) FILTER (WHERE best_icon_hash IS NOT NULL) AS hosts_with_icon, + COUNT(*) FILTER (WHERE best_icon_hash IS NULL) AS hosts_without_icon, COUNT(*) FILTER (WHERE html_title IS NOT NULL) AS hosts_with_title, - COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL) AS title_but_no_icon + COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_hash IS NULL) AS title_but_no_icon FROM hosts WHERE parsed = TRUE; @@ -74,11 +74,11 @@ WHERE parsed = TRUE; \a \o stats/04_best_icon.json SELECT json_build_object( - 'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL), - 'hosts_without_icon', COUNT(*) FILTER 
(WHERE best_icon_s3_key IS NULL), + 'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_hash IS NOT NULL), + 'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_hash IS NULL), 'hosts_with_title', COUNT(*) FILTER (WHERE html_title IS NOT NULL), 'hosts_no_title', COUNT(*) FILTER (WHERE html_title IS NULL), - 'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL) + 'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_hash IS NULL) ) FROM hosts WHERE parsed = TRUE; diff --git a/pipeline/05_bundle_gen/bundle.go b/pipeline/05_bundle_gen/bundle.go index 274f4ec..4692c8a 100644 --- a/pipeline/05_bundle_gen/bundle.go +++ b/pipeline/05_bundle_gen/bundle.go @@ -31,11 +31,11 @@ func buildEntry(host HostRow, iconsDir string, logWriter *LogWriter, stats *Stat IframeOk: host.IframeAllowed, } - if host.BestIconS3Key == "" { + if host.BestIconHash == "" { return entry } - encoded, w, h, convertErr := safeConvert(host.BestIconS3Key, iconsDir) + encoded, w, h, convertErr := safeConvert(host.BestIconHash, iconsDir) if convertErr != "" { stats.ConvertErrors.Add(1) logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr) diff --git a/pipeline/05_bundle_gen/db.go b/pipeline/05_bundle_gen/db.go index 2536158..7328203 100644 --- a/pipeline/05_bundle_gen/db.go +++ b/pipeline/05_bundle_gen/db.go @@ -12,14 +12,14 @@ type HostRow struct { Protocol string HtmlTitle string IframeAllowed bool - BestIconS3Key string + BestIconHash string RandomOrder float64 } // fetchHostsPage gets a page of hosts with titles, ordered by random_order for shuffled bundles. 
func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64, limit int) ([]HostRow, error) { rows, err := pool.Query(ctx, ` - SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_s3_key, ''), random_order + SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_hash, ''), random_order FROM hosts WHERE html_title IS NOT NULL AND random_order > $1 ORDER BY random_order @@ -33,7 +33,7 @@ func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64, var hosts []HostRow for rows.Next() { var h HostRow - if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconS3Key, &h.RandomOrder); err != nil { + if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconHash, &h.RandomOrder); err != nil { return nil, err } hosts = append(hosts, h) diff --git a/pipeline/05_bundle_gen/main.go b/pipeline/05_bundle_gen/main.go index 0f39643..3fdb913 100644 --- a/pipeline/05_bundle_gen/main.go +++ b/pipeline/05_bundle_gen/main.go @@ -93,7 +93,7 @@ func main() { if err != nil { log.Fatalf("Failed to count hosts: %v", err) } - err = pool.QueryRow(ctx, `SELECT COUNT(*) FROM hosts WHERE html_title IS NOT NULL AND best_icon_s3_key IS NOT NULL`).Scan(&hostsWithIcon) + err = pool.QueryRow(ctx, `SELECT COUNT(*) FROM hosts WHERE html_title IS NOT NULL AND best_icon_hash IS NOT NULL`).Scan(&hostsWithIcon) if err != nil { log.Fatalf("Failed to count icons: %v", err) }