Rename s3_key to icon_hash (and best_icon_s3_key to best_icon_hash)
This commit is contained in:
parent
e308718eb2
commit
33bd0a221e
8 changed files with 31 additions and 31 deletions
|
|
@ -96,7 +96,7 @@ Icons are stored on local disk during scanning, not S3. The EBS volume holds the
|
||||||
| warc_record_length | INT NOT NULL | Length of WARC record |
|
| warc_record_length | INT NOT NULL | Length of WARC record |
|
||||||
| html_title | TEXT | Extracted from `<title>` tag |
|
| html_title | TEXT | Extracted from `<title>` tag |
|
||||||
| iframe_allowed | BOOLEAN | True if site allows framing |
|
| iframe_allowed | BOOLEAN | True if site allows framing |
|
||||||
| best_icon_s3_key | TEXT | SHA-256 hash of the chosen icon file (denormalized for fast bundle gen) |
|
| best_icon_hash | TEXT | SHA-256 hash of the chosen icon file (denormalized for fast bundle generation) |
|
||||||
| parsed | BOOLEAN DEFAULT FALSE | Whether WARC has been parsed |
|
| parsed | BOOLEAN DEFAULT FALSE | Whether WARC has been parsed |
|
||||||
| random_order | DOUBLE PRECISION DEFAULT random() | Random value for shuffled bundle generation pagination |
|
| random_order | DOUBLE PRECISION DEFAULT random() | Random value for shuffled bundle generation pagination |
|
||||||
|
|
||||||
|
|
@ -114,7 +114,7 @@ Icons are stored on local disk during scanning, not S3. The EBS volume holds the
|
||||||
| width | INT | Best usable pixel width (for ICO: largest standard size ≤64; for SVG: NULL) |
|
| width | INT | Best usable pixel width (for ICO: largest standard size ≤64; for SVG: NULL) |
|
||||||
| height | INT | Best usable pixel height (for ICO: largest standard size ≤64; for SVG: NULL) |
|
| height | INT | Best usable pixel height (for ICO: largest standard size ≤64; for SVG: NULL) |
|
||||||
| file_size | INT | Size in bytes |
|
| file_size | INT | Size in bytes |
|
||||||
| s3_key | TEXT | SHA-256 hash of content (used as local file path, legacy column name) |
|
| icon_hash | TEXT | SHA-256 hash of content (used as local file path: `ab/cd/ef/{hash}`) |
|
||||||
| scan_state | TEXT DEFAULT 'unscanned' | `unscanned`, `in_progress`, `completed`, `failed` |
|
| scan_state | TEXT DEFAULT 'unscanned' | `unscanned`, `in_progress`, `completed`, `failed` |
|
||||||
| error | TEXT | Error message if failed |
|
| error | TEXT | Error message if failed |
|
||||||
| downloaded_at | TIMESTAMPTZ | When the icon was fetched (NULL if not yet downloaded) |
|
| downloaded_at | TIMESTAMPTZ | When the icon was fetched (NULL if not yet downloaded) |
|
||||||
|
|
@ -251,7 +251,7 @@ WHERE url_path = '/'
|
||||||
- SVG: store width=NULL, height=NULL (vector, no pixel size)
|
- SVG: store width=NULL, height=NULL (vector, no pixel size)
|
||||||
- Compute SHA-256 of content
|
- Compute SHA-256 of content
|
||||||
- Write to local disk at `{icons_dir}/ab/cd/ef/{sha256}` (skip if file already exists — dedup)
|
- Write to local disk at `{icons_dir}/ab/cd/ef/{sha256}` (skip if file already exists — dedup)
|
||||||
- Update icons row: s3_key (the SHA-256 hash), content_type (from actual data, not HTTP header), width, height, file_size, scan_state = 'completed'
|
- Update icons row: icon_hash (the SHA-256 hash), content_type (from actual data, not HTTP header), width, height, file_size, scan_state = 'completed'
|
||||||
- On failure: scan_state = 'failed', error = reason
|
- On failure: scan_state = 'failed', error = reason
|
||||||
|
|
||||||
**Concurrency:** Channel-based worker pool (default 2500 workers, configurable). Producer goroutine feeds a buffered channel (buffer = batch size), shuffles each batch to avoid hitting the same host back-to-back. N workers consume from the channel.
|
**Concurrency:** Channel-based worker pool (default 2500 workers, configurable). Producer goroutine feeds a buffered channel (buffer = batch size), shuffles each batch to avoid hitting the same host back-to-back. N workers consume from the channel.
|
||||||
|
|
@ -366,7 +366,7 @@ Each pipeline stage has different bottlenecks. Understanding these explains the
|
||||||
- **Memory is the concurrency limit** — each goroutine holds a TCP connection + TLS session + icon data buffer. At 5000 workers on c5.2xlarge (16GB), ~2-3GB for connection overhead — comfortable.
|
- **Memory is the concurrency limit** — each goroutine holds a TCP connection + TLS session + icon data buffer. At 5000 workers on c5.2xlarge (16GB), ~2-3GB for connection overhead — comfortable.
|
||||||
- **Disk I/O is negligible** — icons are small (median ~5KB), writes are sharded across directories.
|
- **Disk I/O is negligible** — icons are small (median ~5KB), writes are sharded across directories.
|
||||||
- **DNS is cached** — Unbound's aggressive caching (1.7GB cache, 3600s min-TTL) means repeat TLD/nameserver lookups are instant. First-seen domains incur recursive resolution (~50-100ms) but this is pipelined with the HTTP request.
|
- **DNS is cached** — Unbound's aggressive caching (1.7GB cache, 3600s min-TTL) means repeat TLD/nameserver lookups are instant. First-seen domains incur recursive resolution (~50-100ms) but this is pipelined with the HTTP request.
|
||||||
- **Measured: 439 icons/sec** at concurrency 1000 on c5.xlarge. Expected to improve significantly at 5000 concurrency on c5.2xlarge.
|
- **Measured: 2,136 icons/sec** at concurrency 5000 on c5.2xlarge (up from 439/sec at 1000 concurrency on c5.xlarge). At this concurrency the stage is CPU-bound, with CPU utilization around 90%.
|
||||||
|
|
||||||
### Stage 4: Best Icon Selection
|
### Stage 4: Best Icon Selection
|
||||||
- **CPU-bound (Postgres).** Single SQL query with `DISTINCT ON` and multi-column sort. Runs in seconds even at 30M — Postgres handles this efficiently with the `idx_icons_host_id` index.
|
- **CPU-bound (Postgres).** Single SQL query with `DISTINCT ON` and multi-column sort. Runs in seconds even at 30M — Postgres handles this efficiently with the `idx_icons_host_id` index.
|
||||||
|
|
@ -591,4 +591,4 @@ If the site gets significant traffic beyond CloudFront free tier, costs scale wi
|
||||||
9. **Per-millisecond random seed** — Every visitor sees a unique arrangement. No shared state, no server needed for randomization.
|
9. **Per-millisecond random seed** — Every visitor sees a unique arrangement. No shared state, no server needed for randomization.
|
||||||
10. **Viewport-sized bundles** — ~100-150 tabs per bundle, tuned to fill a screen. Faster loads, smaller memory footprint than 1MB bundles.
|
10. **Viewport-sized bundles** — ~100-150 tabs per bundle, tuned to fill a screen. Faster loads, smaller memory footprint than 1MB bundles.
|
||||||
11. **Include no-icon hosts** — Any host with a title is included. Firefox-style rendering (title only) for hosts without favicons.
|
11. **Include no-icon hosts** — Any host with a title is included. Firefox-style rendering (title only) for hosts without favicons.
|
||||||
12. **Denormalized best_icon_s3_key in hosts** — Stores the SHA-256 hash of the chosen icon. Avoids joins during bundle generation. Written once during icon selection, read once during bundling.
|
12. **Denormalized best_icon_hash in hosts** — Stores the SHA-256 hash of the chosen icon. Avoids joins during bundle generation. Written once during icon selection, read once during bundling.
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ CREATE TABLE IF NOT EXISTS hosts (
|
||||||
warc_record_length INT NOT NULL,
|
warc_record_length INT NOT NULL,
|
||||||
html_title TEXT,
|
html_title TEXT,
|
||||||
iframe_allowed BOOLEAN,
|
iframe_allowed BOOLEAN,
|
||||||
best_icon_s3_key TEXT,
|
best_icon_hash TEXT,
|
||||||
parsed BOOLEAN DEFAULT FALSE,
|
parsed BOOLEAN DEFAULT FALSE,
|
||||||
random_order DOUBLE PRECISION DEFAULT random()
|
random_order DOUBLE PRECISION DEFAULT random()
|
||||||
);
|
);
|
||||||
|
|
@ -28,7 +28,7 @@ CREATE TABLE IF NOT EXISTS icons (
|
||||||
width INT,
|
width INT,
|
||||||
height INT,
|
height INT,
|
||||||
file_size INT,
|
file_size INT,
|
||||||
s3_key TEXT,
|
icon_hash TEXT,
|
||||||
scan_state TEXT DEFAULT 'unscanned',
|
scan_state TEXT DEFAULT 'unscanned',
|
||||||
error TEXT,
|
error TEXT,
|
||||||
downloaded_at TIMESTAMPTZ
|
downloaded_at TIMESTAMPTZ
|
||||||
|
|
|
||||||
|
|
@ -43,7 +43,7 @@ func claimBatch(ctx context.Context, pool *pgxpool.Pool, limit int) ([]IconRow,
|
||||||
|
|
||||||
// DownloadResult holds the outcome of downloading one icon.
|
// DownloadResult holds the outcome of downloading one icon.
|
||||||
type DownloadResult struct {
|
type DownloadResult struct {
|
||||||
S3Key string
|
IconHash string
|
||||||
ContentType string
|
ContentType string
|
||||||
Width int
|
Width int
|
||||||
Height int
|
Height int
|
||||||
|
|
@ -65,14 +65,14 @@ func updateIcon(ctx context.Context, pool *pgxpool.Pool, iconID int64, result Do
|
||||||
_, err := pool.Exec(ctx, `
|
_, err := pool.Exec(ctx, `
|
||||||
UPDATE icons SET
|
UPDATE icons SET
|
||||||
scan_state = 'completed',
|
scan_state = 'completed',
|
||||||
s3_key = $1,
|
icon_hash = $1,
|
||||||
content_type = $2,
|
content_type = $2,
|
||||||
width = $3,
|
width = $3,
|
||||||
height = $4,
|
height = $4,
|
||||||
file_size = $5,
|
file_size = $5,
|
||||||
downloaded_at = now()
|
downloaded_at = now()
|
||||||
WHERE id = $6`,
|
WHERE id = $6`,
|
||||||
result.S3Key, result.ContentType,
|
result.IconHash, result.ContentType,
|
||||||
nilIntIf(result.Width, 0), nilIntIf(result.Height, 0),
|
nilIntIf(result.Width, 0), nilIntIf(result.Height, 0),
|
||||||
result.FileSize, iconID)
|
result.FileSize, iconID)
|
||||||
return err
|
return err
|
||||||
|
|
|
||||||
|
|
@ -37,22 +37,22 @@ func processIcon(icon IconRow, cfg Config) DownloadResult {
|
||||||
|
|
||||||
// Compute SHA-256 for content-addressed storage
|
// Compute SHA-256 for content-addressed storage
|
||||||
hash := sha256.Sum256(data)
|
hash := sha256.Sum256(data)
|
||||||
s3Key := hex.EncodeToString(hash[:])
|
iconHash := hex.EncodeToString(hash[:])
|
||||||
|
|
||||||
// Write to disk (skip if already exists — dedup)
|
// Write to disk (skip if already exists — dedup)
|
||||||
dedup := false
|
dedup := false
|
||||||
if !cfg.DryRun {
|
if !cfg.DryRun {
|
||||||
if iconExists(s3Key) {
|
if iconExists(iconHash) {
|
||||||
dedup = true
|
dedup = true
|
||||||
} else {
|
} else {
|
||||||
if err := iconWrite(s3Key, data); err != nil {
|
if err := iconWrite(iconHash, data); err != nil {
|
||||||
return DownloadResult{Err: fmt.Sprintf("disk write: %v", err), ErrType: "other"}
|
return DownloadResult{Err: fmt.Sprintf("disk write: %v", err), ErrType: "other"}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return DownloadResult{
|
return DownloadResult{
|
||||||
S3Key: s3Key,
|
IconHash: iconHash,
|
||||||
ContentType: contentType,
|
ContentType: contentType,
|
||||||
Width: width,
|
Width: width,
|
||||||
Height: height,
|
Height: height,
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
-- Best Icon Selection
|
-- Best Icon Selection
|
||||||
-- Picks the best completed icon for each host and stores its s3_key in hosts.best_icon_s3_key.
|
-- Picks the best completed icon for each host and stores its icon_hash in hosts.best_icon_hash.
|
||||||
--
|
--
|
||||||
-- Target: 32x32 source icon (displayed at 16x16 CSS, crisp on 2x Retina).
|
-- Target: 32x32 source icon (displayed at 16x16 CSS, crisp on 2x Retina).
|
||||||
--
|
--
|
||||||
|
|
@ -11,7 +11,7 @@
|
||||||
-- SVGs excluded (not supported in bundle generation). Icons ≤2x2 excluded (tracking pixels).
|
-- SVGs excluded (not supported in bundle generation). Icons ≤2x2 excluded (tracking pixels).
|
||||||
--
|
--
|
||||||
-- Two-step: SELECT into temp table (index-only scan, no heap), then single bulk UPDATE.
|
-- Two-step: SELECT into temp table (index-only scan, no heap), then single bulk UPDATE.
|
||||||
-- Requires idx_icons_best covering index on (host_id) INCLUDE (s3_key, content_type, width, height, file_size).
|
-- Requires idx_icons_best covering index on (host_id) INCLUDE (icon_hash, content_type, width, height, file_size).
|
||||||
--
|
--
|
||||||
-- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql
|
-- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql
|
||||||
|
|
||||||
|
|
@ -21,10 +21,10 @@ SET work_mem = '2GB';
|
||||||
\timing on
|
\timing on
|
||||||
|
|
||||||
CREATE TEMP TABLE best_icons AS
|
CREATE TEMP TABLE best_icons AS
|
||||||
SELECT DISTINCT ON (host_id) host_id, s3_key
|
SELECT DISTINCT ON (host_id) host_id, icon_hash
|
||||||
FROM icons
|
FROM icons
|
||||||
WHERE scan_state = 'completed'
|
WHERE scan_state = 'completed'
|
||||||
AND s3_key IS NOT NULL
|
AND icon_hash IS NOT NULL
|
||||||
AND content_type != 'image/svg+xml'
|
AND content_type != 'image/svg+xml'
|
||||||
AND (width IS NULL OR width > 2)
|
AND (width IS NULL OR width > 2)
|
||||||
AND (height IS NULL OR height > 2)
|
AND (height IS NULL OR height > 2)
|
||||||
|
|
@ -50,7 +50,7 @@ ORDER BY host_id,
|
||||||
|
|
||||||
\echo 'Step 2: Updating hosts...'
|
\echo 'Step 2: Updating hosts...'
|
||||||
|
|
||||||
UPDATE hosts h SET best_icon_s3_key = b.s3_key
|
UPDATE hosts h SET best_icon_hash = b.icon_hash
|
||||||
FROM best_icons b WHERE h.id = b.host_id;
|
FROM best_icons b WHERE h.id = b.host_id;
|
||||||
|
|
||||||
\timing off
|
\timing off
|
||||||
|
|
@ -61,10 +61,10 @@ DROP TABLE best_icons;
|
||||||
\echo '--- Best Icon Selection Stats ---'
|
\echo '--- Best Icon Selection Stats ---'
|
||||||
|
|
||||||
SELECT
|
SELECT
|
||||||
COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL) AS hosts_with_icon,
|
COUNT(*) FILTER (WHERE best_icon_hash IS NOT NULL) AS hosts_with_icon,
|
||||||
COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL) AS hosts_without_icon,
|
COUNT(*) FILTER (WHERE best_icon_hash IS NULL) AS hosts_without_icon,
|
||||||
COUNT(*) FILTER (WHERE html_title IS NOT NULL) AS hosts_with_title,
|
COUNT(*) FILTER (WHERE html_title IS NOT NULL) AS hosts_with_title,
|
||||||
COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL) AS title_but_no_icon
|
COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_hash IS NULL) AS title_but_no_icon
|
||||||
FROM hosts
|
FROM hosts
|
||||||
WHERE parsed = TRUE;
|
WHERE parsed = TRUE;
|
||||||
|
|
||||||
|
|
@ -74,11 +74,11 @@ WHERE parsed = TRUE;
|
||||||
\a
|
\a
|
||||||
\o stats/04_best_icon.json
|
\o stats/04_best_icon.json
|
||||||
SELECT json_build_object(
|
SELECT json_build_object(
|
||||||
'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL),
|
'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_hash IS NOT NULL),
|
||||||
'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL),
|
'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_hash IS NULL),
|
||||||
'hosts_with_title', COUNT(*) FILTER (WHERE html_title IS NOT NULL),
|
'hosts_with_title', COUNT(*) FILTER (WHERE html_title IS NOT NULL),
|
||||||
'hosts_no_title', COUNT(*) FILTER (WHERE html_title IS NULL),
|
'hosts_no_title', COUNT(*) FILTER (WHERE html_title IS NULL),
|
||||||
'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL)
|
'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_hash IS NULL)
|
||||||
)
|
)
|
||||||
FROM hosts
|
FROM hosts
|
||||||
WHERE parsed = TRUE;
|
WHERE parsed = TRUE;
|
||||||
|
|
|
||||||
|
|
@ -31,11 +31,11 @@ func buildEntry(host HostRow, iconsDir string, logWriter *LogWriter, stats *Stat
|
||||||
IframeOk: host.IframeAllowed,
|
IframeOk: host.IframeAllowed,
|
||||||
}
|
}
|
||||||
|
|
||||||
if host.BestIconS3Key == "" {
|
if host.BestIconHash == "" {
|
||||||
return entry
|
return entry
|
||||||
}
|
}
|
||||||
|
|
||||||
encoded, w, h, convertErr := safeConvert(host.BestIconS3Key, iconsDir)
|
encoded, w, h, convertErr := safeConvert(host.BestIconHash, iconsDir)
|
||||||
if convertErr != "" {
|
if convertErr != "" {
|
||||||
stats.ConvertErrors.Add(1)
|
stats.ConvertErrors.Add(1)
|
||||||
logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr)
|
logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr)
|
||||||
|
|
|
||||||
|
|
@ -12,14 +12,14 @@ type HostRow struct {
|
||||||
Protocol string
|
Protocol string
|
||||||
HtmlTitle string
|
HtmlTitle string
|
||||||
IframeAllowed bool
|
IframeAllowed bool
|
||||||
BestIconS3Key string
|
BestIconHash string
|
||||||
RandomOrder float64
|
RandomOrder float64
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchHostsPage gets a page of hosts with titles, ordered by random_order for shuffled bundles.
|
// fetchHostsPage gets a page of hosts with titles, ordered by random_order for shuffled bundles.
|
||||||
func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64, limit int) ([]HostRow, error) {
|
func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64, limit int) ([]HostRow, error) {
|
||||||
rows, err := pool.Query(ctx, `
|
rows, err := pool.Query(ctx, `
|
||||||
SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_s3_key, ''), random_order
|
SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_hash, ''), random_order
|
||||||
FROM hosts
|
FROM hosts
|
||||||
WHERE html_title IS NOT NULL AND random_order > $1
|
WHERE html_title IS NOT NULL AND random_order > $1
|
||||||
ORDER BY random_order
|
ORDER BY random_order
|
||||||
|
|
@ -33,7 +33,7 @@ func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64,
|
||||||
var hosts []HostRow
|
var hosts []HostRow
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var h HostRow
|
var h HostRow
|
||||||
if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconS3Key, &h.RandomOrder); err != nil {
|
if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconHash, &h.RandomOrder); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
hosts = append(hosts, h)
|
hosts = append(hosts, h)
|
||||||
|
|
|
||||||
|
|
@ -93,7 +93,7 @@ func main() {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("Failed to count hosts: %v", err)
|
log.Fatalf("Failed to count hosts: %v", err)
|
||||||
}
|
}
|
||||||
err = pool.QueryRow(ctx, `SELECT COUNT(*) FROM hosts WHERE html_title IS NOT NULL AND best_icon_s3_key IS NOT NULL`).Scan(&hostsWithIcon)
|
err = pool.QueryRow(ctx, `SELECT COUNT(*) FROM hosts WHERE html_title IS NOT NULL AND best_icon_hash IS NOT NULL`).Scan(&hostsWithIcon)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("Failed to count icons: %v", err)
|
log.Fatalf("Failed to count icons: %v", err)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue