diff --git a/pipeline/04_best_icon/select.sql b/pipeline/04_best_icon/select.sql
index 296c319..2e7dbea 100644
--- a/pipeline/04_best_icon/select.sql
+++ b/pipeline/04_best_icon/select.sql
@@ -10,41 +10,70 @@
 -- 4. Tiebreaker: smaller file size
 -- SVGs excluded (not supported in bundle generation). Icons ≤2x2 excluded (tracking pixels).
 --
+-- Two-step: SELECT into temp table (index-only scan, no heap), then single bulk UPDATE.
+-- Requires idx_icons_best covering index on (host_id) INCLUDE (s3_key, content_type, width, height, file_size).
+-- NOTE(review): the WHERE clause also reads scan_state, which is not in that column list; an
+-- index-only scan presumably needs the index to be partial on it or to include it — confirm.
+-- Exact ties after file size are broken by s3_key so repeated runs pick the same icon.
+-- The UPDATE skips hosts whose best_icon_s3_key already matches, avoiding no-op row rewrites.
+--
 -- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql
 
-UPDATE hosts h SET best_icon_s3_key = sub.s3_key
-FROM (
-    SELECT DISTINCT ON (i.host_id) i.host_id, i.s3_key
-    FROM icons i
-    WHERE i.scan_state = 'completed'
-      AND i.s3_key IS NOT NULL
-      AND i.content_type != 'image/svg+xml'
-      AND (i.width IS NULL OR i.width > 2)
-      AND (i.height IS NULL OR i.height > 2)
-    ORDER BY i.host_id,
-        -- Tier: ≥32 preferred over <32. NULL dimensions go last.
-        CASE
-            WHEN LEAST(COALESCE(i.width, 0), COALESCE(i.height, 0)) >= 32 THEN 0
-            WHEN COALESCE(i.width, 0) > 0 THEN 1
-            ELSE 2
-        END,
-        -- Within ≥32: smallest first (closest to 32). Within <32: largest first.
-        CASE
-            WHEN LEAST(COALESCE(i.width, 0), COALESCE(i.height, 0)) >= 32
-                THEN GREATEST(COALESCE(i.width, 0), COALESCE(i.height, 0))
-            ELSE -GREATEST(COALESCE(i.width, 0), COALESCE(i.height, 0))
-        END,
-        -- Format preference
-        CASE
-            WHEN i.content_type = 'image/png' THEN 0
-            WHEN i.content_type IN ('image/x-icon', 'image/vnd.microsoft.icon') THEN 1
-            WHEN i.content_type IN ('image/gif', 'image/jpeg', 'image/bmp') THEN 2
-            WHEN i.content_type = 'image/webp' THEN 3
-            ELSE 4
-        END,
-        i.file_size ASC
-) sub
-WHERE h.id = sub.host_id;
+SET work_mem = '2GB';
+
+\echo 'Step 1: Finding best icon per host...'
+\timing on
+
+-- Clear a leftover temp table from an aborted run in the same session (pg_temp only).
+DROP TABLE IF EXISTS pg_temp.best_icons;
+
+CREATE TEMP TABLE best_icons AS
+SELECT DISTINCT ON (host_id) host_id, s3_key
+FROM icons
+WHERE scan_state = 'completed'
+  AND s3_key IS NOT NULL
+  AND content_type != 'image/svg+xml'
+  AND (width IS NULL OR width > 2)
+  AND (height IS NULL OR height > 2)
+ORDER BY host_id,
+    -- Tier: ≥32 preferred over <32. NULL dimensions go last.
+    CASE
+        WHEN LEAST(COALESCE(width, 0), COALESCE(height, 0)) >= 32 THEN 0
+        WHEN COALESCE(width, 0) > 0 THEN 1
+        ELSE 2
+    END,
+    -- Within ≥32: smallest first (closest to 32). Within <32: largest first.
+    CASE
+        WHEN LEAST(COALESCE(width, 0), COALESCE(height, 0)) >= 32
+            THEN GREATEST(COALESCE(width, 0), COALESCE(height, 0))
+        ELSE -GREATEST(COALESCE(width, 0), COALESCE(height, 0))
+    END,
+    -- Format preference
+    CASE
+        WHEN content_type = 'image/png' THEN 0
+        WHEN content_type IN ('image/x-icon', 'image/vnd.microsoft.icon') THEN 1
+        WHEN content_type IN ('image/gif', 'image/jpeg', 'image/bmp') THEN 2
+        WHEN content_type = 'image/webp' THEN 3
+        ELSE 4
+    END,
+    file_size ASC,
+    -- Final tiebreaker so equal candidates resolve the same way on every run.
+    s3_key ASC;
+
+-- Autovacuum cannot analyze temp tables; collect stats so the UPDATE join is planned well.
+ANALYZE best_icons;
+
+\echo 'Step 2: Updating hosts...'
+
+UPDATE hosts h SET best_icon_s3_key = b.s3_key
+FROM best_icons b
+WHERE h.id = b.host_id
+  -- Only touch rows whose value actually changes (NULL-safe comparison).
+  AND h.best_icon_s3_key IS DISTINCT FROM b.s3_key;
+
+\timing off
+
+DROP TABLE best_icons;
 
 -- Stats (human-readable)
 \echo '--- Best Icon Selection Stats ---'