-- Best Icon Selection
-- Picks the best completed icon for each host and stores its icon_hash in
-- hosts.best_icon_hash.
--
-- Target: 32x32 source icon (displayed at 16x16 CSS, crisp on 2x Retina).
--
-- Priority:
--   1. Icons ≥32px: prefer smallest first (closest to 32 — a 32x32 beats a 48x48 beats a 180x180)
--   2. Icons <32px: prefer largest first (16x16 beats 8x8)
--   3. Within same size: prefer PNG > ICO > GIF/JPEG/BMP > WebP
--   4. Tiebreaker: smaller file size, then icon_hash (so reruns pick the same row)
-- An icon counts as "≥32px" only when BOTH known dimensions are ≥32; its "size"
-- for ordering is the larger of the two dimensions.
-- Icons with no known dimension rank after every sized icon.
-- SVGs excluded (not supported in bundle generation).
-- Icons with a known width or height ≤2px excluded (tracking pixels).
-- Rows with a NULL content_type are also dropped, because `!=` yields UNKNOWN
-- for NULL.
--
-- Two-step: SELECT into temp table (index-only scan, no heap), then single bulk UPDATE.
-- Requires idx_icons_best covering index on (host_id)
--   INCLUDE (icon_hash, content_type, width, height, file_size).
-- NOTE(review): the query also filters on scan_state, which is not among the
-- index columns listed above — presumably the index is partial on
-- scan_state = 'completed'; confirm, otherwise the scan cannot be index-only.
--
-- Hosts with no qualifying icon are left untouched: the UPDATE only visits
-- hosts present in the temp table, so an existing best_icon_hash is not cleared.
--
-- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql

-- Abort on the first SQL error so a failed step cannot be followed by stats
-- that look like a successful run.
\set ON_ERROR_STOP on

SET work_mem = '2GB';

\echo 'Step 1: Finding best icon per host...'
\timing on

-- Leftover from an aborted run in the same session; schema-qualified so a
-- permanent table that happens to share the name can never be dropped.
DROP TABLE IF EXISTS pg_temp.best_icons;

CREATE TEMP TABLE best_icons AS
SELECT DISTINCT ON (host_id)
    host_id,
    icon_hash
FROM icons
WHERE scan_state = 'completed'
  AND icon_hash IS NOT NULL
  AND content_type != 'image/svg+xml'
  AND (width IS NULL OR width > 2)
  AND (height IS NULL OR height > 2)
ORDER BY
    host_id,
    -- Tier: 0 = both dimensions ≥32, 1 = some dimension known, 2 = size unknown.
    -- Tier 1 tests the same GREATEST() value the next key sorts by, so
    -- width-only and height-only rows are classified alike.
    CASE
        WHEN LEAST(COALESCE(width, 0), COALESCE(height, 0)) >= 32 THEN 0
        WHEN GREATEST(COALESCE(width, 0), COALESCE(height, 0)) > 0 THEN 1
        ELSE 2
    END,
    -- Size within tier: ascending for tier 0 (closest to 32 wins),
    -- negated for the other tiers so the largest small icon wins.
    CASE
        WHEN LEAST(COALESCE(width, 0), COALESCE(height, 0)) >= 32
            THEN GREATEST(COALESCE(width, 0), COALESCE(height, 0))
        ELSE -GREATEST(COALESCE(width, 0), COALESCE(height, 0))
    END,
    -- Format preference; unrecognised content types sort last.
    CASE
        WHEN content_type = 'image/png' THEN 0
        WHEN content_type IN ('image/x-icon', 'image/vnd.microsoft.icon') THEN 1
        WHEN content_type IN ('image/gif', 'image/jpeg', 'image/bmp') THEN 2
        WHEN content_type = 'image/webp' THEN 3
        ELSE 4
    END,
    -- NULLS LAST is already the default for ASC; spelled out for clarity.
    file_size ASC NULLS LAST,
    -- Final key makes the DISTINCT ON winner deterministic across runs.
    icon_hash ASC;

-- Autovacuum never analyzes temp tables; give the planner row estimates
-- before the join below.
ANALYZE best_icons;

\echo 'Step 2: Updating hosts...'

-- Skip hosts whose stored hash is already correct to avoid rewriting
-- unchanged rows.
UPDATE hosts AS h
SET best_icon_hash = b.icon_hash
FROM best_icons AS b
WHERE h.id = b.host_id
  AND h.best_icon_hash IS DISTINCT FROM b.icon_hash;

\timing off

DROP TABLE pg_temp.best_icons;

-- Stats (human-readable)
\echo '--- Best Icon Selection Stats ---'
SELECT
    COUNT(*) FILTER (WHERE best_icon_hash IS NOT NULL) AS hosts_with_icon,
    COUNT(*) FILTER (WHERE best_icon_hash IS NULL) AS hosts_without_icon,
    COUNT(*) FILTER (WHERE html_title IS NOT NULL) AS hosts_with_title,
    COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_hash IS NULL) AS title_but_no_icon
FROM hosts
WHERE parsed = TRUE;

-- Stats JSON
\! mkdir -p stats
-- Tuples-only + unaligned output so the file holds just the JSON document.
-- The format is set explicitly rather than toggled with \a, so the result
-- does not depend on the session's prior state.
\t on
\pset format unaligned
\o stats/04_best_icon.json
SELECT json_build_object(
    'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_hash IS NOT NULL),
    'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_hash IS NULL),
    'hosts_with_title', COUNT(*) FILTER (WHERE html_title IS NOT NULL),
    'hosts_no_title', COUNT(*) FILTER (WHERE html_title IS NULL),
    'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_hash IS NULL)
)
FROM hosts
WHERE parsed = TRUE;
\o
\t off
\pset format aligned
\echo 'Stats written to stats/04_best_icon.json'