-- Best Icon Selection
-- Picks the best completed icon for each host and stores its s3_key in
-- hosts.best_icon_s3_key.
--
-- Candidates: icons with scan_state = 'completed', a non-NULL s3_key, a
-- content type other than image/svg+xml (SVGs are not supported in bundle
-- generation) and no known dimension of 2px or less. Rows whose content_type
-- is NULL are also dropped, because NULL NOT IN (...) is not true.
--
-- Priority:
--   1. Standard square sizes (64 > 48 > 32 > 16) — ideal for tab display
--   2. Other square sizes ≤64
--   3. Non-square sizes ≤64 on both axes
--   4. Anything larger (downloaded because rel_sizes was undeclared), plus
--      icons whose width or height is unknown
--   5. Within one of the classes above: wider first (unknown width counts as 0)
--   6. Then prefer PNG/GIF/ICO over WebP, and WebP over any other type
--   7. Tiebreaker: smaller file size (less bandwidth in bundles)
--   8. Final tiebreaker: s3_key, so the pick is reproducible between runs
--
-- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql

-- DISTINCT ON keeps the first row per host_id under the ORDER BY below, so the
-- sort keys after host_id are the whole ranking.
--
-- NOTE(review): hosts without any qualifying icon are not matched by this
-- statement and keep whatever best_icon_s3_key they already had — confirm that
-- stale keys cannot occur, or clear them in a separate step.
-- NOTE(review): inside class 4 the widest icon wins because of the width DESC
-- key — confirm that is intended for oversized icons.
UPDATE hosts h
SET best_icon_s3_key = best.s3_key
FROM (
    SELECT DISTINCT ON (i.host_id)
        i.host_id,
        i.s3_key
    FROM icons i
    WHERE i.scan_state = 'completed'
      AND i.s3_key IS NOT NULL
      AND i.content_type NOT IN ('image/svg+xml')
      AND (i.width IS NULL OR i.width > 2)
      AND (i.height IS NULL OR i.height > 2)
    ORDER BY
        i.host_id,
        -- Size class: 0 is best. NULL dimensions fail every comparison and
        -- therefore land in the ELSE branch.
        CASE
            WHEN i.width = i.height AND i.width IN (64, 48, 32, 16) THEN 0
            WHEN i.width = i.height AND i.width <= 64 THEN 1
            WHEN i.width IS NOT NULL AND i.width <= 64
             AND i.height IS NOT NULL AND i.height <= 64 THEN 2
            ELSE 3
        END,
        -- Wider first within a class; this is what orders 64 > 48 > 32 > 16.
        COALESCE(i.width, 0) DESC,
        -- Format preference.
        CASE
            WHEN i.content_type IN (
                'image/png',
                'image/gif',
                'image/x-icon',
                'image/vnd.microsoft.icon'
            ) THEN 0
            WHEN i.content_type = 'image/webp' THEN 1
            ELSE 2
        END,
        -- Smaller files first. NULLS LAST is already the default for ASC and
        -- is only spelled out for the reader.
        i.file_size ASC NULLS LAST,
        -- Without a last key, icons that tie on everything above would be
        -- picked arbitrarily and could change from run to run.
        i.s3_key ASC
) best
WHERE h.id = best.host_id
  -- Skip rows that already hold the chosen key, so a re-run does not rewrite
  -- every host.
  AND h.best_icon_s3_key IS DISTINCT FROM best.s3_key;

-- Stats (human-readable), restricted to hosts with parsed = TRUE.
\echo '--- Best Icon Selection Stats ---'
SELECT
    COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL) AS hosts_with_icon,
    COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL) AS hosts_without_icon,
    COUNT(*) FILTER (WHERE html_title IS NOT NULL) AS hosts_with_title,
    COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL) AS title_but_no_icon
FROM hosts
WHERE parsed = TRUE;

-- Stats JSON: the same counts plus hosts_no_title, written to a file under
-- ./stats (relative to the directory psql was started in).
\! mkdir -p stats
-- Tuples-only and unaligned output so the file contains just the JSON value.
-- The format is set explicitly instead of toggled with \a, which would switch
-- to aligned output if the session were already unaligned.
\t on
\pset format unaligned
\o stats/04_best_icon.json
SELECT json_build_object(
    'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL),
    'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL),
    'hosts_with_title', COUNT(*) FILTER (WHERE html_title IS NOT NULL),
    'hosts_no_title', COUNT(*) FILTER (WHERE html_title IS NULL),
    'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL)
)
FROM hosts
WHERE parsed = TRUE;
-- Send query output back to stdout and restore the usual table formatting.
\o
\t off
\pset format aligned
\echo 'Stats written to stats/04_best_icon.json'