From 664197e2878fcb26e91a6ad5d2c7a7ed435cc5fc Mon Sep 17 00:00:00 2001
From: Joe Lothan
Date: Sun, 17 May 2026 22:22:44 -0400
Subject: [PATCH] added select.sql query

---
 pipeline/04_best_icon/select.sql | 66 ++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 pipeline/04_best_icon/select.sql

diff --git a/pipeline/04_best_icon/select.sql b/pipeline/04_best_icon/select.sql
new file mode 100644
index 0000000..0c62493
--- /dev/null
+++ b/pipeline/04_best_icon/select.sql
@@ -0,0 +1,66 @@
+-- Best Icon Selection
+-- Picks the best completed icon for each host and stores its s3_key in hosts.best_icon_s3_key.
+--
+-- Candidates are icons with scan_state = 'completed' and a non-NULL s3_key.
+-- Hosts without any candidate are left untouched.
+-- NOTE(review): a key stored by an earlier run is therefore kept even if that
+-- icon is no longer a candidate — confirm this is intended.
+--
+-- Priority:
+--   1. Standard square sizes (64 > 48 > 32 > 16) — ideal for tab display
+--   2. Other square sizes ≤64
+--   3. Non-square sizes ≤64 on both axes
+--   4. Anything larger (downloaded because rel_sizes was undeclared),
+--      or with an unknown (NULL) width or height
+--   5. Within a size tier: wider first (this gives 64 > 48 > 32 > 16 in tier 1)
+--   6. Among equal priority: prefer PNG/GIF/ICO over WebP over SVG
+--   7. Tiebreaker: smaller file size (less bandwidth in bundles)
+--   8. Final tiebreaker: s3_key, so every run picks the same icon
+--
+-- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql
+
+UPDATE hosts h
+SET best_icon_s3_key = sub.s3_key
+FROM (
+    SELECT DISTINCT ON (i.host_id)
+        i.host_id,
+        i.s3_key
+    FROM icons i
+    WHERE i.scan_state = 'completed'
+      AND i.s3_key IS NOT NULL
+    ORDER BY
+        i.host_id,
+        -- Size tier, lower is better.
+        CASE
+            WHEN i.width = i.height AND i.width IN (64, 48, 32, 16) THEN 0
+            WHEN i.width = i.height AND i.width <= 64 THEN 1
+            WHEN i.width IS NOT NULL AND i.width <= 64 AND i.height IS NOT NULL AND i.height <= 64 THEN 2
+            ELSE 3
+        END,
+        -- Wider first within a tier; a NULL width is treated as 0.
+        COALESCE(i.width, 0) DESC,
+        -- Format preference, lower is better.
+        CASE
+            WHEN i.content_type IN ('image/png', 'image/gif', 'image/x-icon', 'image/vnd.microsoft.icon') THEN 0
+            WHEN i.content_type = 'image/webp' THEN 1
+            WHEN i.content_type = 'image/svg+xml' THEN 2
+            ELSE 3
+        END,
+        i.file_size ASC NULLS LAST,
+        -- Ties on every key above would otherwise be broken arbitrarily;
+        -- sorting by s3_key makes the pick repeatable.
+        i.s3_key ASC
+) sub
+WHERE h.id = sub.host_id
+  -- Skip hosts that already hold the chosen key, avoiding needless row rewrites.
+  AND h.best_icon_s3_key IS DISTINCT FROM sub.s3_key;
+
+-- Stats (parsed hosts only)
+\echo '--- Best Icon Selection Stats ---'
+
+SELECT
+    COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL) AS hosts_with_icon,
+    COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL) AS hosts_without_icon,
+    COUNT(*) FILTER (WHERE html_title IS NOT NULL) AS hosts_with_title,
+    COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL) AS title_but_no_icon
+FROM hosts
+WHERE parsed = TRUE;