-- everytab/pipeline/04_best_icon/select.sql
-- (77 lines, 2.8 KiB, SQL)

-- Best Icon Selection
-- Picks the best completed icon for each host and stores its s3_key in hosts.best_icon_s3_key.
--
-- Target: 32x32 source icon (displayed at 16x16 CSS, crisp on 2x Retina).
--
-- Priority:
-- 1. Icons ≥32px: prefer smallest first (closest to 32 — a 32x32 beats a 48x48 beats a 180x180)
-- 2. Icons <32px: prefer largest first (16x16 beats 8x8)
-- 3. Within same size: prefer PNG > ICO > GIF/JPEG/BMP > WebP
-- 4. Tiebreaker: smaller file size
-- SVGs excluded (not supported in bundle generation). Icons with a known width or height ≤2px excluded (tracking pixels).
--
-- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql
-- Choose one icon per host and store its s3_key in hosts.best_icon_s3_key.
--
-- Candidate filter: scan completed, s3_key present, content type not SVG,
-- and any known dimension greater than 2px. A NULL content_type also fails
-- the `!=` comparison, so such rows are not candidates.
--
-- DISTINCT ON keeps the first row per host_id under the ORDER BY below, so the
-- ordering must be total for the result to be reproducible. After file size,
-- s3_key is used as a final tiebreaker so reruns always pick the same icon.
--
-- Hosts with no candidate icon are not matched by the join and keep whatever
-- best_icon_s3_key they already had.
UPDATE hosts h SET best_icon_s3_key = sub.s3_key
FROM (
    SELECT DISTINCT ON (i.host_id) i.host_id, i.s3_key
    FROM icons i
    WHERE i.scan_state = 'completed'
      AND i.s3_key IS NOT NULL
      AND i.content_type != 'image/svg+xml'
      AND (i.width IS NULL OR i.width > 2)
      AND (i.height IS NULL OR i.height > 2)
    ORDER BY i.host_id,
        -- Tier 0: both dimensions ≥32. Tier 1: width known but not tier 0.
        -- Tier 2: width unknown (NULL or 0).
        CASE
            WHEN LEAST(COALESCE(i.width, 0), COALESCE(i.height, 0)) >= 32 THEN 0
            WHEN COALESCE(i.width, 0) > 0 THEN 1
            ELSE 2
        END,
        -- Within tier 0: smallest larger-side first (closest to 32).
        -- Otherwise: largest larger-side first (negated so ASC sorts it first).
        CASE
            WHEN LEAST(COALESCE(i.width, 0), COALESCE(i.height, 0)) >= 32
                THEN GREATEST(COALESCE(i.width, 0), COALESCE(i.height, 0))
            ELSE -GREATEST(COALESCE(i.width, 0), COALESCE(i.height, 0))
        END,
        -- Format preference: PNG, then ICO, then GIF/JPEG/BMP, then WebP, then anything else.
        CASE
            WHEN i.content_type = 'image/png' THEN 0
            WHEN i.content_type IN ('image/x-icon', 'image/vnd.microsoft.icon') THEN 1
            WHEN i.content_type IN ('image/gif', 'image/jpeg', 'image/bmp') THEN 2
            WHEN i.content_type = 'image/webp' THEN 3
            ELSE 4
        END,
        -- Smaller files first; rows with unknown size sort after known sizes.
        i.file_size ASC NULLS LAST,
        -- Final tiebreaker: makes the pick deterministic when all else is equal.
        i.s3_key ASC
) sub
WHERE h.id = sub.host_id
  -- Skip hosts whose stored key is already the selected one (NULL-safe compare).
  AND h.best_icon_s3_key IS DISTINCT FROM sub.s3_key;
-- Stats (human-readable)
-- One summary row over parsed hosts, printed to the terminal.
-- COUNT(col) counts only non-NULL values, so it stands in for
-- "COUNT(*) FILTER (WHERE col IS NOT NULL)".
\echo '--- Best Icon Selection Stats ---'
SELECT
    COUNT(best_icon_s3_key)                                     AS hosts_with_icon,
    COUNT(*) - COUNT(best_icon_s3_key)                          AS hosts_without_icon,
    COUNT(html_title)                                           AS hosts_with_title,
    COUNT(html_title) FILTER (WHERE best_icon_s3_key IS NULL)   AS title_but_no_icon
FROM hosts
WHERE parsed;
-- Stats JSON
-- Writes the same counts over parsed hosts as a single JSON object to
-- stats/04_best_icon.json (path is relative to the directory psql runs in).
--
-- Output format is set explicitly with \pset rather than toggled with \a:
-- \a flips whatever the current format is, so a session already in unaligned
-- mode (psql -A, or a .psqlrc setting) would be switched to aligned for the
-- file. Tuples-only mode drops the column header and row-count footer.
\! mkdir -p stats
\t on
\pset format unaligned
\o stats/04_best_icon.json
SELECT json_build_object(
    'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL),
    'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL),
    'hosts_with_title', COUNT(*) FILTER (WHERE html_title IS NOT NULL),
    'hosts_no_title', COUNT(*) FILTER (WHERE html_title IS NULL),
    'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL)
)
FROM hosts
WHERE parsed = TRUE;
-- Stop redirecting query output to the file, then restore normal table display.
\o
\t off
\pset format aligned
\echo 'Stats written to stats/04_best_icon.json'