Rename s3_key to icon_hash in the schema, Go code, and SQL scripts

This commit is contained in:
Joe Lothan 2026-05-25 21:05:26 -04:00
parent e308718eb2
commit 33bd0a221e
8 changed files with 31 additions and 31 deletions

View file

@ -12,7 +12,7 @@ CREATE TABLE IF NOT EXISTS hosts (
warc_record_length INT NOT NULL,
html_title TEXT,
iframe_allowed BOOLEAN,
best_icon_s3_key TEXT,
best_icon_hash TEXT,
parsed BOOLEAN DEFAULT FALSE,
random_order DOUBLE PRECISION DEFAULT random()
);
@ -28,7 +28,7 @@ CREATE TABLE IF NOT EXISTS icons (
width INT,
height INT,
file_size INT,
s3_key TEXT,
icon_hash TEXT,
scan_state TEXT DEFAULT 'unscanned',
error TEXT,
downloaded_at TIMESTAMPTZ

View file

@ -43,7 +43,7 @@ func claimBatch(ctx context.Context, pool *pgxpool.Pool, limit int) ([]IconRow,
// DownloadResult holds the outcome of downloading one icon.
type DownloadResult struct {
S3Key string
IconHash string
ContentType string
Width int
Height int
@ -65,14 +65,14 @@ func updateIcon(ctx context.Context, pool *pgxpool.Pool, iconID int64, result Do
_, err := pool.Exec(ctx, `
UPDATE icons SET
scan_state = 'completed',
s3_key = $1,
icon_hash = $1,
content_type = $2,
width = $3,
height = $4,
file_size = $5,
downloaded_at = now()
WHERE id = $6`,
result.S3Key, result.ContentType,
result.IconHash, result.ContentType,
nilIntIf(result.Width, 0), nilIntIf(result.Height, 0),
result.FileSize, iconID)
return err

View file

@ -37,22 +37,22 @@ func processIcon(icon IconRow, cfg Config) DownloadResult {
// Compute SHA-256 for content-addressed storage
hash := sha256.Sum256(data)
s3Key := hex.EncodeToString(hash[:])
iconHash := hex.EncodeToString(hash[:])
// Write to disk (skip if already exists — dedup)
dedup := false
if !cfg.DryRun {
if iconExists(s3Key) {
if iconExists(iconHash) {
dedup = true
} else {
if err := iconWrite(s3Key, data); err != nil {
if err := iconWrite(iconHash, data); err != nil {
return DownloadResult{Err: fmt.Sprintf("disk write: %v", err), ErrType: "other"}
}
}
}
return DownloadResult{
S3Key: s3Key,
IconHash: iconHash,
ContentType: contentType,
Width: width,
Height: height,

View file

@ -1,5 +1,5 @@
-- Best Icon Selection
-- Picks the best completed icon for each host and stores its s3_key in hosts.best_icon_s3_key.
-- Picks the best completed icon for each host and stores its icon_hash in hosts.best_icon_hash.
--
-- Target: 32x32 source icon (displayed at 16x16 CSS, crisp on 2x Retina).
--
@ -11,7 +11,7 @@
-- SVGs excluded (not supported in bundle generation). Icons ≤2x2 excluded (tracking pixels).
--
-- Two-step: SELECT into temp table (index-only scan, no heap), then single bulk UPDATE.
-- Requires idx_icons_best covering index on (host_id) INCLUDE (s3_key, content_type, width, height, file_size).
-- Requires idx_icons_best covering index on (host_id) INCLUDE (icon_hash, content_type, width, height, file_size).
--
-- Usage: psql $DATABASE_URL -f pipeline/04_best_icon/select.sql
@ -21,10 +21,10 @@ SET work_mem = '2GB';
\timing on
CREATE TEMP TABLE best_icons AS
SELECT DISTINCT ON (host_id) host_id, s3_key
SELECT DISTINCT ON (host_id) host_id, icon_hash
FROM icons
WHERE scan_state = 'completed'
AND s3_key IS NOT NULL
AND icon_hash IS NOT NULL
AND content_type != 'image/svg+xml'
AND (width IS NULL OR width > 2)
AND (height IS NULL OR height > 2)
@ -50,7 +50,7 @@ ORDER BY host_id,
\echo 'Step 2: Updating hosts...'
UPDATE hosts h SET best_icon_s3_key = b.s3_key
UPDATE hosts h SET best_icon_hash = b.icon_hash
FROM best_icons b WHERE h.id = b.host_id;
\timing off
@ -61,10 +61,10 @@ DROP TABLE best_icons;
\echo '--- Best Icon Selection Stats ---'
SELECT
COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL) AS hosts_with_icon,
COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL) AS hosts_without_icon,
COUNT(*) FILTER (WHERE best_icon_hash IS NOT NULL) AS hosts_with_icon,
COUNT(*) FILTER (WHERE best_icon_hash IS NULL) AS hosts_without_icon,
COUNT(*) FILTER (WHERE html_title IS NOT NULL) AS hosts_with_title,
COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL) AS title_but_no_icon
COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_hash IS NULL) AS title_but_no_icon
FROM hosts
WHERE parsed = TRUE;
@ -74,11 +74,11 @@ WHERE parsed = TRUE;
\a
\o stats/04_best_icon.json
SELECT json_build_object(
'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL),
'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL),
'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_hash IS NOT NULL),
'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_hash IS NULL),
'hosts_with_title', COUNT(*) FILTER (WHERE html_title IS NOT NULL),
'hosts_no_title', COUNT(*) FILTER (WHERE html_title IS NULL),
'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL)
'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_hash IS NULL)
)
FROM hosts
WHERE parsed = TRUE;

View file

@ -31,11 +31,11 @@ func buildEntry(host HostRow, iconsDir string, logWriter *LogWriter, stats *Stat
IframeOk: host.IframeAllowed,
}
if host.BestIconS3Key == "" {
if host.BestIconHash == "" {
return entry
}
encoded, w, h, convertErr := safeConvert(host.BestIconS3Key, iconsDir)
encoded, w, h, convertErr := safeConvert(host.BestIconHash, iconsDir)
if convertErr != "" {
stats.ConvertErrors.Add(1)
logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr)

View file

@ -12,14 +12,14 @@ type HostRow struct {
Protocol string
HtmlTitle string
IframeAllowed bool
BestIconS3Key string
BestIconHash string
RandomOrder float64
}
// fetchHostsPage gets a page of hosts with titles, ordered by random_order for shuffled bundles.
func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64, limit int) ([]HostRow, error) {
rows, err := pool.Query(ctx, `
SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_s3_key, ''), random_order
SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_hash, ''), random_order
FROM hosts
WHERE html_title IS NOT NULL AND random_order > $1
ORDER BY random_order
@ -33,7 +33,7 @@ func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64,
var hosts []HostRow
for rows.Next() {
var h HostRow
if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconS3Key, &h.RandomOrder); err != nil {
if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconHash, &h.RandomOrder); err != nil {
return nil, err
}
hosts = append(hosts, h)

View file

@ -93,7 +93,7 @@ func main() {
if err != nil {
log.Fatalf("Failed to count hosts: %v", err)
}
err = pool.QueryRow(ctx, `SELECT COUNT(*) FROM hosts WHERE html_title IS NOT NULL AND best_icon_s3_key IS NOT NULL`).Scan(&hostsWithIcon)
err = pool.QueryRow(ctx, `SELECT COUNT(*) FROM hosts WHERE html_title IS NOT NULL AND best_icon_hash IS NOT NULL`).Scan(&hostsWithIcon)
if err != nil {
log.Fatalf("Failed to count icons: %v", err)
}