Order hosts by icon download time to improve EBS read performance

This commit is contained in:
Joe Lothan 2026-05-26 23:10:53 -04:00
parent df2eaa251c
commit a799c05f81
4 changed files with 28 additions and 19 deletions

View file

@ -13,6 +13,7 @@ CREATE TABLE IF NOT EXISTS hosts (
html_title TEXT,
iframe_allowed BOOLEAN,
best_icon_hash TEXT,
icon_downloaded_at TIMESTAMPTZ,
parsed BOOLEAN DEFAULT FALSE,
random_order DOUBLE PRECISION DEFAULT random()
);

View file

@ -21,7 +21,7 @@ SET work_mem = '2GB';
\timing on
CREATE TEMP TABLE best_icons AS
SELECT DISTINCT ON (host_id) host_id, icon_hash
SELECT DISTINCT ON (host_id) host_id, icon_hash, downloaded_at
FROM icons
WHERE scan_state = 'completed'
AND icon_hash IS NOT NULL
@ -50,7 +50,7 @@ ORDER BY host_id,
\echo 'Step 2: Updating hosts...'
UPDATE hosts h SET best_icon_hash = b.icon_hash
UPDATE hosts h SET best_icon_hash = b.icon_hash, icon_downloaded_at = b.downloaded_at
FROM best_icons b WHERE h.id = b.host_id;
\timing off

View file

@ -2,30 +2,35 @@ package main
import (
"context"
"time"
"github.com/jackc/pgx/v5/pgxpool"
)
// HostRow holds the columns of one hosts row as selected and scanned by fetchHostsPage.
// NOTE(review): most fields appear twice below; this looks like the removed and added
// sides of the diff hunk rendered without +/- markers rather than a real duplicate
// declaration — confirm against the actual file.
type HostRow struct {
ID int64
Hostname string
Protocol string
HtmlTitle string
IframeAllowed bool
BestIconHash string
RandomOrder float64
ID int64
Hostname string
Protocol string
HtmlTitle string
IframeAllowed bool
BestIconHash string
RandomOrder float64
// Scanned from hosts.icon_downloaded_at; a pointer, presumably so a NULL column
// scans as nil — confirm.
IconDownloadedAt *time.Time
}
// fetchHostsPage gets a page of hosts with titles, ordered by id for disk locality.
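// fetchHostsPage gets a page of hosts with titles, keyset-paginated on (icon_downloaded_at, id) for disk locality.
// Icons written to disk at similar times are physically adjacent — reading in write order
// maximizes OS readahead cache hits. Hosts without icons are meant to sort last (no disk reads needed).
// NOTE(review): in PostgreSQL the row comparison `(icon_downloaded_at, id) > ($1, $2)` yields NULL when
// either timestamp is NULL, so a nil lastDownloaded (the first page) and hosts with a NULL
// icon_downloaded_at appear to match no rows — confirm, and handle the NULL cursor explicitly if so.
// random_order is included for bundle bucket assignment (randomized bundles).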
func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit int) ([]HostRow, error) {
func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastDownloaded *time.Time, lastID int64, limit int) ([]HostRow, error) {
rows, err := pool.Query(ctx, `
SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_hash, ''), random_order
SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_hash, ''), random_order, icon_downloaded_at
FROM hosts
WHERE html_title IS NOT NULL AND id > $1
ORDER BY id
LIMIT $2
`, lastID, limit)
WHERE html_title IS NOT NULL
AND (icon_downloaded_at, id) > ($1, $2)
ORDER BY icon_downloaded_at NULLS LAST, id
LIMIT $3
`, lastDownloaded, lastID, limit)
if err != nil {
return nil, err
}
@ -34,7 +39,7 @@ func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit
var hosts []HostRow
for rows.Next() {
var h HostRow
if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconHash, &h.RandomOrder); err != nil {
if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconHash, &h.RandomOrder, &h.IconDownloadedAt); err != nil {
return nil, err
}
hosts = append(hosts, h)

View file

@ -135,6 +135,7 @@ func main() {
// Stage 1: DB fetcher — continuously fetches pages into hostCh
go func() {
defer close(hostCh)
var lastDownloaded *time.Time
var lastID int64
pageSize := 50000
fetched := 0
@ -150,14 +151,16 @@ func main() {
}
}
fetchStart := time.Now()
hosts, err := fetchHostsPage(ctx, pool, lastID, limit)
hosts, err := fetchHostsPage(ctx, pool, lastDownloaded, lastID, limit)
if err != nil {
log.Fatalf("Failed to fetch hosts: %v", err)
}
if len(hosts) == 0 {
break
}
lastID = hosts[len(hosts)-1].ID
last := hosts[len(hosts)-1]
lastDownloaded = last.IconDownloadedAt
lastID = last.ID
fmt.Printf("[fetcher] %d hosts in %dms (hostCh: %d/%d)\n",
len(hosts), time.Since(fetchStart).Milliseconds(), len(hostCh), cap(hostCh))
for _, h := range hosts {