From a30fe104a3a734feff1664cc0e30518a5cb1db1d Mon Sep 17 00:00:00 2001 From: Joe Lothan Date: Tue, 26 May 2026 00:43:11 -0400 Subject: [PATCH] sort by host id to test iops cheese to improve icon read performance --- pipeline/05_bundle_gen/db.go | 27 ++++++++++++++------------- pipeline/05_bundle_gen/main.go | 6 +++--- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pipeline/05_bundle_gen/db.go b/pipeline/05_bundle_gen/db.go index 7328203..fa7ba3b 100644 --- a/pipeline/05_bundle_gen/db.go +++ b/pipeline/05_bundle_gen/db.go @@ -7,24 +7,25 @@ import ( ) type HostRow struct { - ID int64 - Hostname string - Protocol string - HtmlTitle string - IframeAllowed bool + ID int64 + Hostname string + Protocol string + HtmlTitle string + IframeAllowed bool BestIconHash string - RandomOrder float64 } -// fetchHostsPage gets a page of hosts with titles, ordered by random_order for shuffled bundles. -func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64, limit int) ([]HostRow, error) { +// fetchHostsPage gets a page of hosts with titles, ordered by id for disk locality. +// Icons were downloaded roughly in host-ID order, so reading by ID approximates +// the physical write order on disk — which is expected to improve EBS readahead cache hits.
+func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit int) ([]HostRow, error) { rows, err := pool.Query(ctx, ` - SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_hash, ''), random_order + SELECT id, hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_hash, '') FROM hosts - WHERE html_title IS NOT NULL AND random_order > $1 - ORDER BY random_order + WHERE html_title IS NOT NULL AND id > $1 + ORDER BY id LIMIT $2 - `, lastRandom, limit) + `, lastID, limit) if err != nil { return nil, err } @@ -33,7 +34,7 @@ func fetchHostsPage(ctx context.Context, pool *pgxpool.Pool, lastRandom float64, var hosts []HostRow for rows.Next() { var h HostRow - if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconHash, &h.RandomOrder); err != nil { + if err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconHash); err != nil { return nil, err } hosts = append(hosts, h) diff --git a/pipeline/05_bundle_gen/main.go b/pipeline/05_bundle_gen/main.go index ac76140..bd2ad8b 100644 --- a/pipeline/05_bundle_gen/main.go +++ b/pipeline/05_bundle_gen/main.go @@ -128,7 +128,7 @@ func main() { // Stage 1: DB fetcher — continuously fetches pages into hostCh go func() { defer close(hostCh) - var lastRandom float64 = -1 + var lastID int64 pageSize := 50000 fetched := 0 for { @@ -143,14 +143,14 @@ func main() { } } fetchStart := time.Now() - hosts, err := fetchHostsPage(ctx, pool, lastRandom, limit) + hosts, err := fetchHostsPage(ctx, pool, lastID, limit) if err != nil { log.Fatalf("Failed to fetch hosts: %v", err) } if len(hosts) == 0 { break } - lastRandom = hosts[len(hosts)-1].RandomOrder + lastID = hosts[len(hosts)-1].ID fmt.Printf("[fetcher] %d hosts in %dms (hostCh: %d/%d)\n", len(hosts), time.Since(fetchStart).Milliseconds(), len(hostCh), cap(hostCh)) for _, h := range hosts {