From ca90b7071e6e3971aef6fd941a45e1da0e370164 Mon Sep 17 00:00:00 2001 From: Joe Lothan Date: Mon, 25 May 2026 14:16:40 -0400 Subject: [PATCH] optimize db for bulk insert by turning off indexes and vacuum --- pipeline/02_warc_parse/main.go | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pipeline/02_warc_parse/main.go b/pipeline/02_warc_parse/main.go index c7f7906..3895b15 100644 --- a/pipeline/02_warc_parse/main.go +++ b/pipeline/02_warc_parse/main.go @@ -87,12 +87,17 @@ func main() { defer logWriter.Close() } - // Disable autovacuum during heavy writes — it competes for disk I/O - // and causes writer stalls. Re-enabled + manual VACUUM at end. + // Optimize DB for bulk loading: + // - Disable autovacuum (competes for disk I/O, causes writer stalls) + // - Drop icons indexes (index maintenance during 80M inserts is the main IOPS cost) + // Both are re-enabled/recreated at the end. if !cfg.DryRun { - fmt.Println("Disabling autovacuum on hosts and icons tables...") + fmt.Println("Preparing DB for bulk load...") pool.Exec(ctx, "ALTER TABLE hosts SET (autovacuum_enabled = false)") pool.Exec(ctx, "ALTER TABLE icons SET (autovacuum_enabled = false)") + pool.Exec(ctx, "DROP INDEX IF EXISTS idx_icons_unscanned") + pool.Exec(ctx, "DROP INDEX IF EXISTS idx_icons_host_id") + fmt.Println("Autovacuum disabled, icons indexes dropped") } stats := &Stats{StartedAt: time.Now()} @@ -237,14 +242,17 @@ func main() { } writerWg.Wait() - // Re-enable autovacuum and run manual vacuum + // Restore DB: recreate indexes, re-enable autovacuum, update planner stats if !cfg.DryRun { - fmt.Println("Re-enabling autovacuum and running VACUUM ANALYZE...") + fmt.Println("Recreating icons indexes...") + pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_icons_unscanned ON icons(id) WHERE scan_state = 'unscanned'") + pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_icons_host_id ON icons(host_id)") + fmt.Println("Indexes created. Running ANALYZE...") pool.Exec(ctx, "ALTER TABLE hosts SET (autovacuum_enabled = true)") pool.Exec(ctx, "ALTER TABLE icons SET (autovacuum_enabled = true)") - pool.Exec(ctx, "VACUUM ANALYZE hosts") - pool.Exec(ctx, "VACUUM ANALYZE icons") - fmt.Println("VACUUM complete") + pool.Exec(ctx, "ANALYZE hosts") + pool.Exec(ctx, "ANALYZE icons") + fmt.Println("ANALYZE complete") } // Print summary