improved write efficiency, though we are still bottlenecking on RDS - will switch to local Postgres for future runs

This commit is contained in:
Joe Lothan 2026-05-20 22:38:23 -04:00
parent baf657a8ed
commit 4fa40c7b47
3 changed files with 98 additions and 77 deletions

View file

@ -42,7 +42,7 @@ func main() {
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
flag.IntVar(&cfg.BatchSize, "batch-size", 5000, "Rows to fetch per batch")
flag.IntVar(&cfg.Concurrency, "concurrency", 500, "Number of concurrent goroutines")
flag.IntVar(&cfg.WriteBatch, "write-batch", 100, "Results to batch per DB write")
flag.IntVar(&cfg.WriteBatch, "write-batch", 1000, "Results to batch per DB write")
flag.IntVar(&cfg.Limit, "limit", 0, "Max rows to process (0 = all)")
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Print results without writing to DB")
flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file")
@ -68,26 +68,13 @@ func main() {
}
defer pool.Close()
// Get total count
var total int64
if cfg.Limit > 0 {
total = int64(cfg.Limit)
} else {
err = pool.QueryRow(ctx, "SELECT COUNT(*) FROM hosts WHERE parsed = FALSE").Scan(&total)
if err != nil {
log.Fatalf("Failed to count unparsed hosts: %v", err)
}
}
if total == 0 {
fmt.Println("No unparsed hosts found.")
return
}
fmt.Printf("=== WARC Parser ===\n")
fmt.Printf("Unparsed hosts: %d\n", total)
fmt.Printf("Concurrency: %d\n", cfg.Concurrency)
fmt.Printf("Batch size: %d\n", cfg.BatchSize)
fmt.Printf("Write batch: %d\n", cfg.WriteBatch)
if cfg.Limit > 0 {
fmt.Printf("Limit: %d\n", cfg.Limit)
}
fmt.Printf("Dry run: %v\n\n", cfg.DryRun)
// Setup log file
@ -100,6 +87,14 @@ func main() {
defer logWriter.Close()
}
// Disable autovacuum during heavy writes — it competes for disk I/O
// and causes writer stalls. Re-enabled + manual VACUUM at end.
if !cfg.DryRun {
fmt.Println("Disabling autovacuum on hosts and icons tables...")
pool.Exec(ctx, "ALTER TABLE hosts SET (autovacuum_enabled = false)")
pool.Exec(ctx, "ALTER TABLE icons SET (autovacuum_enabled = false)")
}
stats := &Stats{StartedAt: time.Now()}
// Three-stage pipeline:
@ -125,6 +120,7 @@ func main() {
limit = remaining
}
}
fetchStart := time.Now()
hosts, err := fetchBatch(ctx, pool, lastID, limit)
if err != nil {
log.Fatalf("Failed to fetch batch: %v", err)
@ -133,6 +129,8 @@ func main() {
break
}
lastID = hosts[len(hosts)-1].ID
fmt.Printf("[fetcher] %d hosts in %dms (hostCh: %d/%d)\n",
len(hosts), time.Since(fetchStart).Milliseconds(), len(hostCh), cap(hostCh))
for _, h := range hosts {
hostCh <- h
}
@ -200,25 +198,53 @@ func main() {
close(resultCh)
}()
// Stage 3: DB writer — batches writes for efficiency
// Stage 3: DB writers — multiple goroutines batch writes for efficiency
const numWriters = 3
var writerWg sync.WaitGroup
if !cfg.DryRun {
var buf []WorkResult
for wr := range resultCh {
buf = append(buf, wr)
if len(buf) >= cfg.WriteBatch {
dbErrs := flushResults(ctx, pool, buf, logWriter)
stats.DBErrors.Add(int64(dbErrs))
buf = buf[:0]
}
}
if len(buf) > 0 {
dbErrs := flushResults(ctx, pool, buf, logWriter)
stats.DBErrors.Add(int64(dbErrs))
for w := 0; w < numWriters; w++ {
writerWg.Add(1)
go func() {
defer writerWg.Done()
var buf []WorkResult
for wr := range resultCh {
buf = append(buf, wr)
if len(buf) >= cfg.WriteBatch {
flushStart := time.Now()
dbErrs := flushResults(ctx, pool, buf, logWriter)
stats.DBErrors.Add(int64(dbErrs))
flushMs := time.Since(flushStart).Milliseconds()
if flushMs > 100 {
fmt.Printf("[writer] flushed %d results in %dms (resultCh: %d/%d)\n",
len(buf), flushMs, len(resultCh), cap(resultCh))
}
buf = buf[:0]
}
}
if len(buf) > 0 {
dbErrs := flushResults(ctx, pool, buf, logWriter)
stats.DBErrors.Add(int64(dbErrs))
}
}()
}
} else {
for range resultCh {
// drain in dry-run mode
}
writerWg.Add(1)
go func() {
defer writerWg.Done()
for range resultCh {
}
}()
}
writerWg.Wait()
// Re-enable autovacuum and run manual vacuum
if !cfg.DryRun {
fmt.Println("Re-enabling autovacuum and running VACUUM ANALYZE...")
pool.Exec(ctx, "ALTER TABLE hosts SET (autovacuum_enabled = true)")
pool.Exec(ctx, "ALTER TABLE icons SET (autovacuum_enabled = true)")
pool.Exec(ctx, "VACUUM ANALYZE hosts")
pool.Exec(ctx, "VACUUM ANALYZE icons")
fmt.Println("VACUUM complete")
}
// Print summary