improved write efficiency, though we are still bottlenecking on RDS - will switch to local postgres for future runs
This commit is contained in:
parent
baf657a8ed
commit
4fa40c7b47
3 changed files with 98 additions and 77 deletions
|
|
@ -59,40 +59,22 @@ type WorkResult struct {
|
|||
Result ProcessResult
|
||||
}
|
||||
|
||||
// flushResults writes a batch of successful results to the database using pgx.Batch.
|
||||
// flushResults writes a batch of successful results to the database.
|
||||
// Host UPDATEs use pgx.Batch. Icon INSERTs use pgx.CopyFrom for bulk throughput.
|
||||
// Returns the number of DB errors encountered.
|
||||
func flushResults(ctx context.Context, pool *pgxpool.Pool, results []WorkResult, logWriter *LogWriter) int {
|
||||
batch := &pgx.Batch{}
|
||||
dbErrors := 0
|
||||
|
||||
// Queue all queries
|
||||
// 1. Batch UPDATE hosts (can't COPY an UPDATE)
|
||||
batch := &pgx.Batch{}
|
||||
for _, wr := range results {
|
||||
// Update host
|
||||
batch.Queue(
|
||||
`UPDATE hosts SET html_title = $1, iframe_allowed = $2, parsed = TRUE WHERE id = $3`,
|
||||
nilIfEmpty(wr.Result.Title), wr.Result.IframeAllowed, wr.Host.ID,
|
||||
)
|
||||
// Insert /favicon.ico
|
||||
faviconURL := fmt.Sprintf("%s://%s/favicon.ico", wr.Host.Protocol, wr.Host.Hostname)
|
||||
batch.Queue(
|
||||
`INSERT INTO icons (host_id, url, source) VALUES ($1, $2, 'favicon_ico')`,
|
||||
wr.Host.ID, faviconURL,
|
||||
)
|
||||
// Insert link_rel icons
|
||||
for _, icon := range wr.Result.Icons {
|
||||
batch.Queue(
|
||||
`INSERT INTO icons (host_id, url, source, rel_type, rel_sizes) VALUES ($1, $2, $3, $4, $5)`,
|
||||
wr.Host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Send all queries in one round-trip
|
||||
br := pool.SendBatch(ctx, batch)
|
||||
|
||||
// Check results
|
||||
dbErrors := 0
|
||||
for _, wr := range results {
|
||||
// host update
|
||||
if _, err := br.Exec(); err != nil {
|
||||
dbErrors++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s hosts_update: %v", wr.Host.Hostname, err)
|
||||
|
|
@ -101,29 +83,35 @@ func flushResults(ctx context.Context, pool *pgxpool.Pool, results []WorkResult,
|
|||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
// favicon.ico insert
|
||||
if _, err := br.Exec(); err != nil {
|
||||
dbErrors++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", wr.Host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
// link_rel icon inserts
|
||||
for range wr.Result.Icons {
|
||||
if _, err := br.Exec(); err != nil {
|
||||
dbErrors++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", wr.Host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
br.Close()
|
||||
|
||||
// 2. COPY icons in bulk (much less IOPS than individual INSERTs)
|
||||
var iconRows [][]any
|
||||
for _, wr := range results {
|
||||
// favicon.ico entry
|
||||
faviconURL := fmt.Sprintf("%s://%s/favicon.ico", wr.Host.Protocol, wr.Host.Hostname)
|
||||
iconRows = append(iconRows, []any{wr.Host.ID, faviconURL, "favicon_ico", nil, nil})
|
||||
// link_rel entries
|
||||
for _, icon := range wr.Result.Icons {
|
||||
iconRows = append(iconRows, []any{wr.Host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes)})
|
||||
}
|
||||
}
|
||||
|
||||
_, err := pool.CopyFrom(ctx,
|
||||
pgx.Identifier{"icons"},
|
||||
[]string{"host_id", "url", "source", "rel_type", "rel_sizes"},
|
||||
pgx.CopyFromRows(iconRows),
|
||||
)
|
||||
if err != nil {
|
||||
dbErrors++
|
||||
logLine := fmt.Sprintf("DB_ERROR: icon COPY failed (%d rows): %v", len(iconRows), err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
|
||||
br.Close()
|
||||
return dbErrors
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ func main() {
|
|||
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
|
||||
flag.IntVar(&cfg.BatchSize, "batch-size", 5000, "Rows to fetch per batch")
|
||||
flag.IntVar(&cfg.Concurrency, "concurrency", 500, "Number of concurrent goroutines")
|
||||
flag.IntVar(&cfg.WriteBatch, "write-batch", 100, "Results to batch per DB write")
|
||||
flag.IntVar(&cfg.WriteBatch, "write-batch", 1000, "Results to batch per DB write")
|
||||
flag.IntVar(&cfg.Limit, "limit", 0, "Max rows to process (0 = all)")
|
||||
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Print results without writing to DB")
|
||||
flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file")
|
||||
|
|
@ -68,26 +68,13 @@ func main() {
|
|||
}
|
||||
defer pool.Close()
|
||||
|
||||
// Get total count
|
||||
var total int64
|
||||
if cfg.Limit > 0 {
|
||||
total = int64(cfg.Limit)
|
||||
} else {
|
||||
err = pool.QueryRow(ctx, "SELECT COUNT(*) FROM hosts WHERE parsed = FALSE").Scan(&total)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to count unparsed hosts: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if total == 0 {
|
||||
fmt.Println("No unparsed hosts found.")
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Printf("=== WARC Parser ===\n")
|
||||
fmt.Printf("Unparsed hosts: %d\n", total)
|
||||
fmt.Printf("Concurrency: %d\n", cfg.Concurrency)
|
||||
fmt.Printf("Batch size: %d\n", cfg.BatchSize)
|
||||
fmt.Printf("Write batch: %d\n", cfg.WriteBatch)
|
||||
if cfg.Limit > 0 {
|
||||
fmt.Printf("Limit: %d\n", cfg.Limit)
|
||||
}
|
||||
fmt.Printf("Dry run: %v\n\n", cfg.DryRun)
|
||||
|
||||
// Setup log file
|
||||
|
|
@ -100,6 +87,14 @@ func main() {
|
|||
defer logWriter.Close()
|
||||
}
|
||||
|
||||
// Disable autovacuum during heavy writes — it competes for disk I/O
|
||||
// and causes writer stalls. Re-enabled + manual VACUUM at end.
|
||||
if !cfg.DryRun {
|
||||
fmt.Println("Disabling autovacuum on hosts and icons tables...")
|
||||
pool.Exec(ctx, "ALTER TABLE hosts SET (autovacuum_enabled = false)")
|
||||
pool.Exec(ctx, "ALTER TABLE icons SET (autovacuum_enabled = false)")
|
||||
}
|
||||
|
||||
stats := &Stats{StartedAt: time.Now()}
|
||||
|
||||
// Three-stage pipeline:
|
||||
|
|
@ -125,6 +120,7 @@ func main() {
|
|||
limit = remaining
|
||||
}
|
||||
}
|
||||
fetchStart := time.Now()
|
||||
hosts, err := fetchBatch(ctx, pool, lastID, limit)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to fetch batch: %v", err)
|
||||
|
|
@ -133,6 +129,8 @@ func main() {
|
|||
break
|
||||
}
|
||||
lastID = hosts[len(hosts)-1].ID
|
||||
fmt.Printf("[fetcher] %d hosts in %dms (hostCh: %d/%d)\n",
|
||||
len(hosts), time.Since(fetchStart).Milliseconds(), len(hostCh), cap(hostCh))
|
||||
for _, h := range hosts {
|
||||
hostCh <- h
|
||||
}
|
||||
|
|
@ -200,25 +198,53 @@ func main() {
|
|||
close(resultCh)
|
||||
}()
|
||||
|
||||
// Stage 3: DB writer — batches writes for efficiency
|
||||
// Stage 3: DB writers — multiple goroutines batch writes for efficiency
|
||||
const numWriters = 3
|
||||
var writerWg sync.WaitGroup
|
||||
if !cfg.DryRun {
|
||||
var buf []WorkResult
|
||||
for wr := range resultCh {
|
||||
buf = append(buf, wr)
|
||||
if len(buf) >= cfg.WriteBatch {
|
||||
dbErrs := flushResults(ctx, pool, buf, logWriter)
|
||||
stats.DBErrors.Add(int64(dbErrs))
|
||||
buf = buf[:0]
|
||||
}
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
dbErrs := flushResults(ctx, pool, buf, logWriter)
|
||||
stats.DBErrors.Add(int64(dbErrs))
|
||||
for w := 0; w < numWriters; w++ {
|
||||
writerWg.Add(1)
|
||||
go func() {
|
||||
defer writerWg.Done()
|
||||
var buf []WorkResult
|
||||
for wr := range resultCh {
|
||||
buf = append(buf, wr)
|
||||
if len(buf) >= cfg.WriteBatch {
|
||||
flushStart := time.Now()
|
||||
dbErrs := flushResults(ctx, pool, buf, logWriter)
|
||||
stats.DBErrors.Add(int64(dbErrs))
|
||||
flushMs := time.Since(flushStart).Milliseconds()
|
||||
if flushMs > 100 {
|
||||
fmt.Printf("[writer] flushed %d results in %dms (resultCh: %d/%d)\n",
|
||||
len(buf), flushMs, len(resultCh), cap(resultCh))
|
||||
}
|
||||
buf = buf[:0]
|
||||
}
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
dbErrs := flushResults(ctx, pool, buf, logWriter)
|
||||
stats.DBErrors.Add(int64(dbErrs))
|
||||
}
|
||||
}()
|
||||
}
|
||||
} else {
|
||||
for range resultCh {
|
||||
// drain in dry-run mode
|
||||
}
|
||||
writerWg.Add(1)
|
||||
go func() {
|
||||
defer writerWg.Done()
|
||||
for range resultCh {
|
||||
}
|
||||
}()
|
||||
}
|
||||
writerWg.Wait()
|
||||
|
||||
// Re-enable autovacuum and run manual vacuum
|
||||
if !cfg.DryRun {
|
||||
fmt.Println("Re-enabling autovacuum and running VACUUM ANALYZE...")
|
||||
pool.Exec(ctx, "ALTER TABLE hosts SET (autovacuum_enabled = true)")
|
||||
pool.Exec(ctx, "ALTER TABLE icons SET (autovacuum_enabled = true)")
|
||||
pool.Exec(ctx, "VACUUM ANALYZE hosts")
|
||||
pool.Exec(ctx, "VACUUM ANALYZE icons")
|
||||
fmt.Println("VACUUM complete")
|
||||
}
|
||||
|
||||
// Print summary
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue