improved write efficiency, though we are still bottlenecking on RDS - will switch to local postgres for future runs

This commit is contained in:
Joe Lothan 2026-05-20 22:38:23 -04:00
parent baf657a8ed
commit 4fa40c7b47
3 changed files with 98 additions and 77 deletions

View file

@ -59,40 +59,22 @@ type WorkResult struct {
Result ProcessResult
}
// flushResults writes a batch of successful results to the database using pgx.Batch.
// flushResults writes a batch of successful results to the database.
// Host UPDATEs use pgx.Batch. Icon INSERTs use pgx.CopyFrom for bulk throughput.
// Returns the number of DB errors encountered.
func flushResults(ctx context.Context, pool *pgxpool.Pool, results []WorkResult, logWriter *LogWriter) int {
batch := &pgx.Batch{}
dbErrors := 0
// Queue all queries
// 1. Batch UPDATE hosts (can't COPY an UPDATE)
batch := &pgx.Batch{}
for _, wr := range results {
// Update host
batch.Queue(
`UPDATE hosts SET html_title = $1, iframe_allowed = $2, parsed = TRUE WHERE id = $3`,
nilIfEmpty(wr.Result.Title), wr.Result.IframeAllowed, wr.Host.ID,
)
// Insert /favicon.ico
faviconURL := fmt.Sprintf("%s://%s/favicon.ico", wr.Host.Protocol, wr.Host.Hostname)
batch.Queue(
`INSERT INTO icons (host_id, url, source) VALUES ($1, $2, 'favicon_ico')`,
wr.Host.ID, faviconURL,
)
// Insert link_rel icons
for _, icon := range wr.Result.Icons {
batch.Queue(
`INSERT INTO icons (host_id, url, source, rel_type, rel_sizes) VALUES ($1, $2, $3, $4, $5)`,
wr.Host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes),
)
}
}
// Send all queries in one round-trip
br := pool.SendBatch(ctx, batch)
// Check results
dbErrors := 0
for _, wr := range results {
// host update
if _, err := br.Exec(); err != nil {
dbErrors++
logLine := fmt.Sprintf("DB_ERROR: %s hosts_update: %v", wr.Host.Hostname, err)
@ -101,29 +83,35 @@ func flushResults(ctx context.Context, pool *pgxpool.Pool, results []WorkResult,
logWriter.Write(logLine, true)
}
}
// favicon.ico insert
if _, err := br.Exec(); err != nil {
dbErrors++
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", wr.Host.Hostname, err)
fmt.Println(logLine)
if logWriter != nil {
logWriter.Write(logLine, true)
}
}
// link_rel icon inserts
for range wr.Result.Icons {
if _, err := br.Exec(); err != nil {
dbErrors++
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", wr.Host.Hostname, err)
fmt.Println(logLine)
if logWriter != nil {
logWriter.Write(logLine, true)
}
}
}
br.Close()
// 2. COPY icons in bulk (much less IOPS than individual INSERTs)
var iconRows [][]any
for _, wr := range results {
// favicon.ico entry
faviconURL := fmt.Sprintf("%s://%s/favicon.ico", wr.Host.Protocol, wr.Host.Hostname)
iconRows = append(iconRows, []any{wr.Host.ID, faviconURL, "favicon_ico", nil, nil})
// link_rel entries
for _, icon := range wr.Result.Icons {
iconRows = append(iconRows, []any{wr.Host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes)})
}
}
_, err := pool.CopyFrom(ctx,
pgx.Identifier{"icons"},
[]string{"host_id", "url", "source", "rel_type", "rel_sizes"},
pgx.CopyFromRows(iconRows),
)
if err != nil {
dbErrors++
logLine := fmt.Sprintf("DB_ERROR: icon COPY failed (%d rows): %v", len(iconRows), err)
fmt.Println(logLine)
if logWriter != nil {
logWriter.Write(logLine, true)
}
}
br.Close()
return dbErrors
}