Update WARC parsing to a new three-stage producer/worker/consumer model, increasing speed and saturating all cores
This commit is contained in:
parent
0efec72e45
commit
6d8ba61102
2 changed files with 169 additions and 125 deletions
|
|
@@ -4,6 +4,7 @@ import (
|
|||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
)
|
||||
|
||||
|
|
@@ -26,12 +27,6 @@ type ProcessResult struct {
|
|||
FetchErr bool // true if error was during fetch (vs parse)
|
||||
}
|
||||
|
||||
// WriteErrors tracks errors encountered during DB writes.
|
||||
type WriteErrors struct {
|
||||
HostUpdate int
|
||||
IconInsert int
|
||||
}
|
||||
|
||||
// fetchBatch gets the next batch of unparsed hosts after lastID.
|
||||
func fetchBatch(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit int) ([]Host, error) {
|
||||
rows, err := pool.Query(ctx,
|
||||
|
|
@@ -58,55 +53,78 @@ func fetchBatch(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit int
|
|||
return hosts, rows.Err()
|
||||
}
|
||||
|
||||
// writeResult writes parsed results back to the database.
|
||||
// Returns counts of DB write errors encountered.
|
||||
func writeResult(ctx context.Context, pool *pgxpool.Pool, host Host, result ProcessResult, logWriter *LogWriter) WriteErrors {
|
||||
var errs WriteErrors
|
||||
// WorkResult pairs a host with its parsed result for the DB writer.
|
||||
type WorkResult struct {
|
||||
Host Host
|
||||
Result ProcessResult
|
||||
}
|
||||
|
||||
// Update hosts table
|
||||
_, err := pool.Exec(ctx,
|
||||
`UPDATE hosts SET html_title = $1, iframe_allowed = $2, parsed = TRUE WHERE id = $3`,
|
||||
nilIfEmpty(result.Title), result.IframeAllowed, host.ID)
|
||||
if err != nil {
|
||||
errs.HostUpdate++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s hosts_update: %v", host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
return errs
|
||||
}
|
||||
// flushResults writes a batch of successful results to the database using pgx.Batch.
|
||||
// Returns the number of DB errors encountered.
|
||||
func flushResults(ctx context.Context, pool *pgxpool.Pool, results []WorkResult, logWriter *LogWriter) int {
|
||||
batch := &pgx.Batch{}
|
||||
|
||||
// Insert /favicon.ico entry
|
||||
faviconURL := fmt.Sprintf("%s://%s/favicon.ico", host.Protocol, host.Hostname)
|
||||
_, err = pool.Exec(ctx,
|
||||
`INSERT INTO icons (host_id, url, source) VALUES ($1, $2, 'favicon_ico')`,
|
||||
host.ID, faviconURL)
|
||||
if err != nil {
|
||||
errs.IconInsert++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
// Queue all queries
|
||||
for _, wr := range results {
|
||||
// Update host
|
||||
batch.Queue(
|
||||
`UPDATE hosts SET html_title = $1, iframe_allowed = $2, parsed = TRUE WHERE id = $3`,
|
||||
nilIfEmpty(wr.Result.Title), wr.Result.IframeAllowed, wr.Host.ID,
|
||||
)
|
||||
// Insert /favicon.ico
|
||||
faviconURL := fmt.Sprintf("%s://%s/favicon.ico", wr.Host.Protocol, wr.Host.Hostname)
|
||||
batch.Queue(
|
||||
`INSERT INTO icons (host_id, url, source) VALUES ($1, $2, 'favicon_ico')`,
|
||||
wr.Host.ID, faviconURL,
|
||||
)
|
||||
// Insert link_rel icons
|
||||
for _, icon := range wr.Result.Icons {
|
||||
batch.Queue(
|
||||
`INSERT INTO icons (host_id, url, source, rel_type, rel_sizes) VALUES ($1, $2, $3, $4, $5)`,
|
||||
wr.Host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Insert link rel="icon" entries
|
||||
for _, icon := range result.Icons {
|
||||
_, err = pool.Exec(ctx,
|
||||
`INSERT INTO icons (host_id, url, source, rel_type, rel_sizes) VALUES ($1, $2, $3, $4, $5)`,
|
||||
host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes))
|
||||
if err != nil {
|
||||
errs.IconInsert++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err)
|
||||
// Send all queries in one round-trip
|
||||
br := pool.SendBatch(ctx, batch)
|
||||
|
||||
// Check results
|
||||
dbErrors := 0
|
||||
for _, wr := range results {
|
||||
// host update
|
||||
if _, err := br.Exec(); err != nil {
|
||||
dbErrors++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s hosts_update: %v", wr.Host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
// favicon.ico insert
|
||||
if _, err := br.Exec(); err != nil {
|
||||
dbErrors++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", wr.Host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
// link_rel icon inserts
|
||||
for range wr.Result.Icons {
|
||||
if _, err := br.Exec(); err != nil {
|
||||
dbErrors++
|
||||
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", wr.Host.Hostname, err)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return errs
|
||||
br.Close()
|
||||
return dbErrors
|
||||
}
|
||||
|
||||
func nilIfEmpty(s string) *string {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue