diff --git a/pipeline/02_warc_parse/log.go b/pipeline/02_warc_parse/log.go index 9c96a5e..1a47928 100644 --- a/pipeline/02_warc_parse/log.go +++ b/pipeline/02_warc_parse/log.go @@ -71,6 +71,7 @@ func writeStats(stats *Stats, cfg Config) { "duration_seconds": int(duration.Seconds()), "processed": stats.Processed.Load(), "titles_found": stats.TitlesFound.Load(), + "no_title": stats.NoTitle.Load(), "icons_found": stats.IconsFound.Load(), "iframe_blocked": stats.IframeBlocked.Load(), "fetch_errors": stats.FetchErrors.Load(), diff --git a/pipeline/02_warc_parse/main.go b/pipeline/02_warc_parse/main.go index 3300147..ec4f530 100644 --- a/pipeline/02_warc_parse/main.go +++ b/pipeline/02_warc_parse/main.go @@ -26,6 +26,7 @@ type Config struct { type Stats struct { Processed atomic.Int64 TitlesFound atomic.Int64 + NoTitle atomic.Int64 IconsFound atomic.Int64 IframeBlocked atomic.Int64 ParseErrors atomic.Int64 @@ -167,12 +168,17 @@ func main() { // Update stats stats.Processed.Add(1) - if result.Title != "" { - stats.TitlesFound.Add(1) - } - stats.IconsFound.Add(int64(len(result.Icons))) - if result.Err == nil && !result.IframeAllowed { - stats.IframeBlocked.Add(1) + if result.Err == nil { + if result.Title != "" { + stats.TitlesFound.Add(1) + } else { + stats.NoTitle.Add(1) + } + // +1 for the /favicon.ico entry added per host + stats.IconsFound.Add(int64(len(result.Icons) + 1)) + if !result.IframeAllowed { + stats.IframeBlocked.Add(1) + } } if result.Err != nil { if result.FetchErr { @@ -195,6 +201,7 @@ func main() { fmt.Printf("Duration: %s\n", duration.Round(time.Second)) fmt.Printf("Processed: %d\n", stats.Processed.Load()) fmt.Printf("Titles found: %d\n", stats.TitlesFound.Load()) + fmt.Printf("No title: %d\n", stats.NoTitle.Load()) fmt.Printf("Icons found: %d\n", stats.IconsFound.Load()) fmt.Printf("Iframe blocked: %d\n", stats.IframeBlocked.Load()) fmt.Printf("Fetch errors: %d\n", stats.FetchErrors.Load()) diff --git a/pipeline/04_best_icon/select.sql b/pipeline/04_best_icon/select.sql index aa723f8..27ae6b1 100644 --- a/pipeline/04_best_icon/select.sql +++ b/pipeline/04_best_icon/select.sql @@ -37,7 +37,7 @@ FROM ( ) sub WHERE h.id = sub.host_id; --- Stats +-- Stats (human-readable) \echo '--- Best Icon Selection Stats ---' SELECT @@ -47,3 +47,22 @@ SELECT COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL) AS title_but_no_icon FROM hosts WHERE parsed = TRUE; + +-- Stats JSON +\! mkdir -p stats +\t on +\a +\o stats/04_best_icon.json +SELECT json_build_object( + 'hosts_with_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NOT NULL), + 'hosts_without_icon', COUNT(*) FILTER (WHERE best_icon_s3_key IS NULL), + 'hosts_with_title', COUNT(*) FILTER (WHERE html_title IS NOT NULL), + 'hosts_no_title', COUNT(*) FILTER (WHERE html_title IS NULL), + 'title_but_no_icon', COUNT(*) FILTER (WHERE html_title IS NOT NULL AND best_icon_s3_key IS NULL) +) +FROM hosts +WHERE parsed = TRUE; +\o +\t off +\a +\echo 'Stats written to stats/04_best_icon.json' diff --git a/pipeline/05_bundle_gen/log.go b/pipeline/05_bundle_gen/log.go index d555a57..fd40a8c 100644 --- a/pipeline/05_bundle_gen/log.go +++ b/pipeline/05_bundle_gen/log.go @@ -43,12 +43,14 @@ func writeStats(stats *Stats) { "started_at": stats.StartedAt.Format(time.RFC3339), "finished_at": finishedAt.Format(time.RFC3339), "duration_seconds": int(duration.Seconds()), - "total_hosts": stats.TotalHosts, - "hosts_with_icon": stats.HostsWithIcon, - "hosts_no_icon": stats.HostsNoIcon, - "convert_errors": stats.ConvertErrors.Load(), - "bundles_created": stats.BundlesCreated, - "total_bytes": stats.TotalBytes, + "total_hosts": stats.TotalHosts, + "hosts_with_icon": stats.HostsWithIcon, + "hosts_no_icon": stats.HostsNoIcon, + "bundled_with_icon": stats.BundledWithIcon.Load(), + "bundled_no_icon": stats.BundledNoIcon.Load(), + "convert_errors": stats.ConvertErrors.Load(), + "bundles_created": stats.BundlesCreated, + "total_bytes": stats.TotalBytes, } os.MkdirAll("stats", 0755) diff --git a/pipeline/05_bundle_gen/main.go b/pipeline/05_bundle_gen/main.go index 08fa0cb..8198eb7 100644 --- a/pipeline/05_bundle_gen/main.go +++ b/pipeline/05_bundle_gen/main.go @@ -27,13 +27,15 @@ type Config struct { } type Stats struct { - TotalHosts int - HostsWithIcon int - HostsNoIcon int - BundlesCreated int - ConvertErrors atomic.Int64 - TotalBytes int64 - StartedAt time.Time + TotalHosts int + HostsWithIcon int + HostsNoIcon int + BundlesCreated int + ConvertErrors atomic.Int64 + BundledWithIcon atomic.Int64 + BundledNoIcon atomic.Int64 + TotalBytes int64 + StartedAt time.Time } func main() { @@ -158,6 +160,14 @@ func main() { } wg.Wait() + for _, e := range pageEntries { + if e.Icon != "" { + stats.BundledWithIcon.Add(1) + } else { + stats.BundledNoIcon.Add(1) + } + } + entryBuf = append(entryBuf, pageEntries...) // Write complete bundles from the buffer @@ -226,6 +236,8 @@ func main() { fmt.Printf("Total hosts: %d\n", stats.TotalHosts) fmt.Printf("Hosts with icon: %d\n", stats.HostsWithIcon) fmt.Printf("Hosts without icon: %d\n", stats.HostsNoIcon) + fmt.Printf("Bundled with icon: %d\n", stats.BundledWithIcon.Load()) + fmt.Printf("Bundled without icon: %d\n", stats.BundledNoIcon.Load()) fmt.Printf("Convert errors: %d\n", stats.ConvertErrors.Load()) fmt.Printf("Bundles created: %d\n", stats.BundlesCreated) fmt.Printf("Total size: %.1f MB\n", float64(stats.TotalBytes)/(1024*1024))