Added timestamps, WARC parser library, log files, progress bars, and frontend testing with real data to PLAN.md

This commit is contained in:
Joe Lothan 2026-05-17 14:16:56 -04:00
parent c50be97fd7
commit 64ae58494b
2 changed files with 58 additions and 13 deletions

View file

@@ -435,12 +435,18 @@ After bundle generation, these are merged into a single `stats.json` uploaded to
"generated_at": "2026-05-18T20:15:00Z",
"pipeline": {
"cc_index": {
"started_at": "2026-05-17T08:00:00Z",
"finished_at": "2026-05-17T08:42:00Z",
"duration_seconds": 2520,
"total_domains": 31245678,
"https": 28901234,
"http_only": 2344444,
"duplicates_removed": 1456789
},
"warc_parse": {
"started_at": "2026-05-17T08:45:00Z",
"finished_at": "2026-05-17T12:15:00Z",
"duration_seconds": 12600,
"processed": 31245678,
"titles_extracted": 29876543,
"icons_found": 45678901,
@@ -448,6 +454,9 @@ After bundle generation, these are merged into a single `stats.json` uploaded to
"parse_failures": 234567
},
"icon_download": {
"started_at": "2026-05-17T12:20:00Z",
"finished_at": "2026-05-18T18:30:00Z",
"duration_seconds": 108600,
"attempted": 45678901,
"completed": 38901234,
"failed_dns": 2345678,
@@ -459,10 +468,16 @@ After bundle generation, these are merged into a single `stats.json` uploaded to
"dedup_hits": 4333344
},
"best_icon": {
"started_at": "2026-05-18T18:35:00Z",
"finished_at": "2026-05-18T18:40:00Z",
"duration_seconds": 300,
"hosts_with_icon": 27654321,
"hosts_without_icon": 3591357
},
"bundles": {
"started_at": "2026-05-18T18:45:00Z",
"finished_at": "2026-05-18T20:10:00Z",
"duration_seconds": 5100,
"total_bundles": 52341,
"total_hosts_included": 29876543,
"hosts_with_icon": 27654321,