everytab/pipeline/02_warc_parse/log.go
2026-05-17 20:25:59 -04:00

94 lines
2.3 KiB
Go

package main
import (
"encoding/json"
"fmt"
"os"
"sync"
"time"
)
// LogWriter appends log lines to a single file and can optionally be
// restricted to error lines only. Create one with NewLogWriter.
type LogWriter struct {
	// file is the destination that log lines are written to.
	file *os.File

	// mu is held by Write for the duration of each write to file.
	mu sync.Mutex

	// errorsOnly makes Write drop every line that is not flagged as an error.
	errorsOnly bool
}
// NewLogWriter opens the file at path for appending, creating it with mode
// 0644 if it does not exist, and returns a LogWriter that writes to it.
// When errorsOnly is true the returned writer drops non-error lines.
// The error from os.OpenFile is returned unchanged.
func NewLogWriter(path string, errorsOnly bool) (*LogWriter, error) {
	const openFlags = os.O_APPEND | os.O_CREATE | os.O_WRONLY

	logFile, openErr := os.OpenFile(path, openFlags, 0644)
	if openErr != nil {
		return nil, openErr
	}

	writer := &LogWriter{
		file:       logFile,
		errorsOnly: errorsOnly,
	}
	return writer, nil
}
// Write appends line plus a trailing newline to the log file. If the writer
// was created with errorsOnly set, lines with isError == false are dropped
// without touching the file. Writes are serialized through the writer's mutex.
func (lw *LogWriter) Write(line string, isError bool) {
	if !isError && lw.errorsOnly {
		return
	}

	lw.mu.Lock()
	defer lw.mu.Unlock()

	// The result is discarded: this method has no error return to report it.
	_, _ = fmt.Fprintln(lw.file, line)
}
// Close closes the underlying log file and returns the error, if any, from
// doing so. It does not acquire the writer's mutex.
func (lw *LogWriter) Close() error {
	closeErr := lw.file.Close()
	return closeErr
}
// formatLogLine creates a concise one-line log entry for a processed host.
//
// When result.Err is set the line has the form
//
//	parsed: <hostname> err:<fetch|parse> <error>
//
// where "fetch" is used if result.FetchErr is true and "parse" otherwise.
// On success the line has the form
//
//	parsed: <hostname> "<title>" icons:<n> iframe:<ok|no>
//
// The title is cut to at most 20 characters (followed by "...") and n is
// len(result.Icons) plus one for the implicit /favicon.ico.
func formatLogLine(host Host, result ProcessResult) string {
	if result.Err != nil {
		errType := "parse"
		if result.FetchErr {
			errType = "fetch"
		}
		return fmt.Sprintf("parsed: %s err:%s %v", host.Hostname, errType, result.Err)
	}

	const maxTitleRunes = 20

	title := result.Title
	// A string never has more runes than bytes, so the byte length is a cheap
	// pre-check that skips the scan for short titles.
	if len(title) > maxTitleRunes {
		// Ranging over a string yields the byte offset at which each rune
		// starts, so cutting at i never splits a multi-byte character the way
		// a fixed byte-offset slice would.
		seen := 0
		for i := range title {
			if seen == maxTitleRunes {
				title = title[:i] + "..."
				break
			}
			seen++
		}
	}

	iconCount := len(result.Icons) + 1 // +1 for /favicon.ico

	iframe := "iframe:ok"
	if !result.IframeAllowed {
		iframe = "iframe:no"
	}

	return fmt.Sprintf("parsed: %s \"%s\" icons:%d %s", host.Hostname, title, iconCount, iframe)
}
// writeStats writes the stage stats to stats/02_warc_parse.json, relative to
// the current working directory, creating the stats directory if needed.
//
// The file holds one indented JSON object with the start and finish times
// (RFC 3339), the elapsed whole seconds, and the current value of every
// counter in stats. The finish time is taken when this function is called.
//
// Failures to create the directory, create the file, encode the data or close
// the file are reported on standard output; the success message is printed
// only when all of those steps succeeded. cfg is not used by this function.
func writeStats(stats *Stats, cfg Config) {
	const (
		statsDir  = "stats"
		statsPath = statsDir + "/02_warc_parse.json"
	)

	finishedAt := time.Now()
	duration := finishedAt.Sub(stats.StartedAt)

	data := map[string]interface{}{
		"started_at":       stats.StartedAt.Format(time.RFC3339),
		"finished_at":      finishedAt.Format(time.RFC3339),
		"duration_seconds": int(duration.Seconds()),
		"processed":        stats.Processed.Load(),
		"titles_found":     stats.TitlesFound.Load(),
		"icons_found":      stats.IconsFound.Load(),
		"iframe_blocked":   stats.IframeBlocked.Load(),
		"fetch_errors":     stats.FetchErrors.Load(),
		"parse_errors":     stats.ParseErrors.Load(),
		"db_errors":        stats.DBErrors.Load(),
		"panics":           stats.Panics.Load(),
	}

	if err := os.MkdirAll(statsDir, 0755); err != nil {
		fmt.Printf("Failed to write stats: %v\n", err)
		return
	}

	f, err := os.Create(statsPath)
	if err != nil {
		fmt.Printf("Failed to write stats: %v\n", err)
		return
	}

	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	err = enc.Encode(data)

	// Always close the file, but report the encode error in preference to the
	// close error because it is the root cause when both occur.
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		fmt.Printf("Failed to write stats: %v\n", err)
		return
	}

	fmt.Printf("Stats written to %s\n", statsPath)
}