added warc parser
This commit is contained in:
parent
db81015e0b
commit
f45e4a6034
8 changed files with 954 additions and 0 deletions
94
pipeline/02_warc_parse/log.go
Normal file
94
pipeline/02_warc_parse/log.go
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// LogWriter appends log lines to a single file and can optionally be
// restricted to error lines only. It contains a sync.Mutex, so it must be
// used through a pointer and never copied.
type LogWriter struct {
	// errorsOnly, when true, makes Write drop lines not flagged as errors.
	errorsOnly bool

	// mu serializes writes to file.
	mu   sync.Mutex
	file *os.File
}
|
||||
|
||||
func NewLogWriter(path string, errorsOnly bool) (*LogWriter, error) {
|
||||
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &LogWriter{file: f, errorsOnly: errorsOnly}, nil
|
||||
}
|
||||
|
||||
func (lw *LogWriter) Write(line string, isError bool) {
|
||||
if lw.errorsOnly && !isError {
|
||||
return
|
||||
}
|
||||
lw.mu.Lock()
|
||||
defer lw.mu.Unlock()
|
||||
fmt.Fprintln(lw.file, line)
|
||||
}
|
||||
|
||||
func (lw *LogWriter) Close() error {
|
||||
return lw.file.Close()
|
||||
}
|
||||
|
||||
// formatLogLine creates a concise one-line log for a processed host.
|
||||
func formatLogLine(host Host, result ProcessResult) string {
|
||||
title := result.Title
|
||||
if len(title) > 20 {
|
||||
title = title[:20] + "..."
|
||||
}
|
||||
|
||||
if result.Err != nil {
|
||||
errType := "parse"
|
||||
if result.FetchErr {
|
||||
errType = "fetch"
|
||||
}
|
||||
return fmt.Sprintf("parsed: %s err:%s %v", host.Hostname, errType, result.Err)
|
||||
}
|
||||
|
||||
iconCount := len(result.Icons) + 1 // +1 for /favicon.ico
|
||||
iframe := "iframe:ok"
|
||||
if !result.IframeAllowed {
|
||||
iframe = "iframe:no"
|
||||
}
|
||||
|
||||
return fmt.Sprintf("parsed: %s \"%s\" icons:%d %s", host.Hostname, title, iconCount, iframe)
|
||||
}
|
||||
|
||||
// writeStats writes the stage stats to a JSON file.
|
||||
func writeStats(stats *Stats, cfg Config) {
|
||||
finishedAt := time.Now()
|
||||
duration := finishedAt.Sub(stats.StartedAt)
|
||||
|
||||
data := map[string]interface{}{
|
||||
"started_at": stats.StartedAt.Format(time.RFC3339),
|
||||
"finished_at": finishedAt.Format(time.RFC3339),
|
||||
"duration_seconds": int(duration.Seconds()),
|
||||
"processed": stats.Processed.Load(),
|
||||
"titles_found": stats.TitlesFound.Load(),
|
||||
"icons_found": stats.IconsFound.Load(),
|
||||
"iframe_blocked": stats.IframeBlocked.Load(),
|
||||
"fetch_errors": stats.FetchErrors.Load(),
|
||||
"parse_errors": stats.ParseErrors.Load(),
|
||||
"db_errors": stats.DBErrors.Load(),
|
||||
"panics": stats.Panics.Load(),
|
||||
}
|
||||
|
||||
os.MkdirAll("stats", 0755)
|
||||
f, err := os.Create("stats/02_warc_parse.json")
|
||||
if err != nil {
|
||||
fmt.Printf("Failed to write stats: %v\n", err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
enc := json.NewEncoder(f)
|
||||
enc.SetIndent("", " ")
|
||||
enc.Encode(data)
|
||||
fmt.Println("Stats written to stats/02_warc_parse.json")
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue