everytab/pipeline/05_bundle_gen/main.go

249 lines
6.8 KiB
Go

package main
import (
"context"
"flag"
"fmt"
"log"
"os"
"sync"
"sync/atomic"
"time"
"github.com/jackc/pgx/v5/pgxpool"
)
// Config carries the command-line options of the bundle generator. Each field
// is bound to one flag in main; the defaults live with the flag definitions.
type Config struct {
	// DBUrl is the Postgres connection string (-db). main exits with a usage
	// message when it is empty.
	DBUrl string

	// IconsDir is the directory with downloaded icons (-icons-dir).
	IconsDir string

	// SiteBucket is the S3 bucket for the static site (-site-bucket).
	SiteBucket string

	// EntriesPerBundle is the number of tabs per bundle JSON file
	// (-entries-per-bundle).
	EntriesPerBundle int

	// Concurrency is the number of concurrent icon conversions (-concurrency).
	Concurrency int

	// DryRun writes bundles to local disk instead of S3 (-dry-run).
	DryRun bool

	// OutputDir is the local output directory used in dry-run mode
	// (-output-dir).
	OutputDir string

	// Limit is the maximum number of hosts to process; 0 means all (-limit).
	Limit int

	// LogFile, when non-empty, is a file that log lines are mirrored to
	// (-log-file).
	LogFile string

	// LogErrors restricts the log file to errors only (-log-errors-only).
	LogErrors bool
}
// Stats accumulates the figures reported in the end-of-run summary. It holds
// an atomic counter, so it should be shared by pointer rather than copied.
type Stats struct {
	// TotalHosts is the number of hosts whose html_title is not NULL.
	TotalHosts int

	// HostsWithIcon is the subset of TotalHosts that also has a
	// best_icon_s3_key.
	HostsWithIcon int

	// HostsNoIcon is TotalHosts minus HostsWithIcon.
	HostsNoIcon int

	// BundlesCreated is the number of bundle files written during the run.
	BundlesCreated int

	// ConvertErrors counts icon conversion failures. main only reads it;
	// presumably buildEntry increments it (not visible here, confirm there).
	ConvertErrors atomic.Int64

	// TotalBytes is the summed size in bytes of all serialized bundles.
	TotalBytes int64

	// StartedAt is the time the run began; the summary duration is measured
	// from it.
	StartedAt time.Time
}
// main is the entry point of the bundle generator. It parses the command-line
// flags, connects to Postgres, pages through hosts with fetchHostsPage, builds
// one BundleEntry per host with bounded concurrency, and writes the entries out
// as numbered bundles of cfg.EntriesPerBundle entries each: to cfg.OutputDir in
// dry-run mode, otherwise to the cfg.SiteBucket S3 bucket. A summary is printed
// at the end and the collected Stats are passed to writeStats.
//
// Fatal conditions are reported through log.Fatalf, which exits the process
// immediately; deferred Close calls do not run on those paths.
func main() {
	cfg := Config{}
	flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
	flag.StringVar(&cfg.IconsDir, "icons-dir", "icons", "Directory with downloaded icons")
	flag.StringVar(&cfg.SiteBucket, "site-bucket", "everytab-site", "S3 bucket for the static site")
	flag.IntVar(&cfg.EntriesPerBundle, "entries-per-bundle", 120, "Tabs per bundle JSON file")
	flag.IntVar(&cfg.Concurrency, "concurrency", 200, "Concurrent icon conversions")
	flag.BoolVar(&cfg.DryRun, "dry-run", false, "Write bundles to local disk instead of S3")
	flag.StringVar(&cfg.OutputDir, "output-dir", "bundles", "Local output dir for dry-run mode")
	flag.IntVar(&cfg.Limit, "limit", 0, "Max hosts to process (0 = all)")
	flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file")
	flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file")
	flag.Parse()

	if cfg.DBUrl == "" {
		fmt.Println("Usage: bundle_gen --db DATABASE_URL [OPTIONS]")
		flag.PrintDefaults()
		os.Exit(1)
	}
	// A bundle size of 0 would make the bundle-writing loop below spin forever
	// emitting empty bundles, and a negative size would panic on slicing.
	if cfg.EntriesPerBundle <= 0 {
		log.Fatalf("--entries-per-bundle must be greater than 0 (got %d)", cfg.EntriesPerBundle)
	}
	// The concurrency value is the semaphore capacity: 0 yields an unbuffered
	// channel on which the first acquire blocks forever, and a negative
	// capacity makes make() panic.
	if cfg.Concurrency <= 0 {
		log.Fatalf("--concurrency must be greater than 0 (got %d)", cfg.Concurrency)
	}

	ctx := context.Background()

	// Init S3 (for uploading bundles)
	if err := initS3(); err != nil {
		log.Fatalf("Failed to init S3: %v", err)
	}

	// Init DB
	pool, err := pgxpool.New(ctx, cfg.DBUrl)
	if err != nil {
		log.Fatalf("Failed to connect to database: %v", err)
	}
	defer pool.Close()

	// Setup log file (optional; logWriter stays nil when no file was requested)
	var logWriter *LogWriter
	if cfg.LogFile != "" {
		logWriter, err = NewLogWriter(cfg.LogFile, cfg.LogErrors)
		if err != nil {
			log.Fatalf("Failed to open log file: %v", err)
		}
		defer logWriter.Close()
	}

	stats := &Stats{StartedAt: time.Now()}

	// Count hosts
	fmt.Println("=== Bundle Generator ===")
	var totalHosts, hostsWithIcon int
	err = pool.QueryRow(ctx, `SELECT COUNT(*) FROM hosts WHERE html_title IS NOT NULL`).Scan(&totalHosts)
	if err != nil {
		log.Fatalf("Failed to count hosts: %v", err)
	}
	err = pool.QueryRow(ctx, `SELECT COUNT(*) FROM hosts WHERE html_title IS NOT NULL AND best_icon_s3_key IS NOT NULL`).Scan(&hostsWithIcon)
	if err != nil {
		log.Fatalf("Failed to count icons: %v", err)
	}
	stats.TotalHosts = totalHosts
	stats.HostsWithIcon = hostsWithIcon
	stats.HostsNoIcon = totalHosts - hostsWithIcon

	fmt.Printf("Total hosts: %d (with icon: %d, no icon: %d)\n", totalHosts, hostsWithIcon, totalHosts-hostsWithIcon)
	fmt.Printf("Entries per bundle: %d\n", cfg.EntriesPerBundle)
	fmt.Printf("Concurrency: %d\n", cfg.Concurrency)
	fmt.Printf("Dry run: %v\n\n", cfg.DryRun)

	if cfg.DryRun {
		// Fail early if the output directory cannot be created instead of
		// discovering it on the first bundle write.
		if err := os.MkdirAll(cfg.OutputDir, 0755); err != nil {
			log.Fatalf("Failed to create output dir %q: %v", cfg.OutputDir, err)
		}
	} else {
		// Clean old bundles before writing new ones
		fmt.Println("Cleaning old bundles from S3...")
		if err := s3DeletePrefix(cfg.SiteBucket, "tabs/"); err != nil {
			log.Fatalf("Failed to clean old bundles: %v", err)
		}
	}

	bundleCount := 0
	var totalBytes int64

	// writeBundle serializes entries, stores them as bundle number bundleCount
	// (local disk in dry-run mode, S3 otherwise), logs the result and advances
	// the running counters. label names the bundle in fatal error messages.
	writeBundle := func(entries []BundleEntry, label string) {
		data, err := serializeBundle(entries)
		if err != nil {
			log.Fatalf("Failed to serialize %s: %v", label, err)
		}
		if cfg.DryRun {
			err = writeBundleLocal(cfg.OutputDir, bundleCount, data)
		} else {
			err = writeBundleS3(cfg.SiteBucket, bundleCount, data)
		}
		if err != nil {
			log.Fatalf("Failed to write %s: %v", label, err)
		}
		logLine := fmt.Sprintf("bundle: %04d.json %d entries %dKB", bundleCount, len(entries), len(data)/1024)
		fmt.Println(logLine)
		if logWriter != nil {
			logWriter.Write(logLine, false)
		}
		totalBytes += int64(len(data))
		bundleCount++
	}

	// Stream hosts from DB in pages, convert icons, write bundles incrementally
	fmt.Println("Processing hosts and writing bundles...")
	var lastRandom float64 = -1           // pagination cursor passed to fetchHostsPage
	pageSize := cfg.EntriesPerBundle * 50 // fetch 50 bundles worth at a time
	var entryBuf []BundleEntry            // entries not yet written to a bundle
	hostsProcessed := 0

	// Semaphore bounding the number of in-flight buildEntry calls. Every slot
	// is released before wg.Wait returns, so one channel serves all pages.
	sem := make(chan struct{}, cfg.Concurrency)

	for {
		// Fetch a page of hosts, clamped so that -limit is never exceeded.
		limit := pageSize
		if cfg.Limit > 0 {
			remaining := cfg.Limit - hostsProcessed
			if remaining <= 0 {
				break
			}
			if limit > remaining {
				limit = remaining
			}
		}
		hosts, err := fetchHostsPage(ctx, pool, lastRandom, limit)
		if err != nil {
			log.Fatalf("Failed to fetch hosts: %v", err)
		}
		if len(hosts) == 0 {
			break
		}
		lastRandom = hosts[len(hosts)-1].RandomOrder
		hostsProcessed += len(hosts)

		// Convert icons concurrently for this page. Each goroutine writes only
		// its own slot of pageEntries, which keeps the page order intact.
		pageEntries := make([]BundleEntry, len(hosts))
		var wg sync.WaitGroup
		for i, host := range hosts {
			wg.Add(1)
			sem <- struct{}{}
			go func(idx int, h HostRow) {
				defer wg.Done()
				defer func() { <-sem }()
				pageEntries[idx] = buildEntry(h, cfg.IconsDir, logWriter, stats)
			}(i, host)
		}
		wg.Wait()
		entryBuf = append(entryBuf, pageEntries...)

		// Write complete bundles from the buffer; the remainder waits for the
		// next page.
		for len(entryBuf) >= cfg.EntriesPerBundle {
			chunk := entryBuf[:cfg.EntriesPerBundle]
			entryBuf = entryBuf[cfg.EntriesPerBundle:]
			writeBundle(chunk, fmt.Sprintf("bundle %d", bundleCount))
		}
	}

	// Write final partial bundle
	if len(entryBuf) > 0 {
		writeBundle(entryBuf, "final bundle")
	}

	stats.BundlesCreated = bundleCount
	stats.TotalBytes = totalBytes

	// Summary
	duration := time.Since(stats.StartedAt)
	fmt.Printf("\n=== Summary ===\n")
	fmt.Printf("Duration: %s\n", duration.Round(time.Second))
	fmt.Printf("Total hosts: %d\n", stats.TotalHosts)
	fmt.Printf("Hosts with icon: %d\n", stats.HostsWithIcon)
	fmt.Printf("Hosts without icon: %d\n", stats.HostsNoIcon)
	fmt.Printf("Convert errors: %d\n", stats.ConvertErrors.Load())
	fmt.Printf("Bundles created: %d\n", stats.BundlesCreated)
	fmt.Printf("Total size: %.1f MB\n", float64(stats.TotalBytes)/(1024*1024))
	// max(..., 1) guards the average against division by zero when no bundle
	// was written.
	fmt.Printf("Avg bundle size: %.0f KB\n", float64(stats.TotalBytes)/float64(max(stats.BundlesCreated, 1))/1024)
	fmt.Printf("TOTAL_BUNDLES = %d (bake this into the frontend)\n", stats.BundlesCreated)
	writeStats(stats)
}
// max returns the larger of the two ints a and b (b when they are equal).
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}