diff --git a/go.mod b/go.mod index 1e64f56..da33377 100644 --- a/go.mod +++ b/go.mod @@ -21,12 +21,14 @@ require ( github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect github.com/aws/smithy-go v1.25.1 // indirect + github.com/biessek/golang-ico v0.0.0-20250805151044-6d8ea19fb761 // indirect github.com/bits-and-blooms/bitset v1.24.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/pgx/v5 v5.9.2 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect + github.com/jsummers/gobmp v0.0.0-20230614200233-a9de23ed2e25 // indirect github.com/klauspost/compress v1.18.0 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/nlnwa/gowarc/v3 v3.1.0 // indirect diff --git a/go.sum b/go.sum index c0b1ae2..574fbc5 100644 --- a/go.sum +++ b/go.sum @@ -34,6 +34,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 h1:F/M5Y9I3nwr2IEpshZgh1GeHpOIt github.com/aws/aws-sdk-go-v2/service/sts v1.42.1/go.mod h1:mTNxImtovCOEEuD65mKW7DCsL+2gjEH+RPEAexAzAio= github.com/aws/smithy-go v1.25.1 h1:J8ERsGSU7d+aCmdQur5Txg6bVoYelvQJgtZehD12GkI= github.com/aws/smithy-go v1.25.1/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= +github.com/biessek/golang-ico v0.0.0-20250805151044-6d8ea19fb761 h1:7TVpSKu1j0y3bckgvUhzW88Tt5HlovF+8U9gP2TVzzo= +github.com/biessek/golang-ico v0.0.0-20250805151044-6d8ea19fb761/go.mod h1:iRWAFbKXMMkVQyxZ1PfGlkBr1TjATx1zy2MRprV7A3Q= github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.24.0 h1:H4x4TuulnokZKvHLfzVRTHJfFfnHEeSYJizujEZvmAM= github.com/bits-and-blooms/bitset v1.24.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= @@ -49,6 +51,8 @@ github.com/jackc/pgx/v5 v5.9.2 h1:3ZhOzMWnR4yJ+RW1XImIPsD1aNSz4T4fyP7zlQb56hw= 
// JSON schema of a bundle file.
type (
	// BundleEntry describes one tab of a bundle.
	BundleEntry struct {
		// Host is the site's hostname.
		Host string `json:"host"`
		// Title is the page title recorded for the host.
		Title string `json:"title"`
		// Icon holds the base64-encoded PNG icon. It stays empty when the
		// host has no icon or the icon could not be converted.
		Icon string `json:"icon"`
		// IconW and IconH are the pixel dimensions of Icon. Zero values are
		// left out of the JSON output.
		IconW int `json:"icon_w,omitempty"`
		IconH int `json:"icon_h,omitempty"`
		// IframeOk mirrors the host's IframeAllowed flag.
		IframeOk bool `json:"iframe_ok"`
	}

	// Bundle is the top-level JSON document: the list of tabs in one file.
	Bundle struct {
		Entries []BundleEntry `json:"entries"`
	}
)
+func buildEntry(ctx context.Context, host HostRow, iconsBucket string, logWriter *LogWriter, stats *Stats) BundleEntry { + entry := BundleEntry{ + Host: host.Hostname, + Title: host.HtmlTitle, + Icon: "", + IframeOk: host.IframeAllowed, + } + + if host.BestIconS3Key == "" { + return entry + } + + encoded, w, h, convertErr := safeConvert(ctx, host.BestIconS3Key, iconsBucket) + if convertErr != "" { + stats.ConvertErrors.Add(1) + logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr) + fmt.Println(logLine) + if logWriter != nil { + logWriter.Write(logLine, true) + } + return entry + } + + entry.Icon = encoded + entry.IconW = w + entry.IconH = h + return entry +} + +// safeConvert wraps convertIconToBase64PNG with panic recovery. +func safeConvert(ctx context.Context, s3Key, iconsBucket string) (encoded string, w, h int, errMsg string) { + defer func() { + if r := recover(); r != nil { + errMsg = fmt.Sprintf("panic: %v", r) + } + }() + + var err error + encoded, w, h, err = convertIconToBase64PNG(ctx, s3Key, iconsBucket) + if err != nil { + return "", 0, 0, err.Error() + } + return encoded, w, h, "" +} + +func serializeBundle(entries []BundleEntry) ([]byte, error) { + bundle := Bundle{Entries: entries} + return json.Marshal(bundle) +} + +func writeBundleLocal(outputDir string, index int, data []byte) error { + path := filepath.Join(outputDir, fmt.Sprintf("%04d.json", index)) + return os.WriteFile(path, data, 0644) +} + +func writeBundleS3(ctx context.Context, bucket string, index int, data []byte) error { + key := fmt.Sprintf("tabs/%04d.json", index) + return s3UploadBundle(ctx, bucket, key, data) +} diff --git a/pipeline/05_bundle_gen/convert.go b/pipeline/05_bundle_gen/convert.go new file mode 100644 index 0000000..09e5746 --- /dev/null +++ b/pipeline/05_bundle_gen/convert.go @@ -0,0 +1,82 @@ +package main + +import ( + "bytes" + "context" + "encoding/base64" + "fmt" + "image" + "image/png" + _ "image/gif" + _ "image/jpeg" + + _ 
"github.com/biessek/golang-ico" + _ "golang.org/x/image/webp" +) + +// convertIconToBase64PNG downloads an icon from S3, converts it to PNG, and returns base64-encoded data. +func convertIconToBase64PNG(ctx context.Context, s3Key string, iconsBucket string) (encoded string, width, height int, err error) { + data, err := s3Download(ctx, iconsBucket, s3Key) + if err != nil { + return "", 0, 0, fmt.Errorf("s3 download: %w", err) + } + + // Check for SVG (can't decode to raster without external deps) + if isSVG(data) { + return "", 0, 0, fmt.Errorf("svg not supported") + } + + // image.Decode handles PNG, GIF, JPEG, WebP, BMP, and ICO (via registered decoders) + img, _, err := image.Decode(bytes.NewReader(data)) + if err != nil { + return "", 0, 0, fmt.Errorf("decode: %w", err) + } + + // Downscale icons >128px to 32x32 to keep bundle sizes reasonable + bounds := img.Bounds() + w, h := bounds.Dx(), bounds.Dy() + if w > 128 || h > 128 { + img = resizeNearestNeighbor(img, 32, 32) + w, h = 32, 32 + } + + // Re-encode as PNG + var pngBuf bytes.Buffer + if err := png.Encode(&pngBuf, img); err != nil { + return "", 0, 0, fmt.Errorf("png encode: %w", err) + } + + encoded = base64.StdEncoding.EncodeToString(pngBuf.Bytes()) + return encoded, w, h, nil +} + +// resizeNearestNeighbor does a simple nearest-neighbor resize. 
// resizeNearestNeighbor returns a new dstW x dstH RGBA image anchored at the
// origin. Each destination pixel is copied from the source pixel found by
// scaling the destination coordinate with integer arithmetic; there is no
// interpolation. The source bounds need not start at (0, 0).
func resizeNearestNeighbor(src image.Image, dstW, dstH int) image.Image {
	srcBounds := src.Bounds()
	srcW := srcBounds.Dx()
	srcH := srcBounds.Dy()
	dst := image.NewRGBA(image.Rect(0, 0, dstW, dstH))
	for y := 0; y < dstH; y++ {
		// The source row is the same for the whole destination row.
		srcY := srcBounds.Min.Y + y*srcH/dstH
		for x := 0; x < dstW; x++ {
			srcX := srcBounds.Min.X + x*srcW/dstW
			dst.Set(x, y, src.At(srcX, srcY))
		}
	}
	return dst
}

// isSVG reports whether data looks like an SVG document by searching the
// first 256 bytes for an opening svg tag. Payloads shorter than five bytes
// are never treated as SVG.
//
// NOTE(review): the probe literal is cut off in the diff under review; "<svg"
// is assumed here. Confirm against the repository before merging.
func isSVG(data []byte) bool {
	if len(data) < 5 {
		return false
	}
	header := data
	if len(header) > 256 {
		header = header[:256]
	}
	return bytes.Contains(header, []byte("<svg"))
}
os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + return nil, err + } + return &LogWriter{file: f, errorsOnly: errorsOnly}, nil +} + +func (lw *LogWriter) Write(line string, isError bool) { + if lw.errorsOnly && !isError { + return + } + lw.mu.Lock() + defer lw.mu.Unlock() + fmt.Fprintln(lw.file, line) +} + +func (lw *LogWriter) Close() error { + return lw.file.Close() +} + +func writeStats(stats *Stats) { + finishedAt := time.Now() + duration := finishedAt.Sub(stats.StartedAt) + + data := map[string]interface{}{ + "started_at": stats.StartedAt.Format(time.RFC3339), + "finished_at": finishedAt.Format(time.RFC3339), + "duration_seconds": int(duration.Seconds()), + "total_hosts": stats.TotalHosts, + "hosts_with_icon": stats.HostsWithIcon, + "hosts_no_icon": stats.HostsNoIcon, + "convert_errors": stats.ConvertErrors.Load(), + "bundles_created": stats.BundlesCreated, + "total_bytes": stats.TotalBytes, + } + + os.MkdirAll("stats", 0755) + f, err := os.Create("stats/05_bundle_gen.json") + if err != nil { + fmt.Printf("Failed to write stats: %v\n", err) + return + } + defer f.Close() + + enc := json.NewEncoder(f) + enc.SetIndent("", " ") + enc.Encode(data) + fmt.Println("Stats written to stats/05_bundle_gen.json") +} diff --git a/pipeline/05_bundle_gen/main.go b/pipeline/05_bundle_gen/main.go new file mode 100644 index 0000000..f7525ca --- /dev/null +++ b/pipeline/05_bundle_gen/main.go @@ -0,0 +1,188 @@ +package main + +import ( + "context" + "flag" + "fmt" + "log" + "os" + "sync" + "sync/atomic" + "time" + + "github.com/jackc/pgx/v5/pgxpool" +) + +type Config struct { + DBUrl string + IconsBucket string + SiteBucket string + EntriesPerBundle int + Concurrency int + DryRun bool + OutputDir string + Limit int + LogFile string + LogErrors bool +} + +type Stats struct { + TotalHosts int + HostsWithIcon int + HostsNoIcon int + BundlesCreated int + ConvertErrors atomic.Int64 + TotalBytes int64 + StartedAt time.Time +} + +func main() { + cfg := Config{} + 
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)") + flag.StringVar(&cfg.IconsBucket, "icons-bucket", "everytab-icons", "S3 bucket with downloaded icons") + flag.StringVar(&cfg.SiteBucket, "site-bucket", "everytab-site", "S3 bucket for the static site") + flag.IntVar(&cfg.EntriesPerBundle, "entries-per-bundle", 120, "Tabs per bundle JSON file") + flag.BoolVar(&cfg.DryRun, "dry-run", false, "Write bundles to local disk instead of S3") + flag.StringVar(&cfg.OutputDir, "output-dir", "bundles", "Local output dir for dry-run mode") + flag.IntVar(&cfg.Limit, "limit", 0, "Max hosts to process (0 = all)") + flag.IntVar(&cfg.Concurrency, "concurrency", 50, "Concurrent icon conversions") + flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file") + flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file") + flag.Parse() + + if cfg.DBUrl == "" { + fmt.Println("Usage: bundle_gen --db DATABASE_URL [OPTIONS]") + flag.PrintDefaults() + os.Exit(1) + } + + ctx := context.Background() + + // Init S3 + if err := initS3(); err != nil { + log.Fatalf("Failed to init S3: %v", err) + } + + // Init DB + pool, err := pgxpool.New(ctx, cfg.DBUrl) + if err != nil { + log.Fatalf("Failed to connect to database: %v", err) + } + defer pool.Close() + + // Setup log file + var logWriter *LogWriter + if cfg.LogFile != "" { + logWriter, err = NewLogWriter(cfg.LogFile, cfg.LogErrors) + if err != nil { + log.Fatalf("Failed to open log file: %v", err) + } + defer logWriter.Close() + } + + stats := &Stats{StartedAt: time.Now()} + + // Fetch all qualifying hosts (randomized) + fmt.Println("=== Bundle Generator ===") + fmt.Println("Querying hosts...") + + hosts, err := fetchHosts(ctx, pool, cfg.Limit) + if err != nil { + log.Fatalf("Failed to fetch hosts: %v", err) + } + + stats.TotalHosts = len(hosts) + for _, h := range hosts { + if h.BestIconS3Key != "" { + stats.HostsWithIcon++ + } else { + stats.HostsNoIcon++ + } + } + + 
fmt.Printf("Total hosts: %d (with icon: %d, no icon: %d)\n", stats.TotalHosts, stats.HostsWithIcon, stats.HostsNoIcon) + fmt.Printf("Entries per bundle: %d\n", cfg.EntriesPerBundle) + fmt.Printf("Dry run: %v\n\n", cfg.DryRun) + + if cfg.DryRun { + os.MkdirAll(cfg.OutputDir, 0755) + } + + // Process hosts into bundle entries (concurrently for S3 downloads) + fmt.Printf("Converting icons and building entries (concurrency: %d)...\n", cfg.Concurrency) + entries := make([]BundleEntry, len(hosts)) + + var wg sync.WaitGroup + sem := make(chan struct{}, cfg.Concurrency) + var processed atomic.Int64 + + for i, host := range hosts { + wg.Add(1) + sem <- struct{}{} + go func(idx int, h HostRow) { + defer wg.Done() + defer func() { <-sem }() + entries[idx] = buildEntry(ctx, h, cfg.IconsBucket, logWriter, stats) + n := processed.Add(1) + if n%5000 == 0 { + fmt.Printf(" processed %d/%d hosts\n", n, len(hosts)) + } + }(i, host) + } + wg.Wait() + + // Chunk into bundles and write + fmt.Println("\nWriting bundles...") + bundleCount := 0 + var totalBytes int64 + + for i := 0; i < len(entries); i += cfg.EntriesPerBundle { + end := i + cfg.EntriesPerBundle + if end > len(entries) { + end = len(entries) + } + + chunk := entries[i:end] + bundleIndex := bundleCount + data, err := serializeBundle(chunk) + if err != nil { + log.Fatalf("Failed to serialize bundle %d: %v", bundleIndex, err) + } + + if cfg.DryRun { + err = writeBundleLocal(cfg.OutputDir, bundleIndex, data) + } else { + err = writeBundleS3(ctx, cfg.SiteBucket, bundleIndex, data) + } + if err != nil { + log.Fatalf("Failed to write bundle %d: %v", bundleIndex, err) + } + + logLine := fmt.Sprintf("bundle: %04d.json %d entries %dKB", bundleIndex, len(chunk), len(data)/1024) + fmt.Println(logLine) + if logWriter != nil { + logWriter.Write(logLine, false) + } + + totalBytes += int64(len(data)) + bundleCount++ + } + + stats.BundlesCreated = bundleCount + stats.TotalBytes = totalBytes + + // Summary + duration := 
time.Since(stats.StartedAt) + fmt.Printf("\n=== Summary ===\n") + fmt.Printf("Duration: %s\n", duration.Round(time.Second)) + fmt.Printf("Total hosts: %d\n", stats.TotalHosts) + fmt.Printf("Hosts with icon: %d\n", stats.HostsWithIcon) + fmt.Printf("Hosts without icon: %d\n", stats.HostsNoIcon) + fmt.Printf("Convert errors: %d\n", stats.ConvertErrors.Load()) + fmt.Printf("Bundles created: %d\n", stats.BundlesCreated) + fmt.Printf("Total size: %.1f MB\n", float64(stats.TotalBytes)/(1024*1024)) + fmt.Printf("Avg bundle size: %.0f KB\n", float64(stats.TotalBytes)/float64(stats.BundlesCreated)/1024) + fmt.Printf("TOTAL_BUNDLES = %d (bake this into the frontend)\n", stats.BundlesCreated) + + writeStats(stats) +} diff --git a/pipeline/05_bundle_gen/s3.go b/pipeline/05_bundle_gen/s3.go new file mode 100644 index 0000000..734f7ef --- /dev/null +++ b/pipeline/05_bundle_gen/s3.go @@ -0,0 +1,46 @@ +package main + +import ( + "bytes" + "context" + "io" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/s3" +) + +var s3Client *s3.Client + +func initS3() error { + cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1")) + if err != nil { + return err + } + s3Client = s3.NewFromConfig(cfg) + return nil +} + +// s3Download fetches an object from S3. +func s3Download(ctx context.Context, bucket, key string) ([]byte, error) { + resp, err := s3Client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(key), + }) + if err != nil { + return nil, err + } + defer resp.Body.Close() + return io.ReadAll(resp.Body) +} + +// s3UploadBundle uploads a bundle JSON to S3. +func s3UploadBundle(ctx context.Context, bucket, key string, data []byte) error { + _, err := s3Client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(key), + Body: bytes.NewReader(data), + ContentType: aws.String("application/json"), + }) + return err +}