added bundle generation
This commit is contained in:
parent
ca06a91dc6
commit
f89883e745
8 changed files with 536 additions and 0 deletions
85
pipeline/05_bundle_gen/bundle.go
Normal file
85
pipeline/05_bundle_gen/bundle.go
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// BundleEntry describes a single tab inside a bundle JSON file.
type BundleEntry struct {
	// Host is the hostname the tab points at.
	Host string `json:"host"`
	// Title is the page title shown for the tab.
	Title string `json:"title"`
	// Icon holds the base64-encoded icon, or "" when the host has no usable icon.
	Icon string `json:"icon"`
	// IconW is the icon width in pixels; left out of the JSON when zero.
	IconW int `json:"icon_w,omitempty"`
	// IconH is the icon height in pixels; left out of the JSON when zero.
	IconH int `json:"icon_h,omitempty"`
	// IframeOk reports whether the host may be embedded in an iframe.
	IframeOk bool `json:"iframe_ok"`
}
|
||||
|
||||
// Bundle is the top-level JSON structure.
|
||||
type Bundle struct {
|
||||
Entries []BundleEntry `json:"entries"`
|
||||
}
|
||||
|
||||
// buildEntry creates a BundleEntry for a host, converting its icon if available.
|
||||
func buildEntry(ctx context.Context, host HostRow, iconsBucket string, logWriter *LogWriter, stats *Stats) BundleEntry {
|
||||
entry := BundleEntry{
|
||||
Host: host.Hostname,
|
||||
Title: host.HtmlTitle,
|
||||
Icon: "",
|
||||
IframeOk: host.IframeAllowed,
|
||||
}
|
||||
|
||||
if host.BestIconS3Key == "" {
|
||||
return entry
|
||||
}
|
||||
|
||||
encoded, w, h, convertErr := safeConvert(ctx, host.BestIconS3Key, iconsBucket)
|
||||
if convertErr != "" {
|
||||
stats.ConvertErrors.Add(1)
|
||||
logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, true)
|
||||
}
|
||||
return entry
|
||||
}
|
||||
|
||||
entry.Icon = encoded
|
||||
entry.IconW = w
|
||||
entry.IconH = h
|
||||
return entry
|
||||
}
|
||||
|
||||
// safeConvert wraps convertIconToBase64PNG with panic recovery.
|
||||
func safeConvert(ctx context.Context, s3Key, iconsBucket string) (encoded string, w, h int, errMsg string) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
errMsg = fmt.Sprintf("panic: %v", r)
|
||||
}
|
||||
}()
|
||||
|
||||
var err error
|
||||
encoded, w, h, err = convertIconToBase64PNG(ctx, s3Key, iconsBucket)
|
||||
if err != nil {
|
||||
return "", 0, 0, err.Error()
|
||||
}
|
||||
return encoded, w, h, ""
|
||||
}
|
||||
|
||||
func serializeBundle(entries []BundleEntry) ([]byte, error) {
|
||||
bundle := Bundle{Entries: entries}
|
||||
return json.Marshal(bundle)
|
||||
}
|
||||
|
||||
// writeBundleLocal stores data as "<index, zero-padded to 4 digits>.json"
// inside outputDir with mode 0644. outputDir must already exist.
func writeBundleLocal(outputDir string, index int, data []byte) error {
	name := fmt.Sprintf("%04d.json", index)
	return os.WriteFile(filepath.Join(outputDir, name), data, 0644)
}
|
||||
|
||||
func writeBundleS3(ctx context.Context, bucket string, index int, data []byte) error {
|
||||
key := fmt.Sprintf("tabs/%04d.json", index)
|
||||
return s3UploadBundle(ctx, bucket, key, data)
|
||||
}
|
||||
82
pipeline/05_bundle_gen/convert.go
Normal file
82
pipeline/05_bundle_gen/convert.go
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"image"
|
||||
"image/png"
|
||||
_ "image/gif"
|
||||
_ "image/jpeg"
|
||||
|
||||
_ "github.com/biessek/golang-ico"
|
||||
_ "golang.org/x/image/webp"
|
||||
)
|
||||
|
||||
// convertIconToBase64PNG downloads an icon from S3, converts it to PNG, and returns base64-encoded data.
|
||||
func convertIconToBase64PNG(ctx context.Context, s3Key string, iconsBucket string) (encoded string, width, height int, err error) {
|
||||
data, err := s3Download(ctx, iconsBucket, s3Key)
|
||||
if err != nil {
|
||||
return "", 0, 0, fmt.Errorf("s3 download: %w", err)
|
||||
}
|
||||
|
||||
// Check for SVG (can't decode to raster without external deps)
|
||||
if isSVG(data) {
|
||||
return "", 0, 0, fmt.Errorf("svg not supported")
|
||||
}
|
||||
|
||||
// image.Decode handles PNG, GIF, JPEG, WebP, BMP, and ICO (via registered decoders)
|
||||
img, _, err := image.Decode(bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return "", 0, 0, fmt.Errorf("decode: %w", err)
|
||||
}
|
||||
|
||||
// Downscale icons >128px to 32x32 to keep bundle sizes reasonable
|
||||
bounds := img.Bounds()
|
||||
w, h := bounds.Dx(), bounds.Dy()
|
||||
if w > 128 || h > 128 {
|
||||
img = resizeNearestNeighbor(img, 32, 32)
|
||||
w, h = 32, 32
|
||||
}
|
||||
|
||||
// Re-encode as PNG
|
||||
var pngBuf bytes.Buffer
|
||||
if err := png.Encode(&pngBuf, img); err != nil {
|
||||
return "", 0, 0, fmt.Errorf("png encode: %w", err)
|
||||
}
|
||||
|
||||
encoded = base64.StdEncoding.EncodeToString(pngBuf.Bytes())
|
||||
return encoded, w, h, nil
|
||||
}
|
||||
|
||||
// resizeNearestNeighbor returns a new dstW x dstH RGBA image anchored at the
// origin. Each destination pixel copies the source pixel found by integer
// scaling of its coordinates (no interpolation).
func resizeNearestNeighbor(src image.Image, dstW, dstH int) image.Image {
	from := src.Bounds()
	fromW, fromH := from.Dx(), from.Dy()

	// pick maps a destination coordinate to its source offset along one axis.
	pick := func(pos, srcLen, dstLen int) int {
		return pos * srcLen / dstLen
	}

	out := image.NewRGBA(image.Rect(0, 0, dstW, dstH))
	for row := 0; row < dstH; row++ {
		sampleY := from.Min.Y + pick(row, fromH, dstH)
		for col := 0; col < dstW; col++ {
			sampleX := from.Min.X + pick(col, fromW, dstW)
			out.Set(col, row, src.At(sampleX, sampleY))
		}
	}
	return out
}
|
||||
|
||||
// isSVG reports whether data looks like an SVG (or other XML) document.
//
// The payload must begin with '<' once an optional UTF-8 byte-order mark and
// leading whitespace are skipped; only then are the first 256 bytes searched
// for "<svg" or "<?xml". Requiring the leading '<' prevents binary raster
// files that merely embed an XML snippet near the start (for example
// metadata) from being misclassified and rejected before decoding.
func isSVG(data []byte) bool {
	text := bytes.TrimPrefix(data, []byte("\xef\xbb\xbf"))
	text = bytes.TrimLeft(text, " \t\r\n")
	if len(text) < 5 || text[0] != '<' {
		return false
	}

	header := text
	if len(header) > 256 {
		header = header[:256]
	}
	return bytes.Contains(header, []byte("<svg")) || bytes.Contains(header, []byte("<?xml"))
}
|
||||
|
||||
// min returns the smaller of the two integers.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||
63
pipeline/05_bundle_gen/db.go
Normal file
63
pipeline/05_bundle_gen/db.go
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
)
|
||||
|
||||
// HostRow is one row of the hosts table as read by fetchHosts.
type HostRow struct {
	// Hostname comes from the hostname column.
	Hostname string
	// Protocol comes from the protocol column.
	Protocol string
	// HtmlTitle comes from html_title; fetchHosts only selects rows where it is not NULL.
	HtmlTitle string
	// IframeAllowed comes from iframe_allowed, with NULL read as true.
	IframeAllowed bool
	// BestIconS3Key comes from best_icon_s3_key, with NULL read as "".
	BestIconS3Key string
}
|
||||
|
||||
// fetchHosts gets all hosts with titles, randomized order.
|
||||
func fetchHosts(ctx context.Context, pool *pgxpool.Pool, limit int) ([]HostRow, error) {
|
||||
query := `
|
||||
SELECT hostname, protocol, html_title, COALESCE(iframe_allowed, true), COALESCE(best_icon_s3_key, '')
|
||||
FROM hosts
|
||||
WHERE html_title IS NOT NULL
|
||||
ORDER BY random()
|
||||
`
|
||||
if limit > 0 {
|
||||
query += " LIMIT $1"
|
||||
}
|
||||
|
||||
var rows interface{ Query(context.Context, string, ...interface{}) (interface{ Close(); Next() bool; Scan(...interface{}) error; Err() error }, error) }
|
||||
_ = rows // unused, using pool directly
|
||||
|
||||
var hosts []HostRow
|
||||
|
||||
if limit > 0 {
|
||||
pgRows, err := pool.Query(ctx, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer pgRows.Close()
|
||||
for pgRows.Next() {
|
||||
var h HostRow
|
||||
if err := pgRows.Scan(&h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconS3Key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
hosts = append(hosts, h)
|
||||
}
|
||||
return hosts, pgRows.Err()
|
||||
}
|
||||
|
||||
pgRows, err := pool.Query(ctx, query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer pgRows.Close()
|
||||
for pgRows.Next() {
|
||||
var h HostRow
|
||||
if err := pgRows.Scan(&h.Hostname, &h.Protocol, &h.HtmlTitle, &h.IframeAllowed, &h.BestIconS3Key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
hosts = append(hosts, h)
|
||||
}
|
||||
return hosts, pgRows.Err()
|
||||
}
|
||||
66
pipeline/05_bundle_gen/log.go
Normal file
66
pipeline/05_bundle_gen/log.go
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// LogWriter appends log lines to a file. Writes are serialized with a mutex,
// and non-error lines can be filtered out.
type LogWriter struct {
	file       *os.File
	mu         sync.Mutex
	errorsOnly bool
}

// NewLogWriter opens path for appending (creating it with mode 0644 if
// needed) and returns a LogWriter on top of it. With errorsOnly set, Write
// drops every line that is not flagged as an error.
func NewLogWriter(path string, errorsOnly bool) (*LogWriter, error) {
	const openFlags = os.O_CREATE | os.O_WRONLY | os.O_APPEND

	handle, err := os.OpenFile(path, openFlags, 0644)
	if err != nil {
		return nil, err
	}

	writer := &LogWriter{errorsOnly: errorsOnly}
	writer.file = handle
	return writer, nil
}

// Write appends line plus a trailing newline to the log file. Non-error lines
// are skipped when the writer is in errors-only mode. Write failures are not
// reported.
func (lw *LogWriter) Write(line string, isError bool) {
	if !isError && lw.errorsOnly {
		return
	}

	lw.mu.Lock()
	fmt.Fprintf(lw.file, "%s\n", line)
	lw.mu.Unlock()
}

// Close closes the underlying log file.
func (lw *LogWriter) Close() error {
	return lw.file.Close()
}
|
||||
|
||||
func writeStats(stats *Stats) {
|
||||
finishedAt := time.Now()
|
||||
duration := finishedAt.Sub(stats.StartedAt)
|
||||
|
||||
data := map[string]interface{}{
|
||||
"started_at": stats.StartedAt.Format(time.RFC3339),
|
||||
"finished_at": finishedAt.Format(time.RFC3339),
|
||||
"duration_seconds": int(duration.Seconds()),
|
||||
"total_hosts": stats.TotalHosts,
|
||||
"hosts_with_icon": stats.HostsWithIcon,
|
||||
"hosts_no_icon": stats.HostsNoIcon,
|
||||
"convert_errors": stats.ConvertErrors.Load(),
|
||||
"bundles_created": stats.BundlesCreated,
|
||||
"total_bytes": stats.TotalBytes,
|
||||
}
|
||||
|
||||
os.MkdirAll("stats", 0755)
|
||||
f, err := os.Create("stats/05_bundle_gen.json")
|
||||
if err != nil {
|
||||
fmt.Printf("Failed to write stats: %v\n", err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
enc := json.NewEncoder(f)
|
||||
enc.SetIndent("", " ")
|
||||
enc.Encode(data)
|
||||
fmt.Println("Stats written to stats/05_bundle_gen.json")
|
||||
}
|
||||
188
pipeline/05_bundle_gen/main.go
Normal file
188
pipeline/05_bundle_gen/main.go
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
)
|
||||
|
||||
// Config collects the command-line options of the bundle generator.
type Config struct {
	// DBUrl is the Postgres connection string (required).
	DBUrl string
	// IconsBucket is the S3 bucket holding the downloaded icons.
	IconsBucket string
	// SiteBucket is the S3 bucket of the static site that receives the bundles.
	SiteBucket string
	// EntriesPerBundle is the number of tabs written into each bundle file.
	EntriesPerBundle int
	// Concurrency is the number of icon conversions allowed to run at once.
	Concurrency int
	// DryRun writes bundles to local disk instead of S3.
	DryRun bool
	// OutputDir is the local directory used in dry-run mode.
	OutputDir string
	// Limit caps the number of hosts processed; 0 means all.
	Limit int
	// LogFile, when set, is a file that log lines are mirrored to.
	LogFile string
	// LogErrors restricts the log file to error lines only.
	LogErrors bool
}
|
||||
|
||||
// Stats accumulates counters for one run of the bundle generator.
// ConvertErrors is atomic because conversion goroutines bump it concurrently;
// the remaining fields are plain values.
type Stats struct {
	// TotalHosts is the number of host rows fetched.
	TotalHosts int
	// HostsWithIcon counts hosts that have an icon key.
	HostsWithIcon int
	// HostsNoIcon counts hosts without an icon key.
	HostsNoIcon int
	// BundlesCreated is the number of bundle files written.
	BundlesCreated int
	// ConvertErrors counts failed icon conversions.
	ConvertErrors atomic.Int64
	// TotalBytes is the combined size of all bundles written.
	TotalBytes int64
	// StartedAt is when the run began.
	StartedAt time.Time
}
|
||||
|
||||
func main() {
|
||||
cfg := Config{}
|
||||
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
|
||||
flag.StringVar(&cfg.IconsBucket, "icons-bucket", "everytab-icons", "S3 bucket with downloaded icons")
|
||||
flag.StringVar(&cfg.SiteBucket, "site-bucket", "everytab-site", "S3 bucket for the static site")
|
||||
flag.IntVar(&cfg.EntriesPerBundle, "entries-per-bundle", 120, "Tabs per bundle JSON file")
|
||||
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Write bundles to local disk instead of S3")
|
||||
flag.StringVar(&cfg.OutputDir, "output-dir", "bundles", "Local output dir for dry-run mode")
|
||||
flag.IntVar(&cfg.Limit, "limit", 0, "Max hosts to process (0 = all)")
|
||||
flag.IntVar(&cfg.Concurrency, "concurrency", 50, "Concurrent icon conversions")
|
||||
flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file")
|
||||
flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file")
|
||||
flag.Parse()
|
||||
|
||||
if cfg.DBUrl == "" {
|
||||
fmt.Println("Usage: bundle_gen --db DATABASE_URL [OPTIONS]")
|
||||
flag.PrintDefaults()
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Init S3
|
||||
if err := initS3(); err != nil {
|
||||
log.Fatalf("Failed to init S3: %v", err)
|
||||
}
|
||||
|
||||
// Init DB
|
||||
pool, err := pgxpool.New(ctx, cfg.DBUrl)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to connect to database: %v", err)
|
||||
}
|
||||
defer pool.Close()
|
||||
|
||||
// Setup log file
|
||||
var logWriter *LogWriter
|
||||
if cfg.LogFile != "" {
|
||||
logWriter, err = NewLogWriter(cfg.LogFile, cfg.LogErrors)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to open log file: %v", err)
|
||||
}
|
||||
defer logWriter.Close()
|
||||
}
|
||||
|
||||
stats := &Stats{StartedAt: time.Now()}
|
||||
|
||||
// Fetch all qualifying hosts (randomized)
|
||||
fmt.Println("=== Bundle Generator ===")
|
||||
fmt.Println("Querying hosts...")
|
||||
|
||||
hosts, err := fetchHosts(ctx, pool, cfg.Limit)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to fetch hosts: %v", err)
|
||||
}
|
||||
|
||||
stats.TotalHosts = len(hosts)
|
||||
for _, h := range hosts {
|
||||
if h.BestIconS3Key != "" {
|
||||
stats.HostsWithIcon++
|
||||
} else {
|
||||
stats.HostsNoIcon++
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("Total hosts: %d (with icon: %d, no icon: %d)\n", stats.TotalHosts, stats.HostsWithIcon, stats.HostsNoIcon)
|
||||
fmt.Printf("Entries per bundle: %d\n", cfg.EntriesPerBundle)
|
||||
fmt.Printf("Dry run: %v\n\n", cfg.DryRun)
|
||||
|
||||
if cfg.DryRun {
|
||||
os.MkdirAll(cfg.OutputDir, 0755)
|
||||
}
|
||||
|
||||
// Process hosts into bundle entries (concurrently for S3 downloads)
|
||||
fmt.Printf("Converting icons and building entries (concurrency: %d)...\n", cfg.Concurrency)
|
||||
entries := make([]BundleEntry, len(hosts))
|
||||
|
||||
var wg sync.WaitGroup
|
||||
sem := make(chan struct{}, cfg.Concurrency)
|
||||
var processed atomic.Int64
|
||||
|
||||
for i, host := range hosts {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
go func(idx int, h HostRow) {
|
||||
defer wg.Done()
|
||||
defer func() { <-sem }()
|
||||
entries[idx] = buildEntry(ctx, h, cfg.IconsBucket, logWriter, stats)
|
||||
n := processed.Add(1)
|
||||
if n%5000 == 0 {
|
||||
fmt.Printf(" processed %d/%d hosts\n", n, len(hosts))
|
||||
}
|
||||
}(i, host)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Chunk into bundles and write
|
||||
fmt.Println("\nWriting bundles...")
|
||||
bundleCount := 0
|
||||
var totalBytes int64
|
||||
|
||||
for i := 0; i < len(entries); i += cfg.EntriesPerBundle {
|
||||
end := i + cfg.EntriesPerBundle
|
||||
if end > len(entries) {
|
||||
end = len(entries)
|
||||
}
|
||||
|
||||
chunk := entries[i:end]
|
||||
bundleIndex := bundleCount
|
||||
data, err := serializeBundle(chunk)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to serialize bundle %d: %v", bundleIndex, err)
|
||||
}
|
||||
|
||||
if cfg.DryRun {
|
||||
err = writeBundleLocal(cfg.OutputDir, bundleIndex, data)
|
||||
} else {
|
||||
err = writeBundleS3(ctx, cfg.SiteBucket, bundleIndex, data)
|
||||
}
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to write bundle %d: %v", bundleIndex, err)
|
||||
}
|
||||
|
||||
logLine := fmt.Sprintf("bundle: %04d.json %d entries %dKB", bundleIndex, len(chunk), len(data)/1024)
|
||||
fmt.Println(logLine)
|
||||
if logWriter != nil {
|
||||
logWriter.Write(logLine, false)
|
||||
}
|
||||
|
||||
totalBytes += int64(len(data))
|
||||
bundleCount++
|
||||
}
|
||||
|
||||
stats.BundlesCreated = bundleCount
|
||||
stats.TotalBytes = totalBytes
|
||||
|
||||
// Summary
|
||||
duration := time.Since(stats.StartedAt)
|
||||
fmt.Printf("\n=== Summary ===\n")
|
||||
fmt.Printf("Duration: %s\n", duration.Round(time.Second))
|
||||
fmt.Printf("Total hosts: %d\n", stats.TotalHosts)
|
||||
fmt.Printf("Hosts with icon: %d\n", stats.HostsWithIcon)
|
||||
fmt.Printf("Hosts without icon: %d\n", stats.HostsNoIcon)
|
||||
fmt.Printf("Convert errors: %d\n", stats.ConvertErrors.Load())
|
||||
fmt.Printf("Bundles created: %d\n", stats.BundlesCreated)
|
||||
fmt.Printf("Total size: %.1f MB\n", float64(stats.TotalBytes)/(1024*1024))
|
||||
fmt.Printf("Avg bundle size: %.0f KB\n", float64(stats.TotalBytes)/float64(stats.BundlesCreated)/1024)
|
||||
fmt.Printf("TOTAL_BUNDLES = %d (bake this into the frontend)\n", stats.BundlesCreated)
|
||||
|
||||
writeStats(stats)
|
||||
}
|
||||
46
pipeline/05_bundle_gen/s3.go
Normal file
46
pipeline/05_bundle_gen/s3.go
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"io"
|
||||
|
||||
"github.com/aws/aws-sdk-go-v2/aws"
|
||||
"github.com/aws/aws-sdk-go-v2/config"
|
||||
"github.com/aws/aws-sdk-go-v2/service/s3"
|
||||
)
|
||||
|
||||
// s3Client is the package-wide S3 client used by s3Download and
// s3UploadBundle. It is nil until initS3 has run successfully.
var s3Client *s3.Client
|
||||
|
||||
func initS3() error {
|
||||
cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s3Client = s3.NewFromConfig(cfg)
|
||||
return nil
|
||||
}
|
||||
|
||||
// s3Download fetches an object from S3.
|
||||
func s3Download(ctx context.Context, bucket, key string) ([]byte, error) {
|
||||
resp, err := s3Client.GetObject(ctx, &s3.GetObjectInput{
|
||||
Bucket: aws.String(bucket),
|
||||
Key: aws.String(key),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
return io.ReadAll(resp.Body)
|
||||
}
|
||||
|
||||
// s3UploadBundle uploads a bundle JSON to S3.
|
||||
func s3UploadBundle(ctx context.Context, bucket, key string, data []byte) error {
|
||||
_, err := s3Client.PutObject(ctx, &s3.PutObjectInput{
|
||||
Bucket: aws.String(bucket),
|
||||
Key: aws.String(key),
|
||||
Body: bytes.NewReader(data),
|
||||
ContentType: aws.String("application/json"),
|
||||
})
|
||||
return err
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue