From 5b3f6a6870e99cab07a43c6b1f47d87ed279a190 Mon Sep 17 00:00:00 2001 From: Joe Lothan Date: Mon, 18 May 2026 12:43:50 -0400 Subject: [PATCH] switched from s3 to disk for saving icons --- infra/main.tf | 2 +- pipeline/03_icon_download/download.go | 14 +++--- pipeline/03_icon_download/main.go | 16 +++---- pipeline/03_icon_download/s3.go | 64 --------------------------- pipeline/03_icon_download/storage.go | 38 ++++++++++++++++ pipeline/05_bundle_gen/bundle.go | 13 +++--- pipeline/05_bundle_gen/convert.go | 9 ++-- pipeline/05_bundle_gen/main.go | 10 ++--- pipeline/05_bundle_gen/s3.go | 30 +++++++------ 9 files changed, 84 insertions(+), 112 deletions(-) delete mode 100644 pipeline/03_icon_download/s3.go create mode 100644 pipeline/03_icon_download/storage.go diff --git a/infra/main.tf b/infra/main.tf index b6f31bc..617e2ea 100644 --- a/infra/main.tf +++ b/infra/main.tf @@ -326,7 +326,7 @@ resource "aws_instance" "main" { iam_instance_profile = aws_iam_instance_profile.ec2[0].name root_block_device { - volume_size = 50 + volume_size = 300 volume_type = "gp3" } diff --git a/pipeline/03_icon_download/download.go b/pipeline/03_icon_download/download.go index 16b03b6..5bb6774 100644 --- a/pipeline/03_icon_download/download.go +++ b/pipeline/03_icon_download/download.go @@ -1,7 +1,6 @@ package main import ( - "context" "crypto/sha256" "encoding/hex" "fmt" @@ -12,8 +11,8 @@ import ( "time" ) -// processIcon downloads, validates, and uploads a single icon. -func processIcon(ctx context.Context, icon IconRow, cfg Config) DownloadResult { +// processIcon downloads, validates, and stores a single icon. +func processIcon(icon IconRow, cfg Config) DownloadResult { // Download data, contentType, err := downloadIcon(icon.URL, cfg.Timeout, cfg.MaxSize) if err != nil { @@ -39,15 +38,14 @@ func processIcon(ctx context.Context, icon IconRow, cfg Config) DownloadResult { hash := sha256.Sum256(data) s3Key := hex.EncodeToString(hash[:]) - // Upload to S3 (skip if already exists — dedup) + // Write to disk (skip if already exists — dedup) dedup := false if !cfg.DryRun { - exists, err := s3Exists(ctx, s3Key) - if err == nil && exists { + if iconExists(s3Key) { dedup = true } else { - if err := s3Upload(ctx, s3Key, data, contentType); err != nil { - return DownloadResult{Err: fmt.Sprintf("s3 upload: %v", err), ErrType: "other"} + if err := iconWrite(s3Key, data); err != nil { + return DownloadResult{Err: fmt.Sprintf("disk write: %v", err), ErrType: "other"} } } } diff --git a/pipeline/03_icon_download/main.go b/pipeline/03_icon_download/main.go index 1f6f52f..b345e1b 100644 --- a/pipeline/03_icon_download/main.go +++ b/pipeline/03_icon_download/main.go @@ -15,7 +15,7 @@ import ( type Config struct { DBUrl string - S3Bucket string + IconsDir string BatchSize int Concurrency int Limit int @@ -45,13 +45,13 @@ type Stats struct { func main() { cfg := Config{} flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)") - flag.StringVar(&cfg.S3Bucket, "s3-bucket", "everytab-icons", "S3 bucket for icons") + flag.StringVar(&cfg.IconsDir, "icons-dir", "icons", "Directory to store downloaded icons") flag.IntVar(&cfg.BatchSize, "batch-size", 200, "Rows to claim per batch") flag.IntVar(&cfg.Concurrency, "concurrency", 200, "Number of concurrent goroutines") flag.IntVar(&cfg.Limit, "limit", 0, "Max icons to process (0 = all)") flag.DurationVar(&cfg.Timeout, "timeout", 10*time.Second, "HTTP request timeout") flag.Int64Var(&cfg.MaxSize, "max-size", 512*1024, "Max icon download size in bytes") - flag.BoolVar(&cfg.DryRun, "dry-run", false, "Download but don't upload to S3 or update DB") + flag.BoolVar(&cfg.DryRun, "dry-run", false, "Download but don't write to disk or update DB") flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file") flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file") flag.Parse() @@ -64,9 +64,9 @@ func main() { ctx := context.Background() - // Init S3 - if err := initS3(cfg.S3Bucket); err != nil { - log.Fatalf("Failed to init S3: %v", err) + // Init storage + if err := initStorage(cfg.IconsDir); err != nil { + log.Fatalf("Failed to init storage: %v", err) } // Init DB pool @@ -103,7 +103,7 @@ func main() { fmt.Printf("Concurrency: %d\n", cfg.Concurrency) fmt.Printf("Timeout: %s\n", cfg.Timeout) fmt.Printf("Max size: %dKB\n", cfg.MaxSize/1024) - fmt.Printf("S3 bucket: %s\n", cfg.S3Bucket) + fmt.Printf("Icons dir: %s\n", cfg.IconsDir) fmt.Printf("Dry run: %v\n\n", cfg.DryRun) // Setup log file @@ -168,7 +168,7 @@ func main() { } }() - result := processIcon(ctx, icon, cfg) + result := processIcon(icon, cfg) // Log line logLine := formatLogLine(icon, result) diff --git a/pipeline/03_icon_download/s3.go b/pipeline/03_icon_download/s3.go deleted file mode 100644 index c0a77b7..0000000 --- a/pipeline/03_icon_download/s3.go +++ /dev/null @@ -1,64 +0,0 @@ -package main - -import ( - "bytes" - "context" - "errors" - - "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/config" - "github.com/aws/aws-sdk-go-v2/service/s3" - "github.com/aws/aws-sdk-go-v2/service/s3/types" -) - -var ( - s3Client *s3.Client - bucket string -) - -func initS3(bucketName string) error { - cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1")) - if err != nil { - return err - } - s3Client = s3.NewFromConfig(cfg) - bucket = bucketName - return nil -} - -// s3Exists checks if an object already exists in S3 (for dedup). -func s3Exists(ctx context.Context, key string) (bool, error) { - _, err := s3Client.HeadObject(ctx, &s3.HeadObjectInput{ - Bucket: aws.String(bucket), - Key: aws.String(key), - }) - if err != nil { - var notFound *types.NotFound - if errors.As(err, ¬Found) { - return false, nil - } - // NoSuchKey error type - var nsk *types.NoSuchKey - if errors.As(err, &nsk) { - return false, nil - } - // Some S3 errors return 404 as a generic error - if ctx.Err() != nil { - return false, ctx.Err() - } - // Treat other errors as "not found" to avoid blocking uploads - return false, nil - } - return true, nil -} - -// s3Upload uploads icon data to S3 with the given key. -func s3Upload(ctx context.Context, key string, data []byte, contentType string) error { - _, err := s3Client.PutObject(ctx, &s3.PutObjectInput{ - Bucket: aws.String(bucket), - Key: aws.String(key), - Body: bytes.NewReader(data), - ContentType: aws.String(contentType), - }) - return err -} diff --git a/pipeline/03_icon_download/storage.go b/pipeline/03_icon_download/storage.go new file mode 100644 index 0000000..bed756d --- /dev/null +++ b/pipeline/03_icon_download/storage.go @@ -0,0 +1,38 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" +) + +var iconsDir string + +func initStorage(dir string) error { + iconsDir = dir + return os.MkdirAll(dir, 0755) +} + +// hashToPath converts a SHA-256 hex hash to a sharded file path. +// e.g., "abcdef1234..." → "icons/ab/cd/ef/abcdef1234..." +func hashToPath(hash string) string { + if len(hash) < 6 { + return filepath.Join(iconsDir, hash) + } + return filepath.Join(iconsDir, hash[:2], hash[2:4], hash[4:6], hash) +} + +// iconExists checks if an icon is already stored on disk (for dedup). +func iconExists(hash string) bool { + _, err := os.Stat(hashToPath(hash)) + return err == nil +} + +// iconWrite saves icon data to disk in the sharded directory structure. +func iconWrite(hash string, data []byte) error { + path := hashToPath(hash) + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return fmt.Errorf("mkdir: %w", err) + } + return os.WriteFile(path, data, 0644) +} diff --git a/pipeline/05_bundle_gen/bundle.go b/pipeline/05_bundle_gen/bundle.go index 680583a..7dd7693 100644 --- a/pipeline/05_bundle_gen/bundle.go +++ b/pipeline/05_bundle_gen/bundle.go @@ -1,7 +1,6 @@ package main import ( - "context" "encoding/json" "fmt" "os" @@ -24,7 +23,7 @@ type Bundle struct { } // buildEntry creates a BundleEntry for a host, converting its icon if available. -func buildEntry(ctx context.Context, host HostRow, iconsBucket string, logWriter *LogWriter, stats *Stats) BundleEntry { +func buildEntry(host HostRow, iconsDir string, logWriter *LogWriter, stats *Stats) BundleEntry { entry := BundleEntry{ Host: host.Hostname, Title: host.HtmlTitle, @@ -36,7 +35,7 @@ func buildEntry(ctx context.Context, host HostRow, iconsBucket string, logWriter return entry } - encoded, w, h, convertErr := safeConvert(ctx, host.BestIconS3Key, iconsBucket) + encoded, w, h, convertErr := safeConvert(host.BestIconS3Key, iconsDir) if convertErr != "" { stats.ConvertErrors.Add(1) logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr) @@ -54,7 +53,7 @@ func buildEntry(ctx context.Context, host HostRow, iconsBucket string, logWriter } // safeConvert wraps convertIconToBase64PNG with panic recovery. -func safeConvert(ctx context.Context, s3Key, iconsBucket string) (encoded string, w, h int, errMsg string) { +func safeConvert(hash, iconsDir string) (encoded string, w, h int, errMsg string) { defer func() { if r := recover(); r != nil { errMsg = fmt.Sprintf("panic: %v", r) @@ -62,7 +61,7 @@ func safeConvert(ctx context.Context, s3Key, iconsBucket string) (encoded string }() var err error - encoded, w, h, err = convertIconToBase64PNG(ctx, s3Key, iconsBucket) + encoded, w, h, err = convertIconToBase64PNG(hash, iconsDir) if err != nil { return "", 0, 0, err.Error() } @@ -79,7 +78,7 @@ func writeBundleLocal(outputDir string, index int, data []byte) error { return os.WriteFile(path, data, 0644) } -func writeBundleS3(ctx context.Context, bucket string, index int, data []byte) error { +func writeBundleS3(bucket string, index int, data []byte) error { key := fmt.Sprintf("tabs/%04d.json", index) - return s3UploadBundle(ctx, bucket, key, data) + return s3UploadBundle(bucket, key, data) } diff --git a/pipeline/05_bundle_gen/convert.go b/pipeline/05_bundle_gen/convert.go index 09e5746..0b5a33b 100644 --- a/pipeline/05_bundle_gen/convert.go +++ b/pipeline/05_bundle_gen/convert.go @@ -2,7 +2,6 @@ package main import ( "bytes" - "context" "encoding/base64" "fmt" "image" @@ -14,11 +13,11 @@ import ( _ "golang.org/x/image/webp" ) -// convertIconToBase64PNG downloads an icon from S3, converts it to PNG, and returns base64-encoded data. -func convertIconToBase64PNG(ctx context.Context, s3Key string, iconsBucket string) (encoded string, width, height int, err error) { - data, err := s3Download(ctx, iconsBucket, s3Key) +// convertIconToBase64PNG reads an icon from disk, converts it to PNG, and returns base64-encoded data. +func convertIconToBase64PNG(hash string, iconsDir string) (encoded string, width, height int, err error) { + data, err := readIconFromDisk(iconsDir, hash) if err != nil { - return "", 0, 0, fmt.Errorf("s3 download: %w", err) + return "", 0, 0, fmt.Errorf("read icon: %w", err) } // Check for SVG (can't decode to raster without external deps) diff --git a/pipeline/05_bundle_gen/main.go b/pipeline/05_bundle_gen/main.go index 06c531a..7e83541 100644 --- a/pipeline/05_bundle_gen/main.go +++ b/pipeline/05_bundle_gen/main.go @@ -15,7 +15,7 @@ import ( type Config struct { DBUrl string - IconsBucket string + IconsDir string SiteBucket string EntriesPerBundle int Concurrency int @@ -39,7 +39,7 @@ type Stats struct { func main() { cfg := Config{} flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)") - flag.StringVar(&cfg.IconsBucket, "icons-bucket", "everytab-icons", "S3 bucket with downloaded icons") + flag.StringVar(&cfg.IconsDir, "icons-dir", "icons", "Directory with downloaded icons") flag.StringVar(&cfg.SiteBucket, "site-bucket", "everytab-site", "S3 bucket for the static site") flag.IntVar(&cfg.EntriesPerBundle, "entries-per-bundle", 120, "Tabs per bundle JSON file") flag.BoolVar(&cfg.DryRun, "dry-run", false, "Write bundles to local disk instead of S3") @@ -122,7 +122,7 @@ func main() { go func(idx int, h HostRow) { defer wg.Done() defer func() { <-sem }() - entries[idx] = buildEntry(ctx, h, cfg.IconsBucket, logWriter, stats) + entries[idx] = buildEntry(h, cfg.IconsDir, logWriter, stats) n := processed.Add(1) if n%5000 == 0 { fmt.Printf(" processed %d/%d hosts\n", n, len(hosts)) @@ -134,7 +134,7 @@ func main() { // Clean old bundles before writing new ones (avoids orphans if count changed) if !cfg.DryRun { fmt.Println("\nCleaning old bundles from S3...") - if err := s3DeletePrefix(ctx, cfg.SiteBucket, "tabs/"); err != nil { + if err := s3DeletePrefix(cfg.SiteBucket, "tabs/"); err != nil { log.Fatalf("Failed to clean old bundles: %v", err) } } @@ -160,7 +160,7 @@ func main() { if cfg.DryRun { err = writeBundleLocal(cfg.OutputDir, bundleIndex, data) } else { - err = writeBundleS3(ctx, cfg.SiteBucket, bundleIndex, data) + err = writeBundleS3(cfg.SiteBucket, bundleIndex, data) } if err != nil { log.Fatalf("Failed to write bundle %d: %v", bundleIndex, err) diff --git a/pipeline/05_bundle_gen/s3.go b/pipeline/05_bundle_gen/s3.go index 1e02253..73a2ed4 100644 --- a/pipeline/05_bundle_gen/s3.go +++ b/pipeline/05_bundle_gen/s3.go @@ -4,7 +4,8 @@ import ( "bytes" "context" "fmt" - "io" + "os" + "path/filepath" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/config" @@ -23,22 +24,22 @@ func initS3() error { return nil } -// s3Download fetches an object from S3. -func s3Download(ctx context.Context, bucket, key string) ([]byte, error) { - resp, err := s3Client.GetObject(ctx, &s3.GetObjectInput{ - Bucket: aws.String(bucket), - Key: aws.String(key), - }) - if err != nil { - return nil, err +// iconHashToPath converts a SHA-256 hash to a sharded file path. +func iconHashToPath(iconsDir, hash string) string { + if len(hash) < 6 { + return filepath.Join(iconsDir, hash) } - defer resp.Body.Close() - return io.ReadAll(resp.Body) + return filepath.Join(iconsDir, hash[:2], hash[2:4], hash[4:6], hash) +} + +// readIconFromDisk reads an icon file from the local sharded directory. +func readIconFromDisk(iconsDir, hash string) ([]byte, error) { + return os.ReadFile(iconHashToPath(iconsDir, hash)) } // s3UploadBundle uploads a bundle JSON to S3. -func s3UploadBundle(ctx context.Context, bucket, key string, data []byte) error { - _, err := s3Client.PutObject(ctx, &s3.PutObjectInput{ +func s3UploadBundle(bucket, key string, data []byte) error { + _, err := s3Client.PutObject(context.Background(), &s3.PutObjectInput{ Bucket: aws.String(bucket), Key: aws.String(key), Body: bytes.NewReader(data), @@ -48,7 +49,8 @@ func s3UploadBundle(ctx context.Context, bucket, key string, data []byte) error } // s3DeletePrefix deletes all objects under a prefix in S3. -func s3DeletePrefix(ctx context.Context, bucket, prefix string) error { +func s3DeletePrefix(bucket, prefix string) error { + ctx := context.Background() paginator := s3.NewListObjectsV2Paginator(s3Client, &s3.ListObjectsV2Input{ Bucket: aws.String(bucket), Prefix: aws.String(prefix),