switched from s3 to disk for saving icons

This commit is contained in:
Joe Lothan 2026-05-18 12:43:50 -04:00
parent 113a261dae
commit 5b3f6a6870
9 changed files with 84 additions and 112 deletions

View file

@ -326,7 +326,7 @@ resource "aws_instance" "main" {
iam_instance_profile = aws_iam_instance_profile.ec2[0].name iam_instance_profile = aws_iam_instance_profile.ec2[0].name
root_block_device { root_block_device {
volume_size = 50 volume_size = 300
volume_type = "gp3" volume_type = "gp3"
} }

View file

@ -1,7 +1,6 @@
package main package main
import ( import (
"context"
"crypto/sha256" "crypto/sha256"
"encoding/hex" "encoding/hex"
"fmt" "fmt"
@ -12,8 +11,8 @@ import (
"time" "time"
) )
// processIcon downloads, validates, and uploads a single icon. // processIcon downloads, validates, and stores a single icon.
func processIcon(ctx context.Context, icon IconRow, cfg Config) DownloadResult { func processIcon(icon IconRow, cfg Config) DownloadResult {
// Download // Download
data, contentType, err := downloadIcon(icon.URL, cfg.Timeout, cfg.MaxSize) data, contentType, err := downloadIcon(icon.URL, cfg.Timeout, cfg.MaxSize)
if err != nil { if err != nil {
@ -39,15 +38,14 @@ func processIcon(ctx context.Context, icon IconRow, cfg Config) DownloadResult {
hash := sha256.Sum256(data) hash := sha256.Sum256(data)
s3Key := hex.EncodeToString(hash[:]) s3Key := hex.EncodeToString(hash[:])
// Upload to S3 (skip if already exists — dedup) // Write to disk (skip if already exists — dedup)
dedup := false dedup := false
if !cfg.DryRun { if !cfg.DryRun {
exists, err := s3Exists(ctx, s3Key) if iconExists(s3Key) {
if err == nil && exists {
dedup = true dedup = true
} else { } else {
if err := s3Upload(ctx, s3Key, data, contentType); err != nil { if err := iconWrite(s3Key, data); err != nil {
return DownloadResult{Err: fmt.Sprintf("s3 upload: %v", err), ErrType: "other"} return DownloadResult{Err: fmt.Sprintf("disk write: %v", err), ErrType: "other"}
} }
} }
} }

View file

@ -15,7 +15,7 @@ import (
type Config struct { type Config struct {
DBUrl string DBUrl string
S3Bucket string IconsDir string
BatchSize int BatchSize int
Concurrency int Concurrency int
Limit int Limit int
@ -45,13 +45,13 @@ type Stats struct {
func main() { func main() {
cfg := Config{} cfg := Config{}
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)") flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
flag.StringVar(&cfg.S3Bucket, "s3-bucket", "everytab-icons", "S3 bucket for icons") flag.StringVar(&cfg.IconsDir, "icons-dir", "icons", "Directory to store downloaded icons")
flag.IntVar(&cfg.BatchSize, "batch-size", 200, "Rows to claim per batch") flag.IntVar(&cfg.BatchSize, "batch-size", 200, "Rows to claim per batch")
flag.IntVar(&cfg.Concurrency, "concurrency", 200, "Number of concurrent goroutines") flag.IntVar(&cfg.Concurrency, "concurrency", 200, "Number of concurrent goroutines")
flag.IntVar(&cfg.Limit, "limit", 0, "Max icons to process (0 = all)") flag.IntVar(&cfg.Limit, "limit", 0, "Max icons to process (0 = all)")
flag.DurationVar(&cfg.Timeout, "timeout", 10*time.Second, "HTTP request timeout") flag.DurationVar(&cfg.Timeout, "timeout", 10*time.Second, "HTTP request timeout")
flag.Int64Var(&cfg.MaxSize, "max-size", 512*1024, "Max icon download size in bytes") flag.Int64Var(&cfg.MaxSize, "max-size", 512*1024, "Max icon download size in bytes")
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Download but don't upload to S3 or update DB") flag.BoolVar(&cfg.DryRun, "dry-run", false, "Download but don't write to disk or update DB")
flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file") flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file")
flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file") flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file")
flag.Parse() flag.Parse()
@ -64,9 +64,9 @@ func main() {
ctx := context.Background() ctx := context.Background()
// Init S3 // Init storage
if err := initS3(cfg.S3Bucket); err != nil { if err := initStorage(cfg.IconsDir); err != nil {
log.Fatalf("Failed to init S3: %v", err) log.Fatalf("Failed to init storage: %v", err)
} }
// Init DB pool // Init DB pool
@ -103,7 +103,7 @@ func main() {
fmt.Printf("Concurrency: %d\n", cfg.Concurrency) fmt.Printf("Concurrency: %d\n", cfg.Concurrency)
fmt.Printf("Timeout: %s\n", cfg.Timeout) fmt.Printf("Timeout: %s\n", cfg.Timeout)
fmt.Printf("Max size: %dKB\n", cfg.MaxSize/1024) fmt.Printf("Max size: %dKB\n", cfg.MaxSize/1024)
fmt.Printf("S3 bucket: %s\n", cfg.S3Bucket) fmt.Printf("Icons dir: %s\n", cfg.IconsDir)
fmt.Printf("Dry run: %v\n\n", cfg.DryRun) fmt.Printf("Dry run: %v\n\n", cfg.DryRun)
// Setup log file // Setup log file
@ -168,7 +168,7 @@ func main() {
} }
}() }()
result := processIcon(ctx, icon, cfg) result := processIcon(icon, cfg)
// Log line // Log line
logLine := formatLogLine(icon, result) logLine := formatLogLine(icon, result)

View file

@ -1,64 +0,0 @@
package main
import (
"bytes"
"context"
"errors"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/s3"
"github.com/aws/aws-sdk-go-v2/service/s3/types"
)
var (
s3Client *s3.Client
bucket string
)
func initS3(bucketName string) error {
cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1"))
if err != nil {
return err
}
s3Client = s3.NewFromConfig(cfg)
bucket = bucketName
return nil
}
// s3Exists checks if an object already exists in S3 (for dedup).
func s3Exists(ctx context.Context, key string) (bool, error) {
_, err := s3Client.HeadObject(ctx, &s3.HeadObjectInput{
Bucket: aws.String(bucket),
Key: aws.String(key),
})
if err != nil {
var notFound *types.NotFound
if errors.As(err, &notFound) {
return false, nil
}
// NoSuchKey error type
var nsk *types.NoSuchKey
if errors.As(err, &nsk) {
return false, nil
}
// Some S3 errors return 404 as a generic error
if ctx.Err() != nil {
return false, ctx.Err()
}
// Treat other errors as "not found" to avoid blocking uploads
return false, nil
}
return true, nil
}
// s3Upload uploads icon data to S3 with the given key.
func s3Upload(ctx context.Context, key string, data []byte, contentType string) error {
_, err := s3Client.PutObject(ctx, &s3.PutObjectInput{
Bucket: aws.String(bucket),
Key: aws.String(key),
Body: bytes.NewReader(data),
ContentType: aws.String(contentType),
})
return err
}

View file

@ -0,0 +1,38 @@
package main
import (
"fmt"
"os"
"path/filepath"
)
var iconsDir string
func initStorage(dir string) error {
iconsDir = dir
return os.MkdirAll(dir, 0755)
}
// hashToPath converts a SHA-256 hex hash to a sharded file path.
// e.g., "abcdef1234..." → "icons/ab/cd/ef/abcdef1234..."
func hashToPath(hash string) string {
if len(hash) < 6 {
return filepath.Join(iconsDir, hash)
}
return filepath.Join(iconsDir, hash[:2], hash[2:4], hash[4:6], hash)
}
// iconExists checks if an icon is already stored on disk (for dedup).
func iconExists(hash string) bool {
_, err := os.Stat(hashToPath(hash))
return err == nil
}
// iconWrite saves icon data to disk in the sharded directory structure.
func iconWrite(hash string, data []byte) error {
path := hashToPath(hash)
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return fmt.Errorf("mkdir: %w", err)
}
return os.WriteFile(path, data, 0644)
}

View file

@ -1,7 +1,6 @@
package main package main
import ( import (
"context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"os" "os"
@ -24,7 +23,7 @@ type Bundle struct {
} }
// buildEntry creates a BundleEntry for a host, converting its icon if available. // buildEntry creates a BundleEntry for a host, converting its icon if available.
func buildEntry(ctx context.Context, host HostRow, iconsBucket string, logWriter *LogWriter, stats *Stats) BundleEntry { func buildEntry(host HostRow, iconsDir string, logWriter *LogWriter, stats *Stats) BundleEntry {
entry := BundleEntry{ entry := BundleEntry{
Host: host.Hostname, Host: host.Hostname,
Title: host.HtmlTitle, Title: host.HtmlTitle,
@ -36,7 +35,7 @@ func buildEntry(ctx context.Context, host HostRow, iconsBucket string, logWriter
return entry return entry
} }
encoded, w, h, convertErr := safeConvert(ctx, host.BestIconS3Key, iconsBucket) encoded, w, h, convertErr := safeConvert(host.BestIconS3Key, iconsDir)
if convertErr != "" { if convertErr != "" {
stats.ConvertErrors.Add(1) stats.ConvertErrors.Add(1)
logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr) logLine := fmt.Sprintf("CONVERT_ERROR: %s %s", host.Hostname, convertErr)
@ -54,7 +53,7 @@ func buildEntry(ctx context.Context, host HostRow, iconsBucket string, logWriter
} }
// safeConvert wraps convertIconToBase64PNG with panic recovery. // safeConvert wraps convertIconToBase64PNG with panic recovery.
func safeConvert(ctx context.Context, s3Key, iconsBucket string) (encoded string, w, h int, errMsg string) { func safeConvert(hash, iconsDir string) (encoded string, w, h int, errMsg string) {
defer func() { defer func() {
if r := recover(); r != nil { if r := recover(); r != nil {
errMsg = fmt.Sprintf("panic: %v", r) errMsg = fmt.Sprintf("panic: %v", r)
@ -62,7 +61,7 @@ func safeConvert(ctx context.Context, s3Key, iconsBucket string) (encoded string
}() }()
var err error var err error
encoded, w, h, err = convertIconToBase64PNG(ctx, s3Key, iconsBucket) encoded, w, h, err = convertIconToBase64PNG(hash, iconsDir)
if err != nil { if err != nil {
return "", 0, 0, err.Error() return "", 0, 0, err.Error()
} }
@ -79,7 +78,7 @@ func writeBundleLocal(outputDir string, index int, data []byte) error {
return os.WriteFile(path, data, 0644) return os.WriteFile(path, data, 0644)
} }
func writeBundleS3(ctx context.Context, bucket string, index int, data []byte) error { func writeBundleS3(bucket string, index int, data []byte) error {
key := fmt.Sprintf("tabs/%04d.json", index) key := fmt.Sprintf("tabs/%04d.json", index)
return s3UploadBundle(ctx, bucket, key, data) return s3UploadBundle(bucket, key, data)
} }

View file

@ -2,7 +2,6 @@ package main
import ( import (
"bytes" "bytes"
"context"
"encoding/base64" "encoding/base64"
"fmt" "fmt"
"image" "image"
@ -14,11 +13,11 @@ import (
_ "golang.org/x/image/webp" _ "golang.org/x/image/webp"
) )
// convertIconToBase64PNG downloads an icon from S3, converts it to PNG, and returns base64-encoded data. // convertIconToBase64PNG reads an icon from disk, converts it to PNG, and returns base64-encoded data.
func convertIconToBase64PNG(ctx context.Context, s3Key string, iconsBucket string) (encoded string, width, height int, err error) { func convertIconToBase64PNG(hash string, iconsDir string) (encoded string, width, height int, err error) {
data, err := s3Download(ctx, iconsBucket, s3Key) data, err := readIconFromDisk(iconsDir, hash)
if err != nil { if err != nil {
return "", 0, 0, fmt.Errorf("s3 download: %w", err) return "", 0, 0, fmt.Errorf("read icon: %w", err)
} }
// Check for SVG (can't decode to raster without external deps) // Check for SVG (can't decode to raster without external deps)

View file

@ -15,7 +15,7 @@ import (
type Config struct { type Config struct {
DBUrl string DBUrl string
IconsBucket string IconsDir string
SiteBucket string SiteBucket string
EntriesPerBundle int EntriesPerBundle int
Concurrency int Concurrency int
@ -39,7 +39,7 @@ type Stats struct {
func main() { func main() {
cfg := Config{} cfg := Config{}
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)") flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
flag.StringVar(&cfg.IconsBucket, "icons-bucket", "everytab-icons", "S3 bucket with downloaded icons") flag.StringVar(&cfg.IconsDir, "icons-dir", "icons", "Directory with downloaded icons")
flag.StringVar(&cfg.SiteBucket, "site-bucket", "everytab-site", "S3 bucket for the static site") flag.StringVar(&cfg.SiteBucket, "site-bucket", "everytab-site", "S3 bucket for the static site")
flag.IntVar(&cfg.EntriesPerBundle, "entries-per-bundle", 120, "Tabs per bundle JSON file") flag.IntVar(&cfg.EntriesPerBundle, "entries-per-bundle", 120, "Tabs per bundle JSON file")
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Write bundles to local disk instead of S3") flag.BoolVar(&cfg.DryRun, "dry-run", false, "Write bundles to local disk instead of S3")
@ -122,7 +122,7 @@ func main() {
go func(idx int, h HostRow) { go func(idx int, h HostRow) {
defer wg.Done() defer wg.Done()
defer func() { <-sem }() defer func() { <-sem }()
entries[idx] = buildEntry(ctx, h, cfg.IconsBucket, logWriter, stats) entries[idx] = buildEntry(h, cfg.IconsDir, logWriter, stats)
n := processed.Add(1) n := processed.Add(1)
if n%5000 == 0 { if n%5000 == 0 {
fmt.Printf(" processed %d/%d hosts\n", n, len(hosts)) fmt.Printf(" processed %d/%d hosts\n", n, len(hosts))
@ -134,7 +134,7 @@ func main() {
// Clean old bundles before writing new ones (avoids orphans if count changed) // Clean old bundles before writing new ones (avoids orphans if count changed)
if !cfg.DryRun { if !cfg.DryRun {
fmt.Println("\nCleaning old bundles from S3...") fmt.Println("\nCleaning old bundles from S3...")
if err := s3DeletePrefix(ctx, cfg.SiteBucket, "tabs/"); err != nil { if err := s3DeletePrefix(cfg.SiteBucket, "tabs/"); err != nil {
log.Fatalf("Failed to clean old bundles: %v", err) log.Fatalf("Failed to clean old bundles: %v", err)
} }
} }
@ -160,7 +160,7 @@ func main() {
if cfg.DryRun { if cfg.DryRun {
err = writeBundleLocal(cfg.OutputDir, bundleIndex, data) err = writeBundleLocal(cfg.OutputDir, bundleIndex, data)
} else { } else {
err = writeBundleS3(ctx, cfg.SiteBucket, bundleIndex, data) err = writeBundleS3(cfg.SiteBucket, bundleIndex, data)
} }
if err != nil { if err != nil {
log.Fatalf("Failed to write bundle %d: %v", bundleIndex, err) log.Fatalf("Failed to write bundle %d: %v", bundleIndex, err)

View file

@ -4,7 +4,8 @@ import (
"bytes" "bytes"
"context" "context"
"fmt" "fmt"
"io" "os"
"path/filepath"
"github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/config"
@ -23,22 +24,22 @@ func initS3() error {
return nil return nil
} }
// s3Download fetches an object from S3. // iconHashToPath converts a SHA-256 hash to a sharded file path.
func s3Download(ctx context.Context, bucket, key string) ([]byte, error) { func iconHashToPath(iconsDir, hash string) string {
resp, err := s3Client.GetObject(ctx, &s3.GetObjectInput{ if len(hash) < 6 {
Bucket: aws.String(bucket), return filepath.Join(iconsDir, hash)
Key: aws.String(key),
})
if err != nil {
return nil, err
} }
defer resp.Body.Close() return filepath.Join(iconsDir, hash[:2], hash[2:4], hash[4:6], hash)
return io.ReadAll(resp.Body) }
// readIconFromDisk reads an icon file from the local sharded directory.
func readIconFromDisk(iconsDir, hash string) ([]byte, error) {
return os.ReadFile(iconHashToPath(iconsDir, hash))
} }
// s3UploadBundle uploads a bundle JSON to S3. // s3UploadBundle uploads a bundle JSON to S3.
func s3UploadBundle(ctx context.Context, bucket, key string, data []byte) error { func s3UploadBundle(bucket, key string, data []byte) error {
_, err := s3Client.PutObject(ctx, &s3.PutObjectInput{ _, err := s3Client.PutObject(context.Background(), &s3.PutObjectInput{
Bucket: aws.String(bucket), Bucket: aws.String(bucket),
Key: aws.String(key), Key: aws.String(key),
Body: bytes.NewReader(data), Body: bytes.NewReader(data),
@ -48,7 +49,8 @@ func s3UploadBundle(ctx context.Context, bucket, key string, data []byte) error
} }
// s3DeletePrefix deletes all objects under a prefix in S3. // s3DeletePrefix deletes all objects under a prefix in S3.
func s3DeletePrefix(ctx context.Context, bucket, prefix string) error { func s3DeletePrefix(bucket, prefix string) error {
ctx := context.Background()
paginator := s3.NewListObjectsV2Paginator(s3Client, &s3.ListObjectsV2Input{ paginator := s3.NewListObjectsV2Paginator(s3Client, &s3.ListObjectsV2Input{
Bucket: aws.String(bucket), Bucket: aws.String(bucket),
Prefix: aws.String(prefix), Prefix: aws.String(prefix),