switched from s3 to disk for saving icons
This commit is contained in:
parent
113a261dae
commit
5b3f6a6870
9 changed files with 84 additions and 112 deletions
|
|
@ -1,7 +1,6 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
|
|
@ -12,8 +11,8 @@ import (
|
|||
"time"
|
||||
)
|
||||
|
||||
// processIcon downloads, validates, and uploads a single icon.
|
||||
func processIcon(ctx context.Context, icon IconRow, cfg Config) DownloadResult {
|
||||
// processIcon downloads, validates, and stores a single icon.
|
||||
func processIcon(icon IconRow, cfg Config) DownloadResult {
|
||||
// Download
|
||||
data, contentType, err := downloadIcon(icon.URL, cfg.Timeout, cfg.MaxSize)
|
||||
if err != nil {
|
||||
|
|
@ -39,15 +38,14 @@ func processIcon(ctx context.Context, icon IconRow, cfg Config) DownloadResult {
|
|||
hash := sha256.Sum256(data)
|
||||
s3Key := hex.EncodeToString(hash[:])
|
||||
|
||||
// Upload to S3 (skip if already exists — dedup)
|
||||
// Write to disk (skip if already exists — dedup)
|
||||
dedup := false
|
||||
if !cfg.DryRun {
|
||||
exists, err := s3Exists(ctx, s3Key)
|
||||
if err == nil && exists {
|
||||
if iconExists(s3Key) {
|
||||
dedup = true
|
||||
} else {
|
||||
if err := s3Upload(ctx, s3Key, data, contentType); err != nil {
|
||||
return DownloadResult{Err: fmt.Sprintf("s3 upload: %v", err), ErrType: "other"}
|
||||
if err := iconWrite(s3Key, data); err != nil {
|
||||
return DownloadResult{Err: fmt.Sprintf("disk write: %v", err), ErrType: "other"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ import (
|
|||
|
||||
type Config struct {
|
||||
DBUrl string
|
||||
S3Bucket string
|
||||
IconsDir string
|
||||
BatchSize int
|
||||
Concurrency int
|
||||
Limit int
|
||||
|
|
@ -45,13 +45,13 @@ type Stats struct {
|
|||
func main() {
|
||||
cfg := Config{}
|
||||
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
|
||||
flag.StringVar(&cfg.S3Bucket, "s3-bucket", "everytab-icons", "S3 bucket for icons")
|
||||
flag.StringVar(&cfg.IconsDir, "icons-dir", "icons", "Directory to store downloaded icons")
|
||||
flag.IntVar(&cfg.BatchSize, "batch-size", 200, "Rows to claim per batch")
|
||||
flag.IntVar(&cfg.Concurrency, "concurrency", 200, "Number of concurrent goroutines")
|
||||
flag.IntVar(&cfg.Limit, "limit", 0, "Max icons to process (0 = all)")
|
||||
flag.DurationVar(&cfg.Timeout, "timeout", 10*time.Second, "HTTP request timeout")
|
||||
flag.Int64Var(&cfg.MaxSize, "max-size", 512*1024, "Max icon download size in bytes")
|
||||
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Download but don't upload to S3 or update DB")
|
||||
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Download but don't write to disk or update DB")
|
||||
flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file")
|
||||
flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file")
|
||||
flag.Parse()
|
||||
|
|
@ -64,9 +64,9 @@ func main() {
|
|||
|
||||
ctx := context.Background()
|
||||
|
||||
// Init S3
|
||||
if err := initS3(cfg.S3Bucket); err != nil {
|
||||
log.Fatalf("Failed to init S3: %v", err)
|
||||
// Init storage
|
||||
if err := initStorage(cfg.IconsDir); err != nil {
|
||||
log.Fatalf("Failed to init storage: %v", err)
|
||||
}
|
||||
|
||||
// Init DB pool
|
||||
|
|
@ -103,7 +103,7 @@ func main() {
|
|||
fmt.Printf("Concurrency: %d\n", cfg.Concurrency)
|
||||
fmt.Printf("Timeout: %s\n", cfg.Timeout)
|
||||
fmt.Printf("Max size: %dKB\n", cfg.MaxSize/1024)
|
||||
fmt.Printf("S3 bucket: %s\n", cfg.S3Bucket)
|
||||
fmt.Printf("Icons dir: %s\n", cfg.IconsDir)
|
||||
fmt.Printf("Dry run: %v\n\n", cfg.DryRun)
|
||||
|
||||
// Setup log file
|
||||
|
|
@ -168,7 +168,7 @@ func main() {
|
|||
}
|
||||
}()
|
||||
|
||||
result := processIcon(ctx, icon, cfg)
|
||||
result := processIcon(icon, cfg)
|
||||
|
||||
// Log line
|
||||
logLine := formatLogLine(icon, result)
|
||||
|
|
|
|||
|
|
@ -1,64 +0,0 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
"github.com/aws/aws-sdk-go-v2/aws"
|
||||
"github.com/aws/aws-sdk-go-v2/config"
|
||||
"github.com/aws/aws-sdk-go-v2/service/s3"
|
||||
"github.com/aws/aws-sdk-go-v2/service/s3/types"
|
||||
)
|
||||
|
||||
var (
|
||||
s3Client *s3.Client
|
||||
bucket string
|
||||
)
|
||||
|
||||
func initS3(bucketName string) error {
|
||||
cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s3Client = s3.NewFromConfig(cfg)
|
||||
bucket = bucketName
|
||||
return nil
|
||||
}
|
||||
|
||||
// s3Exists checks if an object already exists in S3 (for dedup).
|
||||
func s3Exists(ctx context.Context, key string) (bool, error) {
|
||||
_, err := s3Client.HeadObject(ctx, &s3.HeadObjectInput{
|
||||
Bucket: aws.String(bucket),
|
||||
Key: aws.String(key),
|
||||
})
|
||||
if err != nil {
|
||||
var notFound *types.NotFound
|
||||
if errors.As(err, ¬Found) {
|
||||
return false, nil
|
||||
}
|
||||
// NoSuchKey error type
|
||||
var nsk *types.NoSuchKey
|
||||
if errors.As(err, &nsk) {
|
||||
return false, nil
|
||||
}
|
||||
// Some S3 errors return 404 as a generic error
|
||||
if ctx.Err() != nil {
|
||||
return false, ctx.Err()
|
||||
}
|
||||
// Treat other errors as "not found" to avoid blocking uploads
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// s3Upload uploads icon data to S3 with the given key.
|
||||
func s3Upload(ctx context.Context, key string, data []byte, contentType string) error {
|
||||
_, err := s3Client.PutObject(ctx, &s3.PutObjectInput{
|
||||
Bucket: aws.String(bucket),
|
||||
Key: aws.String(key),
|
||||
Body: bytes.NewReader(data),
|
||||
ContentType: aws.String(contentType),
|
||||
})
|
||||
return err
|
||||
}
|
||||
38
pipeline/03_icon_download/storage.go
Normal file
38
pipeline/03_icon_download/storage.go
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
var iconsDir string
|
||||
|
||||
func initStorage(dir string) error {
|
||||
iconsDir = dir
|
||||
return os.MkdirAll(dir, 0755)
|
||||
}
|
||||
|
||||
// hashToPath converts a SHA-256 hex hash to a sharded file path.
|
||||
// e.g., "abcdef1234..." → "icons/ab/cd/ef/abcdef1234..."
|
||||
func hashToPath(hash string) string {
|
||||
if len(hash) < 6 {
|
||||
return filepath.Join(iconsDir, hash)
|
||||
}
|
||||
return filepath.Join(iconsDir, hash[:2], hash[2:4], hash[4:6], hash)
|
||||
}
|
||||
|
||||
// iconExists checks if an icon is already stored on disk (for dedup).
|
||||
func iconExists(hash string) bool {
|
||||
_, err := os.Stat(hashToPath(hash))
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// iconWrite saves icon data to disk in the sharded directory structure.
|
||||
func iconWrite(hash string, data []byte) error {
|
||||
path := hashToPath(hash)
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return fmt.Errorf("mkdir: %w", err)
|
||||
}
|
||||
return os.WriteFile(path, data, 0644)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue