everytab/pipeline/03_icon_download/download.go
2026-05-17 22:09:03 -04:00

157 lines
3.8 KiB
Go

package main
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	"strings"
	"time"
)
// processIcon fetches one icon, checks that the payload is an image, and
// stores it under a content-addressed key.
//
// Failures are reported through the Err and ErrType fields of the returned
// DownloadResult; on success those fields are left at their zero values and
// the remaining fields describe the stored object.
func processIcon(ctx context.Context, icon IconRow, cfg Config) DownloadResult {
	body, mimeType, err := downloadIcon(icon.URL, cfg.Timeout, cfg.MaxSize)
	if err != nil {
		kind := classifyError(err)
		return DownloadResult{Err: err.Error(), ErrType: kind}
	}

	// Reject anything whose bytes are not recognized as an image.
	sniffed := detectImageType(body)
	if sniffed == "" {
		return DownloadResult{Err: "not a valid image", ErrType: "invalid"}
	}

	// The sniffed type is only a fallback: it replaces the HTTP Content-Type
	// when the server sent none or sent the generic octet-stream type.
	switch mimeType {
	case "", "application/octet-stream":
		mimeType = sniffed
	}

	w, h := getImageDimensions(body, sniffed)

	// The hex-encoded SHA-256 of the bytes is the storage key, so identical
	// icons map to the same object.
	digest := sha256.Sum256(body)
	key := hex.EncodeToString(digest[:])

	result := DownloadResult{
		S3Key:       key,
		ContentType: mimeType,
		Width:       w,
		Height:      h,
		FileSize:    len(body),
		Dedup:       false,
	}

	// A dry run never touches S3.
	if cfg.DryRun {
		return result
	}

	// Skip the upload only when the existence check succeeded and found the
	// key; if the check itself failed, fall through and upload anyway.
	if found, existsErr := s3Exists(ctx, key); existsErr == nil && found {
		result.Dedup = true
		return result
	}

	if uploadErr := s3Upload(ctx, key, body, mimeType); uploadErr != nil {
		return DownloadResult{Err: fmt.Sprintf("s3 upload: %v", uploadErr), ErrType: "other"}
	}
	return result
}
// httpTransport is the one transport used by every download, so goroutines
// share a single pool of idle connections instead of each building their own.
var httpTransport = &http.Transport{
	// Connection establishment: dialing and the TLS handshake are each
	// capped at five seconds.
	DialContext: (&net.Dialer{
		KeepAlive: 30 * time.Second,
		Timeout:   5 * time.Second,
	}).DialContext,
	TLSHandshakeTimeout: 5 * time.Second,

	// Idle pool: keep-alives stay enabled, with up to 1000 idle connections
	// overall, at most 2 per host, each dropped after 30 seconds of idleness.
	DisableKeepAlives:   false,
	IdleConnTimeout:     30 * time.Second,
	MaxIdleConnsPerHost: 2,
	MaxIdleConns:        1000,
}
// downloadIcon fetches url with an HTTP GET and returns the response body
// together with the media type taken from the Content-Type header.
//
// timeout is applied as the http.Client timeout for the whole request. The
// redirect check fails the request once three requests have already been
// made, and any status other than 200 is reported as an "http <code>" error.
// A body larger than maxSize bytes is rejected with an "exceeds" error; when
// the server declares a Content-Length above the limit the body is not read
// at all.
//
// The returned content type has its parameters (such as charset) removed,
// surrounding whitespace trimmed, and is lower-cased so callers can compare
// it against lowercase literals. It is "" when the header is absent.
func downloadIcon(url string, timeout time.Duration, maxSize int64) ([]byte, string, error) {
	client := &http.Client{
		Timeout:   timeout,
		Transport: httpTransport,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 3 {
				return fmt.Errorf("too many redirects")
			}
			return nil
		},
	}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, "", fmt.Errorf("bad url: %w", err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; EveryTabBot/1.0; +https://everytab.site/bot)")

	resp, err := client.Do(req)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, "", fmt.Errorf("http %d", resp.StatusCode)
	}

	// ContentLength is -1 when the length is unknown, so this only fires
	// when the server announced a size that is already over the limit.
	if resp.ContentLength > maxSize {
		return nil, "", fmt.Errorf("exceeds %dKB", maxSize/1024)
	}

	// Read one byte past the limit so an oversized body can be told apart
	// from one that is exactly maxSize bytes long.
	limited := io.LimitReader(resp.Body, maxSize+1)
	data, err := io.ReadAll(limited)
	if err != nil {
		return nil, "", fmt.Errorf("read: %w", err)
	}
	if int64(len(data)) > maxSize {
		return nil, "", fmt.Errorf("exceeds %dKB", maxSize/1024)
	}

	// Keep only the media type: drop parameters, then normalize whitespace
	// and case (media types are case-insensitive).
	contentType := resp.Header.Get("Content-Type")
	if idx := strings.Index(contentType, ";"); idx != -1 {
		contentType = contentType[:idx]
	}
	contentType = strings.ToLower(strings.TrimSpace(contentType))

	return data, contentType, nil
}
// classifyError maps a download error to a coarse category for stats.
//
// It returns one of "dns", "timeout", "http", "too_large" or "other".
//
// Typed checks run first and use errors.As, so they also match an error that
// has been wrapped by another one. The substring checks that follow are a
// fallback for errors that carry no useful type; they inspect the complete
// message, so they are only consulted once the typed checks have failed.
// A nil error is reported as "other".
func classifyError(err error) string {
	if err == nil {
		return "other"
	}

	// DNS failures, wherever they sit in the error chain.
	var dnsErr *net.DNSError
	if errors.As(err, &dnsErr) {
		return "dns"
	}

	// Anything in the chain that reports itself as a timeout.
	var netErr net.Error
	if errors.As(err, &netErr) && netErr.Timeout() {
		return "timeout"
	}

	// Untyped errors: fall back to matching on the message text.
	msg := err.Error()
	switch {
	case strings.Contains(msg, "no such host"), strings.Contains(msg, "dns"):
		return "dns"
	case strings.Contains(msg, "timeout"), strings.Contains(msg, "deadline"):
		return "timeout"
	case strings.Contains(msg, "http "):
		return "http"
	case strings.Contains(msg, "exceeds"):
		return "too_large"
	}
	return "other"
}