added icon downloader
This commit is contained in:
parent
8b5693b5c6
commit
5a2e37ae06
10 changed files with 829 additions and 68 deletions
157
pipeline/03_icon_download/download.go
Normal file
157
pipeline/03_icon_download/download.go
Normal file
|
|
@ -0,0 +1,157 @@
|
|||
package main
|
||||
|
||||
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	"net/url"
	"strings"
	"time"
)
|
||||
|
||||
// processIcon downloads, validates, and uploads a single icon.
|
||||
func processIcon(ctx context.Context, icon IconRow, cfg Config) DownloadResult {
|
||||
// Download
|
||||
data, contentType, err := downloadIcon(icon.URL, cfg.Timeout, cfg.MaxSize)
|
||||
if err != nil {
|
||||
errType := classifyError(err)
|
||||
return DownloadResult{Err: err.Error(), ErrType: errType}
|
||||
}
|
||||
|
||||
// Validate it's an image
|
||||
detectedType := detectImageType(data)
|
||||
if detectedType == "" {
|
||||
return DownloadResult{Err: "not a valid image", ErrType: "invalid"}
|
||||
}
|
||||
|
||||
// Use detected type over HTTP Content-Type (more reliable)
|
||||
if contentType == "" || contentType == "application/octet-stream" {
|
||||
contentType = detectedType
|
||||
}
|
||||
|
||||
// Get dimensions
|
||||
width, height := getImageDimensions(data, detectedType)
|
||||
|
||||
// Compute SHA-256 for content-addressed storage
|
||||
hash := sha256.Sum256(data)
|
||||
s3Key := hex.EncodeToString(hash[:])
|
||||
|
||||
// Upload to S3 (skip if already exists — dedup)
|
||||
dedup := false
|
||||
if !cfg.DryRun {
|
||||
exists, err := s3Exists(ctx, s3Key)
|
||||
if err == nil && exists {
|
||||
dedup = true
|
||||
} else {
|
||||
if err := s3Upload(ctx, s3Key, data, contentType); err != nil {
|
||||
return DownloadResult{Err: fmt.Sprintf("s3 upload: %v", err), ErrType: "other"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return DownloadResult{
|
||||
S3Key: s3Key,
|
||||
ContentType: contentType,
|
||||
Width: width,
|
||||
Height: height,
|
||||
FileSize: len(data),
|
||||
Dedup: dedup,
|
||||
}
|
||||
}
|
||||
|
||||
// httpTransport is the single package-level transport handed to every client
// that downloadIcon builds, so idle connections are pooled across downloads
// instead of being re-dialled each time.
//
// Up to 1000 idle connections are kept overall and 2 per host, each for at
// most 30 seconds. Dialling and the TLS handshake are each capped at 5 seconds.
var httpTransport = func() *http.Transport {
	dialer := &net.Dialer{
		Timeout:   5 * time.Second,
		KeepAlive: 30 * time.Second,
	}

	// Keep-alives stay enabled: DisableKeepAlives is left at its zero value.
	return &http.Transport{
		DialContext:         dialer.DialContext,
		TLSHandshakeTimeout: 5 * time.Second,
		IdleConnTimeout:     30 * time.Second,
		MaxIdleConns:        1000,
		MaxIdleConnsPerHost: 2,
	}
}()
|
||||
|
||||
// downloadIcon fetches an icon URL with timeouts and size limits.
|
||||
func downloadIcon(url string, timeout time.Duration, maxSize int64) ([]byte, string, error) {
|
||||
client := &http.Client{
|
||||
Timeout: timeout,
|
||||
Transport: httpTransport,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 3 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("bad url: %w", err)
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; EveryTabBot/1.0; +https://everytab.site/bot)")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, "", fmt.Errorf("http %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// Read with size limit
|
||||
limited := io.LimitReader(resp.Body, maxSize+1)
|
||||
data, err := io.ReadAll(limited)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("read: %w", err)
|
||||
}
|
||||
if int64(len(data)) > maxSize {
|
||||
return nil, "", fmt.Errorf("exceeds %dKB", maxSize/1024)
|
||||
}
|
||||
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
// Strip charset suffix
|
||||
if idx := strings.Index(contentType, ";"); idx != -1 {
|
||||
contentType = strings.TrimSpace(contentType[:idx])
|
||||
}
|
||||
|
||||
return data, contentType, nil
|
||||
}
|
||||
|
||||
// classifyError maps a download error onto one of the stats buckets
// "dns", "timeout", "http", "too_large" or "other".
//
// err must be non-nil. Typed checks walk the whole wrap chain with errors.As,
// so a *net.DNSError or a timing-out net.Error is recognised even when it is
// buried inside a *url.Error, a *net.OpError or a fmt.Errorf("%w") wrapper.
//
// The substring fallbacks run on the message with the request URL removed:
// when the chain contains a *url.Error, only its inner error text is searched.
// Otherwise a hostname or path that happens to contain "dns", "timeout" or
// "exceeds" would decide the bucket.
func classifyError(err error) string {
	// Text used for the fallbacks, stripped of the `Get "<url>": ` prefix
	// that *url.Error adds.
	msg := err.Error()
	var urlErr *url.Error
	if errors.As(err, &urlErr) && urlErr.Err != nil {
		msg = urlErr.Err.Error()
	}

	// DNS errors
	var dnsErr *net.DNSError
	if errors.As(err, &dnsErr) {
		return "dns"
	}
	if strings.Contains(msg, "no such host") || strings.Contains(msg, "dns") {
		return "dns"
	}

	// Timeouts
	var netErr net.Error
	if errors.As(err, &netErr) && netErr.Timeout() {
		return "timeout"
	}
	if strings.Contains(msg, "timeout") || strings.Contains(msg, "deadline") {
		return "timeout"
	}

	// Non-200 responses, reported by downloadIcon as "http <status>".
	if strings.Contains(msg, "http ") {
		return "http"
	}

	// Bodies over the size limit, reported by downloadIcon as "exceeds <n>KB".
	if strings.Contains(msg, "exceeds") {
		return "too_large"
	}

	return "other"
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue