package main

import (
	"bytes"
	"compress/gzip"
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/s3"
	"github.com/nlnwa/gowarc/v3"
)

// ccBucket is the S3 bucket that WARC files are fetched from.
const ccBucket = "commoncrawl"

// s3Client is the shared S3 client used by FetchAndParseWARC. It is nil until
// initS3 has completed successfully.
var s3Client *s3.Client

// initS3 loads the default AWS configuration for region us-east-1 and stores
// an S3 client built from it in s3Client.
func initS3() error {
	cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1"))
	if err != nil {
		return fmt.Errorf("load AWS config: %w", err)
	}
	s3Client = s3.NewFromConfig(cfg)
	return nil
}

// WARCResult holds the extracted data from a WARC response record.
type WARCResult struct {
	// HTTPHeaders are the headers of the archived HTTP response.
	HTTPHeaders http.Header
	// Body is the payload of the archived HTTP response.
	Body []byte
}

// FetchAndParseWARC fetches a WARC record via S3 byte-range request and parses it.
//
// warcFilename is the object key inside ccBucket; offset and length describe
// the byte span of the gzipped record within that object. The record must be
// a WARC "response" record whose block is an HTTP response; anything else is
// reported as an error.
//
// An error is returned (instead of panicking or sending a malformed request)
// when initS3 has not been called or when offset/length do not describe a
// non-empty range.
func FetchAndParseWARC(warcFilename string, offset, length int64) (*WARCResult, error) {
	if s3Client == nil {
		return nil, fmt.Errorf("s3 client not initialized: call initS3 first")
	}
	// A negative offset or a non-positive length would yield a Range header
	// whose end precedes its start.
	if offset < 0 || length <= 0 {
		return nil, fmt.Errorf("invalid byte range: offset=%d length=%d", offset, length)
	}

	// HTTP byte ranges are inclusive on both ends, hence the -1.
	rangeHeader := fmt.Sprintf("bytes=%d-%d", offset, offset+length-1)

	resp, err := s3Client.GetObject(context.Background(), &s3.GetObjectInput{
		Bucket: aws.String(ccBucket),
		Key:    aws.String(warcFilename),
		Range:  aws.String(rangeHeader),
	})
	if err != nil {
		return nil, fmt.Errorf("s3 get: %w", err)
	}
	defer resp.Body.Close()

	// Each WARC record is individually gzipped
	gzReader, err := gzip.NewReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("gzip: %w", err)
	}
	defer gzReader.Close()

	// Read all decompressed data into memory for gowarc
	decompressed, err := io.ReadAll(gzReader)
	if err != nil {
		return nil, fmt.Errorf("decompress: %w", err)
	}

	// Parse WARC record using gowarc
	warcReader, err := gowarc.NewWarcFileReaderFromStream(bytes.NewReader(decompressed), 0)
	if err != nil {
		return nil, fmt.Errorf("warc reader: %w", err)
	}
	defer warcReader.Close()

	rec, err := warcReader.Next()
	if err != nil {
		return nil, fmt.Errorf("read record: %w", err)
	}
	defer rec.Close()

	if rec.WarcRecord.Type() != gowarc.Response {
		return nil, fmt.Errorf("unexpected record type: %s", rec.WarcRecord.Type())
	}

	httpBlock, ok := rec.WarcRecord.Block().(gowarc.HttpResponseBlock)
	if !ok {
		return nil, fmt.Errorf("block is not HTTP response")
	}

	// Get HTTP response headers; fall back to an empty, non-nil header map so
	// callers can always use the result without a nil check.
	var httpHeaders http.Header
	if headers := httpBlock.HttpHeader(); headers != nil {
		httpHeaders = *headers
	} else {
		httpHeaders = make(http.Header)
	}

	// Get HTTP body (the HTML)
	bodyReader, err := httpBlock.PayloadBytes()
	if err != nil {
		return nil, fmt.Errorf("payload: %w", err)
	}
	body, err := io.ReadAll(bodyReader)
	if err != nil {
		return nil, fmt.Errorf("read body: %w", err)
	}

	return &WARCResult{
		HTTPHeaders: httpHeaders,
		Body:        body,
	}, nil
}

// CheckIframeAllowed checks HTTP response headers for X-Frame-Options and CSP
// frame-ancestors and reports whether the page may be framed by any origin.
//
// It returns false when any X-Frame-Options value is DENY or SAMEORIGIN
// (case-insensitive, surrounding whitespace ignored), or when any
// Content-Security-Policy policy carries a frame-ancestors directive that
// does not include the bare wildcard source "*". Every header value is
// inspected, not just the first one. A nil or empty header map yields true.
func CheckIframeAllowed(headers http.Header) bool {
	for _, value := range headers.Values("X-Frame-Options") {
		// Several values may have been folded into one comma-separated field.
		for _, token := range strings.Split(value, ",") {
			token = strings.TrimSpace(token)
			if strings.EqualFold(token, "deny") || strings.EqualFold(token, "sameorigin") {
				return false
			}
		}
	}

	for _, value := range headers.Values("Content-Security-Policy") {
		// One header field may carry several comma-separated policies; each
		// of them is checked, so a single restrictive policy blocks framing.
		for _, policy := range strings.Split(value, ",") {
			if !cspPolicyAllowsAnyAncestor(policy) {
				return false
			}
		}
	}
	return true
}

// cspPolicyAllowsAnyAncestor reports whether a single serialized CSP policy
// permits framing by any origin: either it has no frame-ancestors directive,
// or the first such directive lists the bare wildcard source "*".
func cspPolicyAllowsAnyAncestor(policy string) bool {
	for _, directive := range strings.Split(policy, ";") {
		fields := strings.Fields(directive)
		// Match on the directive name token, so the text "frame-ancestors"
		// inside another directive's value is not mistaken for the directive.
		if len(fields) == 0 || !strings.EqualFold(fields[0], "frame-ancestors") {
			continue
		}
		for _, source := range fields[1:] {
			// Only the exact token "*" is a wildcard; "*.example.com" is a
			// host pattern and still restricts who may frame the page.
			if source == "*" {
				return true
			}
		}
		// Only the first frame-ancestors directive of a policy is considered;
		// an empty source list allows no ancestors.
		return false
	}
	return true
}