added warc parser

This commit is contained in:
Joe Lothan 2026-05-17 20:25:59 -04:00
parent db81015e0b
commit f45e4a6034
8 changed files with 954 additions and 0 deletions

View file

@ -0,0 +1,126 @@
package main
import (
"bytes"
"compress/gzip"
"context"
"fmt"
"io"
"net/http"
"strings"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/s3"
"github.com/nlnwa/gowarc/v3"
)
// ccBucket is the name of the S3 bucket that FetchAndParseWARC reads from.
const ccBucket = "commoncrawl"
// s3Client is the package-level S3 client used by FetchAndParseWARC.
// It is assigned by initS3.
var s3Client *s3.Client
// initS3 loads the default AWS configuration pinned to the us-east-1 region
// and stores an S3 client built from it in the package-level s3Client.
//
// It returns a wrapped error when the configuration cannot be loaded; in that
// case s3Client is left unchanged.
func initS3() error {
	ctx := context.Background()
	regionOpt := config.WithRegion("us-east-1")

	awsCfg, loadErr := config.LoadDefaultConfig(ctx, regionOpt)
	if loadErr != nil {
		return fmt.Errorf("load AWS config: %w", loadErr)
	}

	client := s3.NewFromConfig(awsCfg)
	s3Client = client
	return nil
}
// WARCResult holds the extracted data from a WARC response record.
// It is the value returned by FetchAndParseWARC.
type WARCResult struct {
// HTTPHeaders are the HTTP response headers taken from the record's HTTP
// response block (an empty, non-nil header set when the block has none).
HTTPHeaders http.Header
// Body is the payload of the HTTP response block, read fully into memory.
Body []byte
}
// FetchAndParseWARC fetches a single WARC record from the ccBucket S3 bucket
// via a byte-range request and parses it.
//
// warcFilename is the object key of the WARC file. offset and length describe
// the byte span of the record inside that object; the span is requested as
// the inclusive range offset..offset+length-1 and is expected to contain one
// gzip-compressed WARC record.
//
// On success it returns the HTTP response headers and the HTTP payload of the
// record. It returns an error when offset/length do not describe a valid
// range, when the S3 client has not been initialised, when the download,
// decompression or WARC parsing fails, or when the record is not an HTTP
// response record.
func FetchAndParseWARC(warcFilename string, offset, length int64) (*WARCResult, error) {
	// Reject spans that cannot form a valid Range header (for example
	// "bytes=0--1"). HTTP servers are allowed to ignore a malformed Range
	// header and send the whole object, which would then be decompressed
	// entirely into memory below.
	if offset < 0 || length <= 0 {
		return nil, fmt.Errorf("invalid byte range: offset=%d length=%d", offset, length)
	}
	end := offset + length - 1
	if end < offset {
		// offset+length overflowed int64.
		return nil, fmt.Errorf("invalid byte range: offset=%d length=%d", offset, length)
	}

	// Fail with an error instead of a nil-pointer panic when initS3 has not
	// been called (or failed).
	if s3Client == nil {
		return nil, fmt.Errorf("s3 client not initialized")
	}

	rangeHeader := fmt.Sprintf("bytes=%d-%d", offset, end)
	resp, err := s3Client.GetObject(context.Background(), &s3.GetObjectInput{
		Bucket: aws.String(ccBucket),
		Key:    aws.String(warcFilename),
		Range:  aws.String(rangeHeader),
	})
	if err != nil {
		return nil, fmt.Errorf("s3 get: %w", err)
	}
	defer resp.Body.Close()

	// Each WARC record is individually gzipped.
	gzReader, err := gzip.NewReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("gzip: %w", err)
	}
	defer gzReader.Close()

	// Read all decompressed data into memory for gowarc.
	decompressed, err := io.ReadAll(gzReader)
	if err != nil {
		return nil, fmt.Errorf("decompress: %w", err)
	}

	// Parse the WARC record using gowarc.
	warcReader, err := gowarc.NewWarcFileReaderFromStream(bytes.NewReader(decompressed), 0)
	if err != nil {
		return nil, fmt.Errorf("warc reader: %w", err)
	}
	defer warcReader.Close()

	rec, err := warcReader.Next()
	if err != nil {
		return nil, fmt.Errorf("read record: %w", err)
	}
	defer rec.Close()

	// Only response records carry an HTTP response block.
	if rec.WarcRecord.Type() != gowarc.Response {
		return nil, fmt.Errorf("unexpected record type: %s", rec.WarcRecord.Type())
	}

	httpBlock, ok := rec.WarcRecord.Block().(gowarc.HttpResponseBlock)
	if !ok {
		return nil, fmt.Errorf("block is not HTTP response")
	}

	// Get the HTTP response headers; callers always receive a non-nil map.
	var httpHeaders http.Header
	if headers := httpBlock.HttpHeader(); headers != nil {
		httpHeaders = *headers
	} else {
		httpHeaders = make(http.Header)
	}

	// Get the HTTP body (the HTML).
	bodyReader, err := httpBlock.PayloadBytes()
	if err != nil {
		return nil, fmt.Errorf("payload: %w", err)
	}
	body, err := io.ReadAll(bodyReader)
	if err != nil {
		return nil, fmt.Errorf("read body: %w", err)
	}

	return &WARCResult{
		HTTPHeaders: httpHeaders,
		Body:        body,
	}, nil
}
// CheckIframeAllowed reports whether the given HTTP response headers permit
// the page to be embedded in an iframe by an arbitrary origin.
//
// It returns false when
//   - any X-Frame-Options value contains the token "deny" or "sameorigin"
//     (case-insensitive; comma-separated lists are accepted), or
//   - any Content-Security-Policy policy has a frame-ancestors directive whose
//     source list does not contain the bare wildcard "*".
//
// Otherwise it returns true. Every header line is inspected, not only the
// first one, and a nil or empty header set is treated as "allowed".
func CheckIframeAllowed(headers http.Header) bool {
	for _, value := range headers.Values("X-Frame-Options") {
		for _, token := range strings.Split(value, ",") {
			switch strings.ToLower(strings.TrimSpace(token)) {
			case "deny", "sameorigin":
				return false
			}
		}
	}

	for _, value := range headers.Values("Content-Security-Policy") {
		// One header line may carry several comma-separated policies. Each
		// policy is enforced independently, so every one must allow framing.
		for _, policy := range strings.Split(value, ",") {
			if !cspAllowsAnyFrameAncestor(policy) {
				return false
			}
		}
	}
	return true
}

// cspAllowsAnyFrameAncestor reports whether a single serialized CSP policy
// leaves framing unrestricted: either it has no frame-ancestors directive, or
// that directive lists the bare "*" source.
func cspAllowsAnyFrameAncestor(policy string) bool {
	for _, directive := range strings.Split(policy, ";") {
		fields := strings.Fields(directive)
		if len(fields) == 0 || !strings.EqualFold(fields[0], "frame-ancestors") {
			continue
		}
		// Compare whole source expressions so that host wildcards such as
		// "*.example.com" are not mistaken for the allow-all "*".
		for _, source := range fields[1:] {
			if source == "*" {
				return true
			}
		}
		// Only the first frame-ancestors directive of a policy is honoured;
		// it did not contain "*", so framing is restricted.
		return false
	}
	return true
}