added warc parser
This commit is contained in:
parent
db81015e0b
commit
f45e4a6034
8 changed files with 954 additions and 0 deletions
126
pipeline/02_warc_parse/warc.go
Normal file
126
pipeline/02_warc_parse/warc.go
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/aws/aws-sdk-go-v2/aws"
|
||||
"github.com/aws/aws-sdk-go-v2/config"
|
||||
"github.com/aws/aws-sdk-go-v2/service/s3"
|
||||
"github.com/nlnwa/gowarc/v3"
|
||||
)
|
||||
|
||||
// ccBucket is the S3 bucket name passed to GetObject when fetching WARC data.
const ccBucket = "commoncrawl"

// s3Client is the package-wide S3 client. It is nil until initS3 assigns it.
var s3Client *s3.Client
|
||||
|
||||
func initS3() error {
|
||||
cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1"))
|
||||
if err != nil {
|
||||
return fmt.Errorf("load AWS config: %w", err)
|
||||
}
|
||||
s3Client = s3.NewFromConfig(cfg)
|
||||
return nil
|
||||
}
|
||||
|
||||
// WARCResult holds the extracted data from a WARC response record.
type WARCResult struct {
	// HTTPHeaders are the headers of the HTTP response stored in the record.
	HTTPHeaders http.Header
	// Body is the payload of the HTTP response stored in the record.
	Body []byte
}
|
||||
|
||||
// FetchAndParseWARC fetches a WARC record via S3 byte-range request and parses it.
|
||||
func FetchAndParseWARC(warcFilename string, offset, length int64) (*WARCResult, error) {
|
||||
rangeHeader := fmt.Sprintf("bytes=%d-%d", offset, offset+length-1)
|
||||
|
||||
resp, err := s3Client.GetObject(context.Background(), &s3.GetObjectInput{
|
||||
Bucket: aws.String(ccBucket),
|
||||
Key: aws.String(warcFilename),
|
||||
Range: aws.String(rangeHeader),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("s3 get: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Each WARC record is individually gzipped
|
||||
gzReader, err := gzip.NewReader(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gzip: %w", err)
|
||||
}
|
||||
defer gzReader.Close()
|
||||
|
||||
// Read all decompressed data into memory for gowarc
|
||||
decompressed, err := io.ReadAll(gzReader)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("decompress: %w", err)
|
||||
}
|
||||
|
||||
// Parse WARC record using gowarc
|
||||
warcReader, err := gowarc.NewWarcFileReaderFromStream(bytes.NewReader(decompressed), 0)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("warc reader: %w", err)
|
||||
}
|
||||
defer warcReader.Close()
|
||||
|
||||
rec, err := warcReader.Next()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read record: %w", err)
|
||||
}
|
||||
defer rec.Close()
|
||||
|
||||
if rec.WarcRecord.Type() != gowarc.Response {
|
||||
return nil, fmt.Errorf("unexpected record type: %s", rec.WarcRecord.Type())
|
||||
}
|
||||
|
||||
block := rec.WarcRecord.Block()
|
||||
httpBlock, ok := block.(gowarc.HttpResponseBlock)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("block is not HTTP response")
|
||||
}
|
||||
|
||||
// Get HTTP response headers
|
||||
var httpHeaders http.Header
|
||||
headers := httpBlock.HttpHeader()
|
||||
if headers != nil {
|
||||
httpHeaders = *headers
|
||||
} else {
|
||||
httpHeaders = make(http.Header)
|
||||
}
|
||||
|
||||
// Get HTTP body (the HTML)
|
||||
bodyReader, err := httpBlock.PayloadBytes()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("payload: %w", err)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(bodyReader)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read body: %w", err)
|
||||
}
|
||||
|
||||
return &WARCResult{
|
||||
HTTPHeaders: httpHeaders,
|
||||
Body: body,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// CheckIframeAllowed reports whether the given HTTP response headers leave the
// page embeddable in an iframe on an arbitrary third-party origin.
//
// It returns false when:
//   - any X-Frame-Options value (across repeated headers and comma-separated
//     lists) is "deny" or "sameorigin", compared case-insensitively; or
//   - any Content-Security-Policy policy carries a frame-ancestors directive
//     whose source list does not contain the bare wildcard "*".
//
// Otherwise it returns true. A nil or empty header set is treated as allowed.
// Content-Security-Policy-Report-Only is not consulted.
func CheckIframeAllowed(headers http.Header) bool {
	for _, value := range headers.Values("X-Frame-Options") {
		for _, token := range strings.Split(value, ",") {
			switch strings.ToLower(strings.TrimSpace(token)) {
			case "deny", "sameorigin":
				return false
			}
		}
	}

	// Several policies may be delivered, either as repeated headers or as a
	// comma-separated list in one header; every one of them must permit framing.
	for _, value := range headers.Values("Content-Security-Policy") {
		for _, policy := range strings.Split(value, ",") {
			if !frameAncestorsAllowsAll(policy) {
				return false
			}
		}
	}

	return true
}

// frameAncestorsAllowsAll reports whether a single serialized CSP policy
// permits framing from any origin: either it has no frame-ancestors directive,
// or that directive lists the bare "*" source.
func frameAncestorsAllowsAll(policy string) bool {
	for _, directive := range strings.Split(policy, ";") {
		fields := strings.Fields(directive)
		if len(fields) == 0 || !strings.EqualFold(fields[0], "frame-ancestors") {
			continue
		}
		// Only the first frame-ancestors directive in a policy is decisive.
		// The match is on whole tokens so that host wildcards such as
		// "*.example.com" are not mistaken for the allow-all "*".
		for _, source := range fields[1:] {
			if source == "*" {
				return true
			}
		}
		return false
	}
	return true
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue