134 lines
3.2 KiB
Go
134 lines
3.2 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"compress/gzip"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
|
|
"github.com/aws/aws-sdk-go-v2/aws"
|
|
"github.com/aws/aws-sdk-go-v2/aws/retry"
|
|
"github.com/aws/aws-sdk-go-v2/config"
|
|
"github.com/aws/aws-sdk-go-v2/service/s3"
|
|
"github.com/nlnwa/gowarc/v3"
|
|
)
|
|
|
|
const ccBucket = "commoncrawl"
|
|
|
|
var s3Client *s3.Client
|
|
|
|
func initS3() error {
|
|
cfg, err := config.LoadDefaultConfig(context.Background(),
|
|
config.WithRegion("us-east-1"),
|
|
config.WithRetryer(func() aws.Retryer {
|
|
return retry.AddWithMaxAttempts(retry.NewStandard(), 6)
|
|
}),
|
|
)
|
|
if err != nil {
|
|
return fmt.Errorf("load AWS config: %w", err)
|
|
}
|
|
s3Client = s3.NewFromConfig(cfg)
|
|
return nil
|
|
}
|
|
|
|
// WARCResult is the outcome of parsing a single WARC response record: the
// HTTP headers of the captured response and its payload bytes.
type WARCResult struct {
	// HTTPHeaders are the HTTP response headers taken from the record's
	// HTTP block; FetchAndParseWARC sets an empty, non-nil map when the
	// block exposes no headers.
	HTTPHeaders http.Header
	// Body is the HTTP payload read from the record's block.
	Body []byte
}
|
|
|
|
// FetchAndParseWARC fetches a WARC record via S3 byte-range request and parses it.
|
|
func FetchAndParseWARC(warcFilename string, offset, length int64) (*WARCResult, error) {
|
|
rangeHeader := fmt.Sprintf("bytes=%d-%d", offset, offset+length-1)
|
|
|
|
resp, err := s3Client.GetObject(context.Background(), &s3.GetObjectInput{
|
|
Bucket: aws.String(ccBucket),
|
|
Key: aws.String(warcFilename),
|
|
Range: aws.String(rangeHeader),
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("s3 get: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Each WARC record is individually gzipped
|
|
gzReader, err := gzip.NewReader(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("gzip: %w", err)
|
|
}
|
|
defer gzReader.Close()
|
|
|
|
// Read all decompressed data into memory for gowarc
|
|
decompressed, err := io.ReadAll(gzReader)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("decompress: %w", err)
|
|
}
|
|
|
|
// Parse WARC record using gowarc
|
|
warcReader, err := gowarc.NewWarcFileReaderFromStream(bytes.NewReader(decompressed), 0)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("warc reader: %w", err)
|
|
}
|
|
defer warcReader.Close()
|
|
|
|
rec, err := warcReader.Next()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read record: %w", err)
|
|
}
|
|
defer rec.Close()
|
|
|
|
if rec.WarcRecord.Type() != gowarc.Response {
|
|
return nil, fmt.Errorf("unexpected record type: %s", rec.WarcRecord.Type())
|
|
}
|
|
|
|
block := rec.WarcRecord.Block()
|
|
httpBlock, ok := block.(gowarc.HttpResponseBlock)
|
|
if !ok {
|
|
return nil, fmt.Errorf("block is not HTTP response")
|
|
}
|
|
|
|
// Get HTTP response headers
|
|
var httpHeaders http.Header
|
|
headers := httpBlock.HttpHeader()
|
|
if headers != nil {
|
|
httpHeaders = *headers
|
|
} else {
|
|
httpHeaders = make(http.Header)
|
|
}
|
|
|
|
// Get HTTP body (the HTML)
|
|
bodyReader, err := httpBlock.PayloadBytes()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("payload: %w", err)
|
|
}
|
|
|
|
body, err := io.ReadAll(bodyReader)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read body: %w", err)
|
|
}
|
|
|
|
return &WARCResult{
|
|
HTTPHeaders: httpHeaders,
|
|
Body: body,
|
|
}, nil
|
|
}
|
|
|
|
// CheckIframeAllowed reports whether the given HTTP response headers permit
// the page to be embedded in an iframe on an arbitrary third-party origin.
//
// It returns false when:
//   - any X-Frame-Options value (across repeated headers and comma-separated
//     lists, compared case-insensitively after trimming) is "deny" or
//     "sameorigin"; or
//   - any Content-Security-Policy policy contains a frame-ancestors directive
//     whose source list does not include the bare wildcard "*".
//
// Otherwise it returns true. A nil or empty header set is treated as allowed.
func CheckIframeAllowed(headers http.Header) bool {
	for _, value := range headers.Values("X-Frame-Options") {
		for _, token := range strings.Split(value, ",") {
			switch strings.ToLower(strings.TrimSpace(token)) {
			case "deny", "sameorigin":
				return false
			}
		}
	}

	// Every policy must allow embedding: repeated headers and
	// comma-separated policies within one header are all enforced.
	for _, value := range headers.Values("Content-Security-Policy") {
		for _, policy := range strings.Split(value, ",") {
			if !cspAllowsAnyFrameAncestor(policy) {
				return false
			}
		}
	}

	return true
}

// cspAllowsAnyFrameAncestor reports whether a single serialized CSP policy
// either has no frame-ancestors directive or lists the bare "*" source in it.
func cspAllowsAnyFrameAncestor(policy string) bool {
	for _, directive := range strings.Split(policy, ";") {
		fields := strings.Fields(strings.ToLower(directive))
		if len(fields) == 0 || fields[0] != "frame-ancestors" {
			continue
		}
		// Only the first frame-ancestors directive in a policy is honored.
		// Match "*" as a whole token so that host wildcards such as
		// "*.example.com" are not mistaken for allow-all.
		for _, source := range fields[1:] {
			if source == "*" {
				return true
			}
		}
		return false
	}
	return true
}
|