diff --git a/infra/ec2-userdata.sh b/infra/ec2-userdata.sh index 078c29b..594ccfa 100755 --- a/infra/ec2-userdata.sh +++ b/infra/ec2-userdata.sh @@ -52,6 +52,13 @@ if ! command -v go &>/dev/null; then fi go version +# --- esbuild --- +echo "--- Installing esbuild ---" +if ! command -v esbuild &>/dev/null; then + GOBIN=/usr/local/bin /usr/local/go/bin/go install github.com/evanw/esbuild/cmd/esbuild@latest +fi +esbuild --version + # --- DuckDB --- echo "--- Installing DuckDB ---" DUCKDB_VERSION="1.5.2" diff --git a/infra/main.tf b/infra/main.tf index 47ea942..ed2e592 100644 --- a/infra/main.tf +++ b/infra/main.tf @@ -190,6 +190,11 @@ resource "aws_iam_role_policy" "s3_access" { "arn:aws:s3:::commoncrawl", "arn:aws:s3:::commoncrawl/*", ] + }, + { + Effect = "Allow" + Action = ["cloudfront:CreateInvalidation", "cloudfront:ListDistributions"] + Resource = "*" } ] }) diff --git a/pipeline/06_frontend/deploy.sh b/pipeline/06_frontend/deploy.sh index ba4641c..0f78ede 100755 --- a/pipeline/06_frontend/deploy.sh +++ b/pipeline/06_frontend/deploy.sh @@ -60,16 +60,25 @@ echo "Total bundles: $TOTAL_BUNDLES" echo "S3 bucket: $SITE_BUCKET" echo "" -# Inject TOTAL_BUNDLES into index.html (in a temp copy) +# Build into temp directory TMPDIR=$(mktemp -d) cp "$FRONTEND_DIR/index.html" "$TMPDIR/index.html" cp "$FRONTEND_DIR/site.js" "$TMPDIR/site.js" cp "$FRONTEND_DIR/bot.html" "$TMPDIR/bot.html" cp "$FRONTEND_DIR/about.html" "$TMPDIR/about.html" +# Inject TOTAL_BUNDLES sed -i "s/const TOTAL_BUNDLES = [0-9]*/const TOTAL_BUNDLES = ${TOTAL_BUNDLES}/" "$TMPDIR/index.html" echo "Injected TOTAL_BUNDLES = $TOTAL_BUNDLES" +# Minify JS (strip comments + whitespace; local identifiers are shortened, top-level names are kept because no --bundle/--format is set) +if command -v esbuild &>/dev/null; then + esbuild "$TMPDIR/site.js" --minify --outfile="$TMPDIR/site.js" --allow-overwrite + echo "Minified site.js" +else + echo "Warning: esbuild not found, deploying unminified JS" +fi + # Upload echo "Uploading to s3://$SITE_BUCKET/..." 
aws s3 cp "$TMPDIR/index.html" "s3://$SITE_BUCKET/" --content-type "text/html" @@ -80,6 +89,28 @@ echo "Uploaded 4 files" rm -rf "$TMPDIR" +# Clean up stale bundles from previous runs +echo "Cleaning stale bundles above $TOTAL_BUNDLES..." +STALE=$(aws s3api list-objects-v2 --bucket "$SITE_BUCKET" --prefix "tabs/" --query "Contents[].Key" --output text \ + | tr '\t' '\n' \ + | while read -r key; do + num=$(echo "$key" | grep -oP '\d+' || true) + if [ -n "$num" ] && [ "$((10#$num))" -ge "$TOTAL_BUNDLES" ]; then + echo "$key" + fi + done) + +if [ -n "$STALE" ]; then + STALE_COUNT=$(echo "$STALE" | wc -l) + echo "Deleting $STALE_COUNT stale bundles..." + echo "$STALE" | while read -r key; do + aws s3 rm "s3://$SITE_BUCKET/$key" --quiet + done + echo "Deleted $STALE_COUNT stale bundles" +else + echo "No stale bundles found" +fi + # Invalidate CloudFront if ! $SKIP_INVALIDATION; then if [ -z "$DIST_ID" ]; then diff --git a/pipeline/README.md b/pipeline/README.md index fa0e43a..60e7647 100644 --- a/pipeline/README.md +++ b/pipeline/README.md @@ -11,10 +11,9 @@ Between stages, run the sanity checks to confirm data looks right before proceed export DATABASE_URL='postgres://everytab@:5432/everytab' # Go binaries built on EC2 -cd ~/everytab -go build -o ~/warc_parse ./pipeline/02_warc_parse/ -go build -o ~/icon_download ./pipeline/03_icon_download/ -go build -o ~/bundle_gen ./pipeline/05_bundle_gen/ +go build -o ~/warc_parse ./everytab/pipeline/02_warc_parse/ +go build -o ~/icon_download ./everytab/pipeline/03_icon_download/ +go build -o ~/bundle_gen ./everytab/pipeline/05_bundle_gen/ ``` ## Stage 1: CC-Index Query @@ -22,7 +21,7 @@ go build -o ~/bundle_gen ./pipeline/05_bundle_gen/ Populates the `hosts` table from Common Crawl's columnar index. 
```bash -./pipeline/01_cc_index/query.sh --db-url "$DATABASE_URL" --limit 100000 +./everytab/pipeline/01_cc_index/query.sh --db-url "$DATABASE_URL" --limit 100000 # Full run: --limit 0 ``` @@ -47,7 +46,7 @@ GOMEMLIMIT=12GiB ./icon_download --db "$DATABASE_URL" --log-file icon_download.l Picks the best icon per host for display. ```bash -psql $DATABASE_URL -f pipeline/04_best_icon/select.sql +psql "$DATABASE_URL" -f ./everytab/pipeline/04_best_icon/select.sql ``` ## Stage 5: Bundle Generation @@ -62,12 +61,20 @@ Note the `TOTAL_BUNDLES` number from the summary — this gets baked into the fr ## Stage 6: Frontend Deploy -From your local machine: +From EC2, after bundle gen completes: ```bash -./pipeline/06_frontend/deploy.sh --total-bundles +TOTAL_BUNDLES=$(jq -r '.bundles_created' stats/05_bundle_gen.json) +./everytab/pipeline/06_frontend/deploy.sh --total-bundles "$TOTAL_BUNDLES" ``` +The deploy script: +1. Injects TOTAL_BUNDLES into index.html +2. Minifies site.js with esbuild if it is installed (strips comments + whitespace, shortens local identifiers); otherwise deploys it unminified +3. Uploads frontend files to S3 +4. Deletes stale bundles from previous runs (numbers ≥ TOTAL_BUNDLES) +5. Invalidates CloudFront cache + ## Stage 7: Backup to Homelab After the site is deployed and verified, backup data before tearing down scanning infra. @@ -76,7 +83,7 @@ After the site is deployed and verified, backup data before tearing down scannin | Data | Location on EC2 | Size estimate (30M) | Purpose | |------|----------------|---------------------|---------| -| Database | RDS (pg_dump) | ~5-10GB compressed | Full hosts + icons metadata, titles, WARC coordinates | +| Database | pg_dump from i3 instance | ~5-10GB compressed | Full hosts + icons metadata, titles, WARC coordinates | | Icons | `~/icons/` directory | ~500GB-1TB | Complete favicon archive, content-addressed by SHA-256 | | Stats | `~/stats/*.json` | <1MB | Pipeline timing and counts per stage | | Logs | `~/*.log` | varies | Error logs for debugging |