deploy frontend from the ec2 at the end of the pipeline

This commit is contained in:
Joe Lothan 2026-05-25 23:21:50 -04:00
parent 8d62832c1d
commit 8e3907505f
4 changed files with 60 additions and 10 deletions

View file

@ -52,6 +52,13 @@ if ! command -v go &>/dev/null; then
fi fi
go version go version
# --- esbuild ---
# Installs the esbuild CLI into /usr/local/bin with the Go toolchain at
# /usr/local/go, unless an `esbuild` binary is already on PATH.
# ESBUILD_VERSION selects the module version passed to `go install`; it
# defaults to "latest" (the value this step always used), and can be set to a
# release tag to make the install reproducible.
echo "--- Installing esbuild ---"
ESBUILD_VERSION="${ESBUILD_VERSION:-latest}"
if ! command -v esbuild &>/dev/null; then
  GOBIN=/usr/local/bin /usr/local/go/bin/go install \
    "github.com/evanw/esbuild/cmd/esbuild@${ESBUILD_VERSION}"
fi
# Print the version that ended up on PATH as a post-install check.
esbuild --version
# --- DuckDB --- # --- DuckDB ---
echo "--- Installing DuckDB ---" echo "--- Installing DuckDB ---"
DUCKDB_VERSION="1.5.2" DUCKDB_VERSION="1.5.2"

View file

@ -190,6 +190,11 @@ resource "aws_iam_role_policy" "s3_access" {
"arn:aws:s3:::commoncrawl", "arn:aws:s3:::commoncrawl",
"arn:aws:s3:::commoncrawl/*", "arn:aws:s3:::commoncrawl/*",
] ]
},
{
Effect = "Allow"
Action = ["cloudfront:CreateInvalidation", "cloudfront:ListDistributions"]
Resource = "*"
} }
] ]
}) })

View file

@ -60,16 +60,25 @@ echo "Total bundles: $TOTAL_BUNDLES"
echo "S3 bucket: $SITE_BUCKET" echo "S3 bucket: $SITE_BUCKET"
echo "" echo ""
# Inject TOTAL_BUNDLES into index.html (in a temp copy) # Build into temp directory
TMPDIR=$(mktemp -d) TMPDIR=$(mktemp -d)
cp "$FRONTEND_DIR/index.html" "$TMPDIR/index.html" cp "$FRONTEND_DIR/index.html" "$TMPDIR/index.html"
cp "$FRONTEND_DIR/site.js" "$TMPDIR/site.js" cp "$FRONTEND_DIR/site.js" "$TMPDIR/site.js"
cp "$FRONTEND_DIR/bot.html" "$TMPDIR/bot.html" cp "$FRONTEND_DIR/bot.html" "$TMPDIR/bot.html"
cp "$FRONTEND_DIR/about.html" "$TMPDIR/about.html" cp "$FRONTEND_DIR/about.html" "$TMPDIR/about.html"
# Inject TOTAL_BUNDLES
sed -i "s/const TOTAL_BUNDLES = [0-9]*/const TOTAL_BUNDLES = ${TOTAL_BUNDLES}/" "$TMPDIR/index.html" sed -i "s/const TOTAL_BUNDLES = [0-9]*/const TOTAL_BUNDLES = ${TOTAL_BUNDLES}/" "$TMPDIR/index.html"
echo "Injected TOTAL_BUNDLES = $TOTAL_BUNDLES" echo "Injected TOTAL_BUNDLES = $TOTAL_BUNDLES"
# Minify JS in place (strip comments + whitespace, keep variable names).
# Only whitespace and syntax minification are enabled: plain `--minify` would
# also turn on identifier minification, which renames local variables and so
# contradicts the "keep variable names" intent of this step.
# If esbuild is not installed the unminified file is deployed as-is.
if command -v esbuild &>/dev/null; then
  esbuild "$TMPDIR/site.js" --minify-whitespace --minify-syntax \
    --outfile="$TMPDIR/site.js" --allow-overwrite
  echo "Minified site.js"
else
  echo "Warning: esbuild not found, deploying unminified JS"
fi
# Upload # Upload
echo "Uploading to s3://$SITE_BUCKET/..." echo "Uploading to s3://$SITE_BUCKET/..."
aws s3 cp "$TMPDIR/index.html" "s3://$SITE_BUCKET/" --content-type "text/html" aws s3 cp "$TMPDIR/index.html" "s3://$SITE_BUCKET/" --content-type "text/html"
@ -80,6 +89,28 @@ echo "Uploaded 4 files"
rm -rf "$TMPDIR" rm -rf "$TMPDIR"
# Clean up stale bundles from previous runs.
#
# Every object under the tabs/ prefix whose key carries a number greater than
# or equal to TOTAL_BUNDLES is deleted from the site bucket.

#######################################
# Filters S3 keys down to the stale ones.
# Globals:   TOTAL_BUNDLES (read)
# Arguments: none; reads one key per line on stdin
# Outputs:   stale keys on stdout, one per line; warnings on stderr
# Returns:   0
#######################################
stale_bundle_keys() {
  local key digits
  while IFS= read -r key || [[ -n "$key" ]]; do
    if [[ "$key" =~ ^[^0-9]*([0-9]+)[^0-9]*$ ]]; then
      # Exactly one run of digits in the key: treat it as the bundle number.
      digits=${BASH_REMATCH[1]}
      # 10# forces base 10 so zero-padded numbers are not parsed as octal.
      if [ "$((10#$digits))" -ge "$TOTAL_BUNDLES" ]; then
        printf '%s\n' "$key"
      fi
    elif [[ "$key" =~ [0-9] ]]; then
      # Several digit runs: the bundle number is ambiguous, and this list
      # feeds a delete, so skip the key rather than guess.
      printf 'Warning: skipping %s (more than one number in key)\n' "$key" >&2
    fi
    # Keys without digits (including the "None" that text output prints for
    # an empty listing) are ignored.
  done
  return 0
}

echo "Cleaning stale bundles above $TOTAL_BUNDLES..."
# Text output separates keys with tabs; turn them into one key per line.
STALE=$(aws s3api list-objects-v2 --bucket "$SITE_BUCKET" --prefix "tabs/" \
    --query "Contents[].Key" --output text \
  | tr '\t' '\n' \
  | stale_bundle_keys)

if [ -n "$STALE" ]; then
  STALE_COUNT=$(wc -l <<<"$STALE")
  echo "Deleting $STALE_COUNT stale bundles..."
  while IFS= read -r key; do
    aws s3 rm "s3://$SITE_BUCKET/$key" --quiet
  done <<<"$STALE"
  echo "Deleted $STALE_COUNT stale bundles"
else
  echo "No stale bundles found"
fi
# Invalidate CloudFront # Invalidate CloudFront
if ! $SKIP_INVALIDATION; then if ! $SKIP_INVALIDATION; then
if [ -z "$DIST_ID" ]; then if [ -z "$DIST_ID" ]; then

View file

@ -11,10 +11,9 @@ Between stages, run the sanity checks to confirm data looks right before proceed
export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab' export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'
# Go binaries built on EC2 # Go binaries built on EC2
cd ~/everytab go build -o ~/warc_parse ./everytab/pipeline/02_warc_parse/
go build -o ~/warc_parse ./pipeline/02_warc_parse/ go build -o ~/icon_download ./everytab/pipeline/03_icon_download/
go build -o ~/icon_download ./pipeline/03_icon_download/ go build -o ~/bundle_gen ./everytab/pipeline/05_bundle_gen/
go build -o ~/bundle_gen ./pipeline/05_bundle_gen/
``` ```
## Stage 1: CC-Index Query ## Stage 1: CC-Index Query
@ -22,7 +21,7 @@ go build -o ~/bundle_gen ./pipeline/05_bundle_gen/
Populates the `hosts` table from Common Crawl's columnar index. Populates the `hosts` table from Common Crawl's columnar index.
```bash ```bash
./pipeline/01_cc_index/query.sh --db-url "$DATABASE_URL" --limit 100000 ./everytab/pipeline/01_cc_index/query.sh --db-url "$DATABASE_URL" --limit 100000
# Full run: --limit 0 # Full run: --limit 0
``` ```
@ -47,7 +46,7 @@ GOMEMLIMIT=12GiB ./icon_download --db "$DATABASE_URL" --log-file icon_download.l
Picks the best icon per host for display. Picks the best icon per host for display.
```bash ```bash
psql $DATABASE_URL -f pipeline/04_best_icon/select.sql psql $DATABASE_URL -f ./everytab/pipeline/04_best_icon/select.sql
``` ```
## Stage 5: Bundle Generation ## Stage 5: Bundle Generation
@ -62,12 +61,20 @@ Note the `TOTAL_BUNDLES` number from the summary — this gets baked into the fr
## Stage 6: Frontend Deploy ## Stage 6: Frontend Deploy
From your local machine: From EC2, after bundle gen completes:
```bash ```bash
./pipeline/06_frontend/deploy.sh --total-bundles <NUMBER> TOTAL_BUNDLES=$(jq -r '.bundles_created' stats/05_bundle_gen.json)
./everytab/pipeline/06_frontend/deploy.sh --total-bundles "$TOTAL_BUNDLES"
``` ```
The deploy script:
1. Injects TOTAL_BUNDLES into index.html
2. Minifies site.js (via esbuild, strips comments + whitespace)
3. Uploads frontend files to S3
4. Deletes stale bundles from previous runs (numbers ≥ TOTAL_BUNDLES)
5. Invalidates CloudFront cache
## Stage 7: Backup to Homelab ## Stage 7: Backup to Homelab
After the site is deployed and verified, backup data before tearing down scanning infra. After the site is deployed and verified, backup data before tearing down scanning infra.
@ -76,7 +83,7 @@ After the site is deployed and verified, backup data before tearing down scannin
| Data | Location on EC2 | Size estimate (30M) | Purpose | | Data | Location on EC2 | Size estimate (30M) | Purpose |
|------|----------------|---------------------|---------| |------|----------------|---------------------|---------|
| Database | RDS (pg_dump) | ~5-10GB compressed | Full hosts + icons metadata, titles, WARC coordinates | | Database | pg_dump from i3 instance | ~5-10GB compressed | Full hosts + icons metadata, titles, WARC coordinates |
| Icons | `~/icons/` directory | ~500GB-1TB | Complete favicon archive, content-addressed by SHA-256 | | Icons | `~/icons/` directory | ~500GB-1TB | Complete favicon archive, content-addressed by SHA-256 |
| Stats | `~/stats/*.json` | <1MB | Pipeline timing and counts per stage | | Stats | `~/stats/*.json` | <1MB | Pipeline timing and counts per stage |
| Logs | `~/*.log` | varies | Error logs for debugging | | Logs | `~/*.log` | varies | Error logs for debugging |