deploy frontend from the ec2 at the end of the pipeline
This commit is contained in:
parent
8d62832c1d
commit
8e3907505f
4 changed files with 60 additions and 10 deletions
|
|
@ -52,6 +52,13 @@ if ! command -v go &>/dev/null; then
|
||||||
fi
|
fi
|
||||||
go version
|
go version
|
||||||
|
|
||||||
|
# --- esbuild ---
|
||||||
|
echo "--- Installing esbuild ---"
|
||||||
|
if ! command -v esbuild &>/dev/null; then
|
||||||
|
GOBIN=/usr/local/bin /usr/local/go/bin/go install github.com/evanw/esbuild/cmd/esbuild@latest
|
||||||
|
fi
|
||||||
|
esbuild --version
|
||||||
|
|
||||||
# --- DuckDB ---
|
# --- DuckDB ---
|
||||||
echo "--- Installing DuckDB ---"
|
echo "--- Installing DuckDB ---"
|
||||||
DUCKDB_VERSION="1.5.2"
|
DUCKDB_VERSION="1.5.2"
|
||||||
|
|
|
||||||
|
|
@ -190,6 +190,11 @@ resource "aws_iam_role_policy" "s3_access" {
|
||||||
"arn:aws:s3:::commoncrawl",
|
"arn:aws:s3:::commoncrawl",
|
||||||
"arn:aws:s3:::commoncrawl/*",
|
"arn:aws:s3:::commoncrawl/*",
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Effect = "Allow"
|
||||||
|
Action = ["cloudfront:CreateInvalidation", "cloudfront:ListDistributions"]
|
||||||
|
Resource = "*"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@ -60,16 +60,25 @@ echo "Total bundles: $TOTAL_BUNDLES"
|
||||||
echo "S3 bucket: $SITE_BUCKET"
|
echo "S3 bucket: $SITE_BUCKET"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
# Inject TOTAL_BUNDLES into index.html (in a temp copy)
|
# Build into temp directory
|
||||||
TMPDIR=$(mktemp -d)
|
TMPDIR=$(mktemp -d)
|
||||||
cp "$FRONTEND_DIR/index.html" "$TMPDIR/index.html"
|
cp "$FRONTEND_DIR/index.html" "$TMPDIR/index.html"
|
||||||
cp "$FRONTEND_DIR/site.js" "$TMPDIR/site.js"
|
cp "$FRONTEND_DIR/site.js" "$TMPDIR/site.js"
|
||||||
cp "$FRONTEND_DIR/bot.html" "$TMPDIR/bot.html"
|
cp "$FRONTEND_DIR/bot.html" "$TMPDIR/bot.html"
|
||||||
cp "$FRONTEND_DIR/about.html" "$TMPDIR/about.html"
|
cp "$FRONTEND_DIR/about.html" "$TMPDIR/about.html"
|
||||||
|
|
||||||
|
# Inject TOTAL_BUNDLES
|
||||||
sed -i "s/const TOTAL_BUNDLES = [0-9]*/const TOTAL_BUNDLES = ${TOTAL_BUNDLES}/" "$TMPDIR/index.html"
|
sed -i "s/const TOTAL_BUNDLES = [0-9]*/const TOTAL_BUNDLES = ${TOTAL_BUNDLES}/" "$TMPDIR/index.html"
|
||||||
echo "Injected TOTAL_BUNDLES = $TOTAL_BUNDLES"
|
echo "Injected TOTAL_BUNDLES = $TOTAL_BUNDLES"
|
||||||
|
|
||||||
|
# Minify JS (strip comments + whitespace; esbuild --minify also shortens local identifiers, but top-level names are kept because no output format is set)
|
||||||
|
if command -v esbuild &>/dev/null; then
|
||||||
|
esbuild "$TMPDIR/site.js" --minify --outfile="$TMPDIR/site.js" --allow-overwrite
|
||||||
|
echo "Minified site.js"
|
||||||
|
else
|
||||||
|
echo "Warning: esbuild not found, deploying unminified JS"
|
||||||
|
fi
|
||||||
|
|
||||||
# Upload
|
# Upload
|
||||||
echo "Uploading to s3://$SITE_BUCKET/..."
|
echo "Uploading to s3://$SITE_BUCKET/..."
|
||||||
aws s3 cp "$TMPDIR/index.html" "s3://$SITE_BUCKET/" --content-type "text/html"
|
aws s3 cp "$TMPDIR/index.html" "s3://$SITE_BUCKET/" --content-type "text/html"
|
||||||
|
|
@ -80,6 +89,28 @@ echo "Uploaded 4 files"
|
||||||
|
|
||||||
rm -rf "$TMPDIR"
|
rm -rf "$TMPDIR"
|
||||||
|
|
||||||
|
# Clean up stale bundles left by previous runs (keys under tabs/ numbered >= TOTAL_BUNDLES)
|
||||||
|
echo "Cleaning stale bundles above $TOTAL_BUNDLES..."
|
||||||
|
STALE=$(aws s3api list-objects-v2 --bucket "$SITE_BUCKET" --prefix "tabs/" --query "Contents[].Key" --output text \
|
||||||
|
| tr '\t' '\n' \
|
||||||
|
| while read -r key; do
|
||||||
|
num=$(echo "$key" | grep -oP '\d+' || true)
|
||||||
|
if [ -n "$num" ] && [ "$((10#$num))" -ge "$TOTAL_BUNDLES" ]; then
|
||||||
|
echo "$key"
|
||||||
|
fi
|
||||||
|
done)
|
||||||
|
|
||||||
|
if [ -n "$STALE" ]; then
|
||||||
|
STALE_COUNT=$(echo "$STALE" | wc -l)
|
||||||
|
echo "Deleting $STALE_COUNT stale bundles..."
|
||||||
|
echo "$STALE" | while read -r key; do
|
||||||
|
aws s3 rm "s3://$SITE_BUCKET/$key" --quiet
|
||||||
|
done
|
||||||
|
echo "Deleted $STALE_COUNT stale bundles"
|
||||||
|
else
|
||||||
|
echo "No stale bundles found"
|
||||||
|
fi
|
||||||
|
|
||||||
# Invalidate CloudFront
|
# Invalidate CloudFront
|
||||||
if ! $SKIP_INVALIDATION; then
|
if ! $SKIP_INVALIDATION; then
|
||||||
if [ -z "$DIST_ID" ]; then
|
if [ -z "$DIST_ID" ]; then
|
||||||
|
|
|
||||||
|
|
@ -11,10 +11,9 @@ Between stages, run the sanity checks to confirm data looks right before proceed
|
||||||
export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'
|
export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'
|
||||||
|
|
||||||
# Go binaries built on EC2
|
# Go binaries built on EC2
|
||||||
cd ~/everytab
|
go build -o ~/warc_parse ./everytab/pipeline/02_warc_parse/
|
||||||
go build -o ~/warc_parse ./pipeline/02_warc_parse/
|
go build -o ~/icon_download ./everytab/pipeline/03_icon_download/
|
||||||
go build -o ~/icon_download ./pipeline/03_icon_download/
|
go build -o ~/bundle_gen ./everytab/pipeline/05_bundle_gen/
|
||||||
go build -o ~/bundle_gen ./pipeline/05_bundle_gen/
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Stage 1: CC-Index Query
|
## Stage 1: CC-Index Query
|
||||||
|
|
@ -22,7 +21,7 @@ go build -o ~/bundle_gen ./pipeline/05_bundle_gen/
|
||||||
Populates the `hosts` table from Common Crawl's columnar index.
|
Populates the `hosts` table from Common Crawl's columnar index.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./pipeline/01_cc_index/query.sh --db-url "$DATABASE_URL" --limit 100000
|
./everytab/pipeline/01_cc_index/query.sh --db-url "$DATABASE_URL" --limit 100000
|
||||||
# Full run: --limit 0
|
# Full run: --limit 0
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
@ -47,7 +46,7 @@ GOMEMLIMIT=12GiB ./icon_download --db "$DATABASE_URL" --log-file icon_download.l
|
||||||
Picks the best icon per host for display.
|
Picks the best icon per host for display.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
psql $DATABASE_URL -f pipeline/04_best_icon/select.sql
|
psql $DATABASE_URL -f ./everytab/pipeline/04_best_icon/select.sql
|
||||||
```
|
```
|
||||||
|
|
||||||
## Stage 5: Bundle Generation
|
## Stage 5: Bundle Generation
|
||||||
|
|
@ -62,12 +61,20 @@ Note the `TOTAL_BUNDLES` number from the summary — this gets baked into the fr
|
||||||
|
|
||||||
## Stage 6: Frontend Deploy
|
## Stage 6: Frontend Deploy
|
||||||
|
|
||||||
From your local machine:
|
From the EC2 instance, after bundle generation (Stage 5) completes:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./pipeline/06_frontend/deploy.sh --total-bundles <NUMBER>
|
TOTAL_BUNDLES=$(jq -r '.bundles_created' stats/05_bundle_gen.json)
|
||||||
|
./everytab/pipeline/06_frontend/deploy.sh --total-bundles "$TOTAL_BUNDLES"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The deploy script:
|
||||||
|
1. Injects TOTAL_BUNDLES into index.html
|
||||||
|
2. Minifies site.js (via esbuild, strips comments + whitespace); if esbuild is not installed, it prints a warning and deploys the unminified JS
|
||||||
|
3. Uploads frontend files to S3
|
||||||
|
4. Deletes stale bundles left in S3 by previous runs (bundle numbers ≥ TOTAL_BUNDLES)
|
||||||
|
5. Invalidates CloudFront cache
|
||||||
|
|
||||||
## Stage 7: Backup to Homelab
|
## Stage 7: Backup to Homelab
|
||||||
|
|
||||||
After the site is deployed and verified, backup data before tearing down scanning infra.
|
After the site is deployed and verified, backup data before tearing down scanning infra.
|
||||||
|
|
@ -76,7 +83,7 @@ After the site is deployed and verified, backup data before tearing down scannin
|
||||||
|
|
||||||
| Data | Location on EC2 | Size estimate (30M) | Purpose |
|
| Data | Location on EC2 | Size estimate (30M) | Purpose |
|
||||||
|------|----------------|---------------------|---------|
|
|------|----------------|---------------------|---------|
|
||||||
| Database | RDS (pg_dump) | ~5-10GB compressed | Full hosts + icons metadata, titles, WARC coordinates |
|
| Database | pg_dump from i3 instance | ~5-10GB compressed | Full hosts + icons metadata, titles, WARC coordinates |
|
||||||
| Icons | `~/icons/` directory | ~500GB-1TB | Complete favicon archive, content-addressed by SHA-256 |
|
| Icons | `~/icons/` directory | ~500GB-1TB | Complete favicon archive, content-addressed by SHA-256 |
|
||||||
| Stats | `~/stats/*.json` | <1MB | Pipeline timing and counts per stage |
|
| Stats | `~/stats/*.json` | <1MB | Pipeline timing and counts per stage |
|
||||||
| Logs | `~/*.log` | varies | Error logs for debugging |
|
| Logs | `~/*.log` | varies | Error logs for debugging |
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue