#!/usr/bin/env bash
#
# pipeline/run.sh — full pipeline run: chains all stages sequentially.
#
# Run in tmux. Monitor from another pane with psql or htop.
# Stops on any failure. Resume by commenting out completed stages.
#
# Stages, in execution order:
#   1. CC-Index query       $SCRIPT_DIR/01_cc_index/query.sh
#   2. WARC parsing         ~/warc_parse
#   3. Icon download        ~/icon_download (run with GOMEMLIMIT=12GiB)
#   4. Best icon selection  psql -f $SCRIPT_DIR/04_best_icon/select.sql
#   5. Bundle generation    ~/bundle_gen
#   6. Frontend deploy      $SCRIPT_DIR/06_frontend/deploy.sh
set -euo pipefail

#######################################
# Print the help text and exit.
# Arguments: $1 - exit status to use (optional, default 0)
# Outputs:   help text on stdout (callers redirect to stderr for errors)
#######################################
usage() {
  cat <<'EOF'
Usage: ./pipeline/run.sh --db-url DATABASE_URL [OPTIONS]

Required:
  --db-url URL         Postgres connection string

Optional:
  --limit N            CC-Index host limit (default: 0 = all)
  --icons-dir DIR      Icon storage directory (default: ~/icons)
  --site-bucket NAME   S3 bucket for bundles (default: everytab-site)
  --help               Show this help message

Example:
  ./pipeline/run.sh --db-url "$DATABASE_URL"
  ./pipeline/run.sh --db-url "$DATABASE_URL" --limit 3000000
EOF
  exit "${1:-0}"
}

#######################################
# Print an error message to stderr and abort the run.
# Arguments: $* - message text
#######################################
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

#######################################
# Abort unless the option currently being parsed is followed by a value.
# Without this, `set -u` kills the script with a bare "$2: unbound variable".
# Arguments: $1 - option name, $2 - number of arguments left (option included)
#######################################
require_value() {
  if (( $2 < 2 )); then
    die "$1 requires a value"
  fi
}

# Defaults
DB_URL=""
LIMIT=0
ICONS_DIR="$HOME/icons"
SITE_BUCKET="everytab-site"

# Called with no arguments at all: show the help text (exit status 0).
if [[ $# -eq 0 ]]; then
  usage
fi

while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      usage
      ;;
    --db-url)
      require_value "$1" "$#"
      DB_URL="$2"
      shift 2
      ;;
    --limit)
      require_value "$1" "$#"
      LIMIT="$2"
      shift 2
      ;;
    --icons-dir)
      require_value "$1" "$#"
      ICONS_DIR="$2"
      shift 2
      ;;
    --site-bucket)
      require_value "$1" "$#"
      SITE_BUCKET="$2"
      shift 2
      ;;
    *)
      # A mistyped flag is an error: report on stderr and exit non-zero so a
      # wrapper or `&&` chain does not treat the run as successful.
      echo "Unknown option: $1" >&2
      usage 1 >&2
      ;;
  esac
done

if [[ -z "$DB_URL" ]]; then
  die "--db-url is required"
fi

# Stage 4 needs psql and stage 6 needs jq; verify both before starting so a
# missing tool is reported immediately instead of after the earlier stages.
for tool in psql jq; do
  command -v "$tool" >/dev/null || die "required tool not found on PATH: $tool"
done

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# NOTE(review): REPO_DIR is computed but never used below, while the stage 6
# stats path is relative to the current working directory. Confirm whether
# that path should be resolved against REPO_DIR instead.
# shellcheck disable=SC2034
REPO_DIR="$(dirname "$SCRIPT_DIR")"
START_TIME=$(date +%s)

echo "=========================================="
echo " EveryTab Pipeline"
echo " $(date)"
echo " Limit: $LIMIT (0 = full run)"
echo "=========================================="
echo ""

# --- Stage 1: CC-Index ---
echo ">>> Stage 1: CC-Index Query"
"$SCRIPT_DIR/01_cc_index/query.sh" --db-url "$DB_URL" --limit "$LIMIT"
echo ""

# --- Stage 2: WARC Parsing ---
echo ">>> Stage 2: WARC Parsing"
"$HOME/warc_parse" --db "$DB_URL" \
  --log-file "$HOME/warc_parse.log" --log-errors-only
echo ""

# --- Stage 3: Icon Download ---
echo ">>> Stage 3: Icon Download"
GOMEMLIMIT=12GiB "$HOME/icon_download" --db "$DB_URL" \
  --icons-dir "$ICONS_DIR" \
  --log-file "$HOME/icon_download.log" --log-errors-only
echo ""

# --- Stage 4: Best Icon Selection ---
echo ">>> Stage 4: Best Icon Selection"
psql "$DB_URL" -f "$SCRIPT_DIR/04_best_icon/select.sql"
echo ""

# --- Stage 5: Bundle Generation ---
echo ">>> Stage 5: Bundle Generation"
"$HOME/bundle_gen" --db "$DB_URL" \
  --icons-dir "$ICONS_DIR" --site-bucket "$SITE_BUCKET" \
  --log-file "$HOME/bundle_gen.log" --log-errors-only
echo ""

# --- Stage 6: Frontend Deploy ---
echo ">>> Stage 6: Frontend Deploy"
STATS_FILE="stats/05_bundle_gen.json"  # relative to the current directory
[[ -r "$STATS_FILE" ]] \
  || die "stats file missing or unreadable: $STATS_FILE (cwd: $PWD)"
# `jq -e` exits non-zero when the result is null/false, so a stats file that
# lacks .bundles_created stops the run instead of deploying with "null".
TOTAL_BUNDLES=$(jq -er '.bundles_created' "$STATS_FILE") \
  || die "could not read a non-null .bundles_created from $STATS_FILE"
"$SCRIPT_DIR/06_frontend/deploy.sh" --total-bundles "$TOTAL_BUNDLES"
echo ""

# --- Done ---
END_TIME=$(date +%s)
DURATION=$(( END_TIME - START_TIME ))
HOURS=$(( DURATION / 3600 ))
MINS=$(( (DURATION % 3600) / 60 ))

echo "=========================================="
echo " Pipeline Complete"
echo " $(date)"
echo " Total duration: ${HOURS}h ${MINS}m"
echo "=========================================="