#!/usr/bin/env bash
set -euo pipefail

# Full pipeline run — chain all stages sequentially.
# Run in tmux. Monitor from another pane with psql or htop.
# Stops on any failure. Resume by commenting out completed stages.

#######################################
# Print an error message to stderr and abort the pipeline.
# Arguments: $* - message text (prefixed with "ERROR: ")
# Returns:   never; exits with status 1
#######################################
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

#######################################
# Print the help text to stdout and exit.
# Arguments: $1 - exit status to use (default: 0)
# Returns:   never; exits with the given status
#######################################
usage() {
  cat <<'EOF'
Usage: ./pipeline/run.sh --db-url DATABASE_URL [OPTIONS]

Required:
  --db-url URL          Postgres connection string

Optional:
  --limit N             CC-Index host limit (default: 0 = all)
  --icons-dir DIR       Icon storage directory (default: ~/icons)
  --site-bucket NAME    S3 bucket for bundles (default: everytab-site)
  --help                Show this help message

Example:
  ./pipeline/run.sh --db-url "$DATABASE_URL"
  ./pipeline/run.sh --db-url "$DATABASE_URL" --limit 3000000
EOF
  exit "${1:-0}"
}

# Defaults
DB_URL=""
LIMIT=0
ICONS_DIR="$HOME/icons"
SITE_BUCKET="everytab-site"

# Invoked with no arguments at all: show help (exit status 0).
if [[ $# -eq 0 ]]; then
  usage
fi

while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      usage
      ;;
    --db-url | --limit | --icons-dir | --site-bucket)
      # Check for the value explicitly; otherwise `set -u` aborts on "$2"
      # with an "unbound variable" message that does not name the option.
      [[ $# -ge 2 ]] || die "option $1 requires a value"
      case "$1" in
        --db-url) DB_URL="$2" ;;
        --limit) LIMIT="$2" ;;
        --icons-dir) ICONS_DIR="$2" ;;
        --site-bucket) SITE_BUCKET="$2" ;;
      esac
      shift 2
      ;;
    *)
      # A bad invocation must not report success to the caller.
      echo "Unknown option: $1" >&2
      usage 1 >&2
      ;;
  esac
done

if [[ -z "$DB_URL" ]]; then
  die "--db-url is required"
fi

if [[ ! "$LIMIT" =~ ^[0-9]+$ ]]; then
  die "--limit must be a non-negative integer (got: $LIMIT)"
fi

# psql is first needed in stage 4 and jq in stage 6; check for both before
# starting so a missing tool is reported before any stage has run.
for tool in psql jq; do
  command -v "$tool" >/dev/null 2>&1 || die "required tool not found on PATH: $tool"
done

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# NOTE(review): REPO_DIR is not referenced by any stage below, while stage 6
# reads its stats file relative to the current working directory. Confirm
# which directory bundle_gen writes stats/ into before changing either.
REPO_DIR="$(dirname "$SCRIPT_DIR")"

START_TIME=$(date +%s)

echo "=========================================="
echo " EveryTab Pipeline"
echo " $(date)"
echo " Limit: $LIMIT (0 = full run)"
echo "=========================================="
echo ""

# --- Stage 1: CC-Index ---
echo ">>> Stage 1: CC-Index Query"
"$SCRIPT_DIR/01_cc_index/query.sh" --db-url "$DB_URL" --limit "$LIMIT"
echo ""

# --- Stage 2: WARC Parsing ---
echo ">>> Stage 2: WARC Parsing"
"$HOME/warc_parse" --db "$DB_URL" \
  --log-file "$HOME/warc_parse.log" --log-errors-only
echo ""

# --- Stage 3: Icon Download ---
echo ">>> Stage 3: Icon Download"
# GOMEMLIMIT is the Go runtime's soft memory limit variable; presumably
# icon_download is a Go binary — confirm before changing the value.
GOMEMLIMIT=12GiB "$HOME/icon_download" --db "$DB_URL" \
  --icons-dir "$ICONS_DIR" \
  --log-file "$HOME/icon_download.log" --log-errors-only
echo ""

# --- Stage 4: Best Icon Selection ---
echo ">>> Stage 4: Best Icon Selection"
psql "$DB_URL" -f "$SCRIPT_DIR/04_best_icon/select.sql"
echo ""

# --- Stage 5: Bundle Generation ---
echo ">>> Stage 5: Bundle Generation"
"$HOME/bundle_gen" --db "$DB_URL" \
  --icons-dir "$ICONS_DIR" \
  --site-bucket "$SITE_BUCKET" \
  --log-file "$HOME/bundle_gen.log" --log-errors-only
echo ""

# --- Stage 6: Frontend Deploy ---
echo ">>> Stage 6: Frontend Deploy"
STATS_FILE="stats/05_bundle_gen.json"
[[ -r "$STATS_FILE" ]] || die "stats file not readable: $STATS_FILE (cwd: $PWD)"
# -e makes jq exit non-zero when the key is missing/null, instead of handing
# the literal string "null" to the deploy script.
TOTAL_BUNDLES=$(jq -er '.bundles_created' "$STATS_FILE") \
  || die "could not read .bundles_created from $STATS_FILE"
"$SCRIPT_DIR/06_frontend/deploy.sh" --total-bundles "$TOTAL_BUNDLES"
echo ""

# --- Done ---
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
HOURS=$((DURATION / 3600))
MINS=$(((DURATION % 3600) / 60))

echo "=========================================="
echo " Pipeline Complete"
echo " $(date)"
echo " Total duration: ${HOURS}h ${MINS}m"
echo "=========================================="