add one run.sh for the entire pipeline

This commit is contained in:
Joe Lothan 2026-05-26 02:09:57 -04:00
parent 767083fb5e
commit 79881fce4b

104
pipeline/run.sh Executable file
View file

@@ -0,0 +1,104 @@
#!/usr/bin/env bash
#
# Full pipeline run: every stage is executed in sequence, one after another.
#
# Intended to be started inside tmux; progress can be watched from a second
# pane with psql or htop.
#
# The run halts at the first failing command. To resume, comment out the
# stages that already completed and start the script again.

# errexit  - abort on the first command that returns non-zero
# nounset  - treat expansion of an unset variable as an error
# pipefail - a pipeline fails if any of its stages fails
set -o errexit -o nounset -o pipefail
#######################################
# Print the command-line help text and terminate the script.
# Arguments:
#   $1 - optional exit status (default: 0). A non-zero status means the help
#        is being shown because of a usage error.
# Outputs:
#   The help text, on stdout when the status is 0 and on stderr otherwise.
# Returns:
#   Never returns; exits the current shell with the requested status.
#######################################
usage() {
  local status="${1:-0}"
  # The function only prints and exits from here on, so redirecting stdout
  # for the remainder of the shell's life is safe.
  if [ "$status" -ne 0 ]; then
    exec >&2
  fi
  # Quoted delimiter: "$DATABASE_URL" in the examples must be printed
  # literally, not expanded.
  cat <<'EOF'
Usage: ./pipeline/run.sh --db-url DATABASE_URL [OPTIONS]
Required:
--db-url URL Postgres connection string
Optional:
--limit N CC-Index host limit (default: 0 = all)
--icons-dir DIR Icon storage directory (default: ~/icons)
--site-bucket NAME S3 bucket for bundles (default: everytab-site)
--help Show this help message
Example:
./pipeline/run.sh --db-url "$DATABASE_URL"
./pipeline/run.sh --db-url "$DATABASE_URL" --limit 3000000
EOF
  exit "$status"
}
# Defaults; each one can be overridden by the matching command-line flag.
DB_URL=""
LIMIT=0
ICONS_DIR="$HOME/icons"
SITE_BUCKET="everytab-site"

# Abort with a readable message when a value-taking flag is the last word on
# the command line. Without this check, reading "$2" under `set -u` dies with
# a bare "unbound variable" error.
#   $1 - flag name, used in the message
#   $2 - number of command-line words still unparsed (the caller's $#)
require_value() {
  if [ "$2" -lt 2 ]; then
    echo "ERROR: $1 requires a value" >&2
    exit 1
  fi
}

# No arguments at all: show the help and stop.
if [ $# -eq 0 ]; then usage; fi

while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      usage
      ;;
    --db-url)
      require_value "$1" "$#"
      DB_URL="$2"
      shift 2
      ;;
    --limit)
      require_value "$1" "$#"
      LIMIT="$2"
      shift 2
      ;;
    --icons-dir)
      require_value "$1" "$#"
      ICONS_DIR="$2"
      shift 2
      ;;
    --site-bucket)
      require_value "$1" "$#"
      SITE_BUCKET="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      # usage terminates the shell it runs in, so it is run in a subshell;
      # that way this script decides the final (failing) exit status.
      (usage) >&2 || true
      exit 1
      ;;
  esac
done

if [ -z "$DB_URL" ]; then
  echo "ERROR: --db-url is required" >&2
  exit 1
fi

# Reject a malformed limit up front instead of part-way through a long run.
case "$LIMIT" in
  '' | *[!0-9]*)
    echo "ERROR: --limit must be a non-negative integer (got: '$LIMIT')" >&2
    exit 1
    ;;
esac
# Wall-clock start, in seconds since the epoch.
START_TIME=$(date +%s)

# Absolute path of the directory holding this script, and of its parent.
# NOTE(review): REPO_DIR is not referenced by any line visible in this file.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_DIR="$(dirname "$SCRIPT_DIR")"

# Start-of-run banner, followed by one empty line.
printf '%s\n' \
  "==========================================" \
  " EveryTab Pipeline" \
  " $(date)" \
  " Limit: $LIMIT (0 = full run)" \
  "==========================================" \
  ""
# --- Stage 1: CC-Index ---
# Runs the query script stored next to this file, forwarding the database URL
# and the host limit.
printf '%s\n' ">>> Stage 1: CC-Index Query"
"${SCRIPT_DIR}/01_cc_index/query.sh" \
  --db-url "${DB_URL}" \
  --limit "${LIMIT}"
printf '\n'
# --- Stage 2: WARC Parsing ---
# Runs the warc_parse executable from the home directory with the database
# URL and a log file that is also placed in the home directory.
printf '%s\n' ">>> Stage 2: WARC Parsing"
"${HOME}/warc_parse" \
  --db "${DB_URL}" \
  --log-file "${HOME}/warc_parse.log" \
  --log-errors-only
printf '\n'
# --- Stage 3: Icon Download ---
# Runs the icon_download executable from the home directory. GOMEMLIMIT is
# set in the environment of this one command only.
# NOTE(review): presumably the Go runtime's soft memory limit; confirm that
# icon_download is a Go program.
printf '%s\n' ">>> Stage 3: Icon Download"
GOMEMLIMIT=12GiB "${HOME}/icon_download" \
  --db "${DB_URL}" \
  --icons-dir "${ICONS_DIR}" \
  --log-file "${HOME}/icon_download.log" \
  --log-errors-only
printf '\n'
# --- Stage 4: Best Icon Selection ---
# Executes the selection SQL file against the pipeline database.
echo ">>> Stage 4: Best Icon Selection"
# ON_ERROR_STOP=1: by default psql keeps going after a failing statement and
# still exits 0, so `set -e` would never see the failure and later stages
# would run on incomplete data. With the variable set, psql stops at the
# first error and exits non-zero, which halts the pipeline here.
psql "$DB_URL" -v ON_ERROR_STOP=1 -f "$SCRIPT_DIR/04_best_icon/select.sql"
echo ""
# --- Stage 5: Bundle Generation ---
# Runs the bundle_gen executable from the home directory with the database
# URL, the icon directory, the target bucket name and a log file in the home
# directory.
printf '%s\n' ">>> Stage 5: Bundle Generation"
"${HOME}/bundle_gen" \
  --db "${DB_URL}" \
  --icons-dir "${ICONS_DIR}" \
  --site-bucket "${SITE_BUCKET}" \
  --log-file "${HOME}/bundle_gen.log" \
  --log-errors-only
printf '\n'
# --- Stage 6: Frontend Deploy ---
# Reads the bundle count from the stage-5 stats file and hands it to the
# frontend deploy script.
echo ">>> Stage 6: Frontend Deploy"
# NOTE(review): this path is relative to the current working directory, not
# to the script location; confirm where the stats file is written.
STATS_FILE="stats/05_bundle_gen.json"
if [ ! -r "$STATS_FILE" ]; then
  echo "ERROR: stats file not found or unreadable: $STATS_FILE (cwd: $PWD)" >&2
  exit 1
fi
# -e makes jq exit non-zero when the value is null or false. Without it a
# missing key prints the literal string "null" with exit status 0, and the
# deploy script would be called with --total-bundles null.
if ! TOTAL_BUNDLES=$(jq -er '.bundles_created' "$STATS_FILE"); then
  echo "ERROR: could not read .bundles_created from $STATS_FILE" >&2
  exit 1
fi
# A bundle count has to be a plain non-negative integer.
case "$TOTAL_BUNDLES" in
  '' | *[!0-9]*)
    echo "ERROR: .bundles_created in $STATS_FILE is not a non-negative integer: '$TOTAL_BUNDLES'" >&2
    exit 1
    ;;
esac
"$SCRIPT_DIR/06_frontend/deploy.sh" --total-bundles "$TOTAL_BUNDLES"
echo ""
# --- Done ---
# Elapsed wall-clock time since START_TIME, split into whole hours and the
# remaining whole minutes (leftover seconds are dropped).
END_TIME=$(date +%s)
DURATION=$(( END_TIME - START_TIME ))
HOURS=$(( DURATION / 3600 ))
MINS=$(( DURATION % 3600 / 60 ))

# Closing banner.
printf '%s\n' \
  "==========================================" \
  " Pipeline Complete" \
  " $(date)" \
  " Total duration: ${HOURS}h ${MINS}m" \
  "=========================================="