#!/usr/bin/env bash
#
# Full pipeline run — chain all stages sequentially.
# Run in tmux. Monitor from another pane with psql or htop.
# Stops on any failure. Resume by commenting out completed stages.
#
# Stages, in order:
#   1. CC-Index query        (01_cc_index/query.sh next to this script)
#   2. WARC parsing          (~/warc_parse)
#   3. Icon download         (~/icon_download)
#   4. Best icon selection   (psql running 04_best_icon/select.sql)
#   5. Bundle generation     (~/bundle_gen)
#   6. Frontend deploy       (06_frontend/deploy.sh next to this script)
#
# External tools invoked directly by this script: psql, jq, date.

# Strict mode: abort on any failing command (-e), on use of an unset
# variable (-u), and when any stage of a shell pipeline fails (pipefail).
set -euo pipefail

#######################################
# Print the command-line help text and terminate the script.
# Arguments: none
# Outputs:   help text on stdout
# Returns:   does not return; exits the shell with status 0
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, so "$DATABASE_URL" in the
  # examples is shown as written rather than expanded.
  # NOTE(review): option and description are separated by a single space;
  # re-align the columns if the intended layout is known.
  cat <<'EOF'
Usage: ./pipeline/run.sh --db-url DATABASE_URL [OPTIONS]

Required:
--db-url URL Postgres connection string

Optional:
--limit N CC-Index host limit (default: 0 = all)
--icons-dir DIR Icon storage directory (default: ~/icons)
--site-bucket NAME S3 bucket for bundles (default: everytab-site)
--help Show this help message

Example:
./pipeline/run.sh --db-url "$DATABASE_URL"
./pipeline/run.sh --db-url "$DATABASE_URL" --limit 3000000
EOF
  exit 0
}

# Defaults (each can be overridden by the matching command-line option).
DB_URL=""                   # --db-url; required, checked after parsing
LIMIT=0                     # --limit; 0 means "all" per the help text
ICONS_DIR="$HOME/icons"     # --icons-dir
SITE_BUCKET="everytab-site" # --site-bucket

# --- Argument parsing ---
# With no arguments at all, show the help text (usage exits with status 0).
if [[ $# -eq 0 ]]; then
  usage
fi

# Abort with a clear message when a value-taking option is the last word on
# the command line. Without this check, "$2" would trip `set -u` and the user
# would only see an "unbound variable" error.
# Arguments: the remaining command line ("$@"), option name first.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "ERROR: $1 requires a value" >&2
    exit 1
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      usage
      ;;
    --db-url)
      require_value "$@"
      DB_URL="$2"
      shift 2
      ;;
    --limit)
      require_value "$@"
      LIMIT="$2"
      shift 2
      ;;
    --icons-dir)
      require_value "$@"
      ICONS_DIR="$2"
      shift 2
      ;;
    --site-bucket)
      require_value "$@"
      SITE_BUCKET="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      # usage terminates the shell it runs in with status 0, so run it in a
      # subshell (help goes to stderr) and report the failure ourselves.
      (usage) >&2 || true
      exit 1
      ;;
  esac
done

if [[ -z "$DB_URL" ]]; then
  echo "ERROR: --db-url is required" >&2
  exit 1
fi

# Reject a malformed limit up front instead of failing partway into stage 1.
# NOTE(review): assumes --limit is always a plain non-negative integer, as the
# help text ("N", default 0) and the example (3000000) suggest — confirm.
if [[ ! "$LIMIT" =~ ^[0-9]+$ ]]; then
  echo "ERROR: --limit must be a non-negative integer (got: $LIMIT)" >&2
  exit 1
fi

# Absolute directory containing this script; stage scripts are located
# relative to it. CDPATH is blanked for the cd so that it cannot print the
# resolved directory into the command substitution, and `--` keeps a path
# beginning with "-" from being read as an option.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)"
# Parent directory of SCRIPT_DIR.
REPO_DIR="$(dirname -- "$SCRIPT_DIR")"
# Start time in seconds since the epoch, used for the final duration report.
START_TIME="$(date +%s)"

# Start-of-run banner: current date and the effective host limit.
echo "=========================================="
echo " EveryTab Pipeline"
echo " $(date)"
echo " Limit: $LIMIT (0 = full run)"
echo "=========================================="
echo ""

# --- Stage 1: CC-Index ---
# Runs 01_cc_index/query.sh from the script directory, forwarding the
# database URL and the host limit. A non-zero exit aborts the run (set -e).
echo ">>> Stage 1: CC-Index Query"
"$SCRIPT_DIR/01_cc_index/query.sh" \
  --db-url "$DB_URL" \
  --limit "$LIMIT"
echo ""

# --- Stage 2: WARC Parsing ---
# Runs the warc_parse executable from the home directory; its log file is
# $HOME/warc_parse.log. A non-zero exit aborts the run (set -e).
echo ">>> Stage 2: WARC Parsing"
~/warc_parse \
  --db "$DB_URL" \
  --log-file "$HOME/warc_parse.log" \
  --log-errors-only
echo ""

# --- Stage 3: Icon Download ---
# Runs the icon_download executable from the home directory, storing icons
# under ICONS_DIR. GOMEMLIMIT=12GiB is set in the environment of this one
# command only. A non-zero exit aborts the run (set -e).
echo ">>> Stage 3: Icon Download"
GOMEMLIMIT=12GiB ~/icon_download \
  --db "$DB_URL" \
  --icons-dir "$ICONS_DIR" \
  --log-file "$HOME/icon_download.log" \
  --log-errors-only
echo ""

# --- Stage 4: Best Icon Selection ---
# Executes 04_best_icon/select.sql against the database with psql.
# ON_ERROR_STOP=1 makes psql stop at the first failing statement and exit
# non-zero; without it psql keeps going and exits 0, so a broken SQL stage
# would not stop the pipeline.
echo ">>> Stage 4: Best Icon Selection"
psql "$DB_URL" \
  -v ON_ERROR_STOP=1 \
  -f "$SCRIPT_DIR/04_best_icon/select.sql"
echo ""

# --- Stage 5: Bundle Generation ---
# Runs the bundle_gen executable from the home directory with the icon
# directory and the target site bucket; its log file is $HOME/bundle_gen.log.
# A non-zero exit aborts the run (set -e).
echo ">>> Stage 5: Bundle Generation"
~/bundle_gen \
  --db "$DB_URL" \
  --icons-dir "$ICONS_DIR" \
  --site-bucket "$SITE_BUCKET" \
  --log-file "$HOME/bundle_gen.log" \
  --log-errors-only
echo ""

# --- Stage 6: Frontend Deploy ---
# Reads the bundle count from the stage-5 stats file and passes it to
# 06_frontend/deploy.sh.
echo ">>> Stage 6: Frontend Deploy"
# NOTE(review): this path is resolved against the current working directory,
# not SCRIPT_DIR or REPO_DIR — confirm the script is always started from the
# directory that contains stats/.
STATS_FILE="stats/05_bundle_gen.json"
# -e makes jq exit non-zero when .bundles_created is missing or null, so the
# literal string "null" is never handed to deploy.sh; a count of 0 still passes.
if ! TOTAL_BUNDLES="$(jq -er '.bundles_created' "$STATS_FILE")"; then
  echo "ERROR: could not read .bundles_created from $STATS_FILE" >&2
  exit 1
fi
"$SCRIPT_DIR/06_frontend/deploy.sh" --total-bundles "$TOTAL_BUNDLES"
echo ""

# --- Done ---
# Report wall-clock time since START_TIME as whole hours and minutes
# (leftover seconds are dropped by the integer division).
END_TIME="$(date +%s)"
DURATION=$(( END_TIME - START_TIME ))
HOURS=$(( DURATION / 3600 ))
MINS=$(( (DURATION % 3600) / 60 ))

echo "=========================================="
echo " Pipeline Complete"
echo " $(date)"
echo " Total duration: ${HOURS}h ${MINS}m"
echo "=========================================="