add one run.sh for the entire pipeline
This commit is contained in:
parent
767083fb5e
commit
79881fce4b
1 changed files with 104 additions and 0 deletions
104
pipeline/run.sh
Executable file
104
pipeline/run.sh
Executable file
|
|
@ -0,0 +1,104 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Full pipeline run — chain all stages sequentially.
|
||||
# Run in tmux. Monitor from another pane with psql or htop.
|
||||
# Stops on any failure. Resume by commenting out completed stages.
|
||||
|
||||
#######################################
# Print the command-line help text and exit.
# Arguments:
#   $1 - exit status (optional, default 0). With 0 the help goes to
#        stdout; with any other value it goes to stderr, so an error
#        path can show the help without polluting captured stdout.
# Outputs:
#   The help text.
# Returns:
#   Never returns: exits the current shell with the given status.
#######################################
usage() {
  local status="${1:-0}"
  local fd=1
  if [[ "$status" -ne 0 ]]; then
    fd=2
  fi

  # Quoted delimiter: the text is literal, so "$DATABASE_URL" in the
  # examples is printed as-is rather than expanded.
  cat >&"$fd" <<'EOF'
Usage: ./pipeline/run.sh --db-url DATABASE_URL [OPTIONS]

Required:
--db-url URL Postgres connection string

Optional:
--limit N CC-Index host limit (default: 0 = all)
--icons-dir DIR Icon storage directory (default: ~/icons)
--site-bucket NAME S3 bucket for bundles (default: everytab-site)
--help Show this help message

Example:
./pipeline/run.sh --db-url "$DATABASE_URL"
./pipeline/run.sh --db-url "$DATABASE_URL" --limit 3000000
EOF
  exit "$status"
}
|
||||
|
||||
# Defaults (each can be overridden by the matching command-line option).
DB_URL=""
LIMIT=0
ICONS_DIR="$HOME/icons"
SITE_BUCKET="everytab-site"

# Called with no arguments at all: show the help text (usage exits).
if [[ $# -eq 0 ]]; then usage; fi

# Consume "--option VALUE" pairs until the argument list is empty.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      usage
      ;;
    --db-url | --limit | --icons-dir | --site-bucket)
      # Under `set -u`, reading "$2" when the value is missing aborts with
      # an opaque "unbound variable" message, so check for it explicitly.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: $1 requires a value" >&2
        exit 1
      fi
      case "$1" in
        --db-url) DB_URL="$2" ;;
        --limit) LIMIT="$2" ;;
        --icons-dir) ICONS_DIR="$2" ;;
        --site-bucket) SITE_BUCKET="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      # usage exits the shell it runs in; run it in a subshell so the help
      # is still printed but this script can exit with a failure status.
      (usage) >&2
      exit 1
      ;;
  esac
done

if [[ -z "$DB_URL" ]]; then
  echo "ERROR: --db-url is required" >&2
  exit 1
fi

# Fail fast on a malformed limit instead of part-way into a long run.
if ! [[ "$LIMIT" =~ ^[0-9]+$ ]]; then
  echo "ERROR: --limit must be a non-negative integer, got: $LIMIT" >&2
  exit 1
fi
|
||||
|
||||
# Absolute directory holding this script; the stage scripts are addressed
# relative to it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Parent directory of SCRIPT_DIR.
REPO_DIR="$(dirname "$SCRIPT_DIR")"
# Epoch seconds at start, used for the duration report at the end.
START_TIME=$(date +%s)

# Start-of-run banner, followed by one blank line.
cat <<EOF
==========================================
 EveryTab Pipeline
 $(date)
 Limit: $LIMIT (0 = full run)
==========================================

EOF
|
||||
|
||||
# --- Stage 1: CC-Index ---
# Runs the query script under SCRIPT_DIR, forwarding the database URL and
# the host limit.
printf '%s\n' ">>> Stage 1: CC-Index Query"
"$SCRIPT_DIR/01_cc_index/query.sh" \
  --db-url "$DB_URL" \
  --limit "$LIMIT"
printf '\n'

# --- Stage 2: WARC Parsing ---
# Runs the warc_parse executable from the home directory; its log file is
# also placed in the home directory.
printf '%s\n' ">>> Stage 2: WARC Parsing"
~/warc_parse \
  --db "$DB_URL" \
  --log-file "$HOME/warc_parse.log" \
  --log-errors-only
printf '\n'

# --- Stage 3: Icon Download ---
# GOMEMLIMIT=12GiB is set in the environment of this one command only.
printf '%s\n' ">>> Stage 3: Icon Download"
GOMEMLIMIT=12GiB ~/icon_download \
  --db "$DB_URL" \
  --icons-dir "$ICONS_DIR" \
  --log-file "$HOME/icon_download.log" \
  --log-errors-only
printf '\n'
|
||||
|
||||
# --- Stage 4: Best Icon Selection ---
echo ">>> Stage 4: Best Icon Selection"
# By default psql carries on after a failing SQL statement and still exits
# 0, so `set -e` would never notice a broken selection step. ON_ERROR_STOP
# makes psql stop at the first error and exit non-zero, which aborts the
# pipeline here. Options are placed before the connection string so that
# parsing does not rely on argument permutation.
psql -v ON_ERROR_STOP=1 -f "$SCRIPT_DIR/04_best_icon/select.sql" "$DB_URL"
echo ""
|
||||
|
||||
# --- Stage 5: Bundle Generation ---
# Runs the bundle_gen executable from the home directory with the icon
# directory and the target bucket name; its log goes to the home directory.
printf '%s\n' ">>> Stage 5: Bundle Generation"
~/bundle_gen \
  --db "$DB_URL" \
  --icons-dir "$ICONS_DIR" \
  --site-bucket "$SITE_BUCKET" \
  --log-file "$HOME/bundle_gen.log" \
  --log-errors-only
printf '\n'
|
||||
|
||||
# --- Stage 6: Frontend Deploy ---
echo ">>> Stage 6: Frontend Deploy"
# NOTE(review): this path is relative to the current working directory,
# not to the script or repository location, so the script presumably has
# to be started from the directory that contains stats/ — confirm.
STATS_FILE="stats/05_bundle_gen.json"
if [[ ! -r "$STATS_FILE" ]]; then
  echo "ERROR: bundle stats file not readable: $STATS_FILE (cwd: $PWD)" >&2
  exit 1
fi
# With -e jq exits non-zero when the result is null or false. Without it,
# a missing .bundles_created key prints the string "null" with status 0
# and the deploy would be started with --total-bundles null.
if ! TOTAL_BUNDLES=$(jq -er '.bundles_created' "$STATS_FILE"); then
  echo "ERROR: could not read .bundles_created from $STATS_FILE" >&2
  exit 1
fi
"$SCRIPT_DIR/06_frontend/deploy.sh" --total-bundles "$TOTAL_BUNDLES"
echo ""
|
||||
|
||||
# --- Done ---
# Wall-clock duration of the run, split into whole hours and the
# remaining whole minutes (leftover seconds are dropped).
END_TIME=$(date +%s)
DURATION=$(( END_TIME - START_TIME ))
HOURS=$(( DURATION / 3600 ))
MINS=$(( (DURATION % 3600) / 60 ))

# Closing banner.
printf '%s\n' "=========================================="
printf '%s\n' " Pipeline Complete"
printf ' %s\n' "$(date)"
printf ' Total duration: %sh %sm\n' "$HOURS" "$MINS"
printf '%s\n' "=========================================="
|
||||
Loading…
Add table
Add a link
Reference in a new issue