#!/usr/bin/env bash
set -euo pipefail

# EveryTab EC2 Bootstrap
# Runs automatically via cloud-init user_data on first boot.
# Installs: Go, esbuild, DuckDB, Unbound, psql, pg_dump
#
# NOTE(review): this file appears to be rendered through a Terraform-style
# template before it reaches the instance — confirm. Consequences for editing:
#   * A doubled dollar sign before a brace is a template escape that renders
#     to a normal shell expansion; keep those exactly as written.
#   * db_private_ip and repo_url are template inputs substituted at render time.
#   * New shell code should use the brace-less "$VAR" form so the template
#     engine does not try to interpolate it.

export HOME=/root

echo "=== EveryTab EC2 Bootstrap ==="

# --- EBS readahead ---
# Large readahead improves bundle gen throughput by prefetching icon files into page cache.
# 16MB readahead (32768 sectors × 512 bytes). Safe for all pipeline stages.
echo "--- Setting EBS readahead ---"
# Strip a trailing "pN" partition suffix (e.g. nvme0n1p1 -> nvme0n1) so the
# setting is applied to the device backing the root filesystem.
ROOT_DEV=$(findmnt -no SOURCE / | sed 's/p[0-9]*$//')
sudo blockdev --setra 32768 "$ROOT_DEV"
echo "Readahead: $(blockdev --getra "$ROOT_DEV") sectors on $ROOT_DEV"

# --- File descriptor limits ---
echo "--- Raising file descriptor limits ---"
# Append each limit only if it is not already present, so a re-run does not
# pile up duplicate entries (the other install steps are guarded the same way).
for limit_line in '* soft nofile 65536' '* hard nofile 65536'; do
  if ! grep -qxF -- "$limit_line" /etc/security/limits.conf 2>/dev/null; then
    echo "$limit_line" | sudo tee -a /etc/security/limits.conf
  fi
done
# Also set for current session
ulimit -n 65536
echo "File descriptor limit: $(ulimit -n)"

# --- Swap ---
echo "--- Creating swap file ---"
# Size in MiB. The log message below is derived from this value so the two
# cannot drift apart (previously dd wrote 8192 MiB while the message said 4GB).
SWAP_SIZE_MB=8192
if [[ ! -f /swapfile ]]; then
  sudo dd if=/dev/zero of=/swapfile bs=1M count="$SWAP_SIZE_MB"
  sudo chmod 600 /swapfile
  sudo mkswap /swapfile
  sudo swapon /swapfile
  # Persist the swap file across reboots.
  echo '/swapfile swap swap defaults 0 0' | sudo tee -a /etc/fstab
  echo "Created $((SWAP_SIZE_MB / 1024))GB swap"
else
  echo "Swap already exists"
fi

# --- System packages ---
echo "--- Installing system packages ---"
sudo dnf update -y
# unzip and bind-utils are listed explicitly because this script itself calls
# `unzip` (DuckDB install) and `dig` (validation) further down.
sudo dnf install -y \
  gcc \
  git \
  postgresql16 \
  unbound \
  jq \
  htop \
  iftop \
  iotop \
  tmux \
  unzip \
  bind-utils

# --- Go ---
echo "--- Installing Go ---"
GO_VERSION="1.22.4"
if ! command -v go &>/dev/null; then
  curl -fsSL "https://go.dev/dl/go$${GO_VERSION}.linux-amd64.tar.gz" | sudo tar -C /usr/local -xz
  # Single quotes on purpose: the expansion must happen when .bashrc is read.
  echo 'export PATH=$PATH:/usr/local/go/bin:$HOME/go/bin' >> ~/.bashrc
  export PATH="$PATH:/usr/local/go/bin:$HOME/go/bin"
fi
go version

# --- esbuild ---
echo "--- Installing esbuild ---"
if ! command -v esbuild &>/dev/null; then
  # GOBIN makes `go install` drop the binary straight into /usr/local/bin.
  GOBIN=/usr/local/bin /usr/local/go/bin/go install github.com/evanw/esbuild/cmd/esbuild@latest
fi
esbuild --version

# --- DuckDB ---
echo "--- Installing DuckDB ---"
DUCKDB_VERSION="1.5.2"
if ! command -v duckdb &>/dev/null; then
  # Work in a private temp dir and run each step as its own command: in an
  # `a && b && c` list, `set -e` ignores failures of every command except the
  # last one, so a failed unzip/mv would previously have gone unnoticed here.
  duckdb_tmp=$(mktemp -d)
  curl -fsSL "https://github.com/duckdb/duckdb/releases/download/v$${DUCKDB_VERSION}/duckdb_cli-linux-amd64.zip" -o "$duckdb_tmp/duckdb.zip"
  unzip -o "$duckdb_tmp/duckdb.zip" -d "$duckdb_tmp"
  sudo mv "$duckdb_tmp/duckdb" /usr/local/bin/
  rm -rf -- "$duckdb_tmp"
fi
duckdb -c "SELECT 'DuckDB OK';"

# Install DuckDB extensions
duckdb -c "INSTALL httpfs; INSTALL postgres;"
echo "DuckDB extensions installed"

# --- Unbound ---
echo "--- Configuring Unbound ---"
# The quoted heredoc delimiter keeps the config literal (no shell expansion).
# NOTE(review): the "per-slab" cache sizing comment inside the config looks
# doubtful — Unbound's documentation describes the *-cache-size options as the
# size of the whole cache, with slabs only partitioning it to reduce lock
# contention. Confirm and adjust the sizes/comment if ~500MB total is intended.
sudo tee /etc/unbound/unbound.conf > /dev/null <<'UNBOUNDCONF'
server:
    interface: 127.0.0.1
    port: 53
    access-control: 127.0.0.0/8 allow

    # Performance
    num-threads: 4
    msg-cache-slabs: 4
    rrset-cache-slabs: 4
    infra-cache-slabs: 4
    key-cache-slabs: 4

    # Cache sizing — values are per-slab (×4 slabs), so total = 4× these values
    # Only need ~500MB total — TLD/NS records stay hot via LRU, individual A records evict naturally
    msg-cache-size: 32m
    rrset-cache-size: 64m
    key-cache-size: 16m

    # Aggressive caching
    cache-min-ttl: 3600
    cache-max-ttl: 86400
    prefetch: yes
    prefetch-key: yes

    # Hardening
    hide-identity: yes
    hide-version: yes
    harden-glue: yes
    harden-dnssec-stripped: yes

    # Logging (minimal)
    verbosity: 1
    log-queries: no

    # Root hints
    root-hints: "/etc/unbound/root.hints"

remote-control:
    control-enable: yes
    control-interface: 127.0.0.1
UNBOUNDCONF

# Download root hints
sudo curl -fsSL https://www.internic.net/domain/named.root -o /etc/unbound/root.hints

# Disable systemd-resolved if present (it manages resolv.conf on AL2023)
if systemctl is-active --quiet systemd-resolved 2>/dev/null; then
  sudo systemctl disable --now systemd-resolved
fi

# Set system resolver to use Unbound
sudo rm -f /etc/resolv.conf
echo "nameserver 127.0.0.1" | sudo tee /etc/resolv.conf > /dev/null

# Start and enable Unbound
sudo systemctl enable unbound
sudo systemctl restart unbound

# Generate control keys for unbound-control stats
# Best-effort on purpose: errors are silenced and never abort the bootstrap.
sudo unbound-control-setup 2>/dev/null || true

echo ""

# --- Validation ---
echo "=== Validation ==="
echo -n "Go: "; go version
echo -n "DuckDB: "; duckdb -c "SELECT version();" -noheader -csv
echo -n "Unbound: "; dig +short example.com @127.0.0.1 | head -1
echo -n "psql: "; psql --version
echo ""

# --- Database Connection ---
# db_private_ip is filled in by the template at render time.
DB_IP="${db_private_ip}"
export DATABASE_URL="postgres://everytab@$${DB_IP}:5432/everytab"
# Make the URL available in ec2-user's interactive shells as well.
echo "export DATABASE_URL='postgres://everytab@$${DB_IP}:5432/everytab'" >> /home/ec2-user/.bashrc

# --- Clone Repo + Build ---
# repo_url is filled in by the template; an empty value skips clone/build/schema.
REPO_URL="${repo_url}"
if [[ -n "$REPO_URL" ]]; then
  echo "--- Cloning repo ---"
  sudo -u ec2-user git clone "$REPO_URL" /home/ec2-user/everytab
  cd /home/ec2-user/everytab

  echo "--- Building Go binaries ---"
  # Go is put on PATH explicitly inside the ec2-user shell for the build.
  sudo -u ec2-user bash -c 'export PATH=/usr/local/go/bin:$PATH && cd /home/ec2-user/everytab && go build -o /home/ec2-user/warc_parse ./pipeline/02_warc_parse/ && go build -o /home/ec2-user/icon_download ./pipeline/03_icon_download/ && go build -o /home/ec2-user/bundle_gen ./pipeline/05_bundle_gen/'

  # Wait for DB to be ready, then apply schema
  # Polls up to 60 times, 5 seconds apart (about 5 minutes in total).
  echo "--- Waiting for database ---"
  schema_applied=false
  for ((attempt = 1; attempt <= 60; attempt++)); do
    if pg_isready -h "$DB_IP" -q 2>/dev/null; then
      echo "Database ready"
      sudo -u ec2-user psql "$DATABASE_URL" -f /home/ec2-user/everytab/pipeline/01_cc_index/schema.sql
      echo "Schema applied"
      schema_applied=true
      break
    fi
    sleep 5
  done
  # Previously a timeout fell through silently and the script still reported
  # success. Surface it, but keep the bootstrap best-effort (no hard failure).
  if [[ "$schema_applied" != true ]]; then
    echo "WARNING: database at $DB_IP was not ready after 60 attempts — schema NOT applied; apply pipeline/01_cc_index/schema.sql manually" >&2
  fi
else
  echo "No repo_url set — clone manually"
fi

echo ""
echo "=== Bootstrap Complete ==="
echo ""
echo "DATABASE_URL=$DATABASE_URL"
echo ""
echo "Ready to run the pipeline. See pipeline/README.md for usage."