everytab/infra/ec2-userdata.sh

195 lines
5.7 KiB
Bash
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Strict mode: abort on a failing command, an unset variable, or a failing
# pipeline stage.
set -euo pipefail
# EveryTab EC2 Bootstrap
# Runs automatically via cloud-init user_data on first boot.
# Installs: Go, esbuild, DuckDB, Unbound and the PostgreSQL 16 client package
# (psql, pg_dump), plus a few diagnostic tools. If a repo URL is provided it
# also clones the repo, builds the pipeline binaries and applies the DB schema.
#
# NOTE(review): the doubled-dollar expansions further down ($${GO_VERSION},
# $${DB_IP}, ...) and the db_private_ip / repo_url placeholders look like
# Terraform template syntax, i.e. this file is presumably rendered before it
# runs. Confirm before adding any new brace-style shell expansion, which would
# need the same doubled-dollar escaping.
#
# HOME is referenced later ($HOME/go/bin, ~/.bashrc). It is set explicitly here,
# presumably because the boot-time environment may not define it and `set -u`
# would then abort the script; TODO confirm.
export HOME=/root
echo "=== EveryTab EC2 Bootstrap ==="
# --- EBS readahead ---
# Bundle generation reads many icon files; a large readahead prefetches them
# into the page cache. 32768 sectors × 512 bytes = 16MB.
echo "--- Setting EBS readahead ---"
# Find the device mounted at "/" and drop a trailing "p<N>" partition suffix
# (e.g. /dev/nvme0n1p1 -> /dev/nvme0n1).
root_source=$(findmnt -no SOURCE /)
ROOT_DEV=$(sed 's/p[0-9]*$//' <<<"$root_source")
sudo blockdev --setra 32768 "$ROOT_DEV"
# Read the value back so the applied setting shows up in the boot log.
echo "Readahead: $(blockdev --getra "$ROOT_DEV") sectors on $ROOT_DEV"
# --- File descriptor limits ---
echo "--- Raising file descriptor limits ---"
# Persist soft and hard nofile limits for future login sessions. tee echoes
# both appended lines to the boot log.
printf '%s\n' \
  '* soft nofile 65536' \
  '* hard nofile 65536' \
  | sudo tee -a /etc/security/limits.conf
# limits.conf only affects new sessions, so raise the limit for this shell too.
ulimit -n 65536
echo "File descriptor limit: $(ulimit -n)"
# --- Swap ---
# Create and enable a swap file once; skipped when /swapfile already exists.
echo "--- Creating swap file ---"
# Size in MiB. The previous log message claimed 4GB while dd actually wrote
# 8192 MiB; the size is kept and the message is now derived from it.
# NOTE(review): if 4GB was the intended size, set this to 4096.
swap_size_mb=8192
if [[ ! -f /swapfile ]]; then
  sudo dd if=/dev/zero of=/swapfile bs=1M count="$swap_size_mb"
  # mkswap/swapon expect the file to be readable by root only.
  sudo chmod 600 /swapfile
  sudo mkswap /swapfile
  sudo swapon /swapfile
  # Re-enable the swap file on subsequent boots.
  echo '/swapfile swap swap defaults 0 0' | sudo tee -a /etc/fstab
  echo "Created $((swap_size_mb / 1024))GB swap"
else
  echo "Swap already exists"
fi
# --- System packages ---
echo "--- Installing system packages ---"
sudo dnf update -y
# bind-utils (provides dig) and unzip are used later in this script (the
# Unbound validation and the DuckDB install). They are listed explicitly so the
# bootstrap does not depend on what the base image happens to ship; installing
# an already-present package is a no-op.
sudo dnf install -y \
  bind-utils \
  gcc \
  git \
  postgresql16 \
  unbound \
  unzip \
  jq \
  htop \
  iftop \
  iotop \
  tmux
# --- Go ---
# Install the official Go tarball under /usr/local unless a `go` binary is
# already on PATH.
echo "--- Installing Go ---"
GO_VERSION="1.22.4"
if ! command -v go &>/dev/null; then
  go_tarball_url="https://go.dev/dl/go$${GO_VERSION}.linux-amd64.tar.gz"
  # Stream the archive straight into tar; nothing is written to a temp file.
  curl -fsSL "$go_tarball_url" | sudo tar -C /usr/local -xz
  # Make go available for the rest of this script ...
  export PATH="$PATH:/usr/local/go/bin:$HOME/go/bin"
  # ... and for future interactive shells of the current user.
  echo 'export PATH=$PATH:/usr/local/go/bin:$HOME/go/bin' >> ~/.bashrc
fi
go version
# --- esbuild ---
# Build esbuild from source with the Go toolchain installed above and drop the
# binary into /usr/local/bin (via GOBIN) so it is on PATH for every user.
echo "--- Installing esbuild ---"
esbuild_pkg="github.com/evanw/esbuild/cmd/esbuild@latest"
if ! command -v esbuild &>/dev/null; then
  env GOBIN=/usr/local/bin /usr/local/go/bin/go install "$esbuild_pkg"
fi
esbuild --version
# --- DuckDB ---
# Install the DuckDB CLI from the GitHub release zip unless `duckdb` is already
# on PATH, then pre-install the extensions the pipeline needs.
echo "--- Installing DuckDB ---"
DUCKDB_VERSION="1.5.2"
if ! command -v duckdb &>/dev/null; then
  # Work in a private temp dir instead of a fixed /tmp/duckdb.zip path, and run
  # each step as its own statement: inside an `a && b && c` chain, `set -e`
  # does not abort on a failure of a non-final command.
  duckdb_tmp=$(mktemp -d)
  curl -fsSL "https://github.com/duckdb/duckdb/releases/download/v$${DUCKDB_VERSION}/duckdb_cli-linux-amd64.zip" -o "$duckdb_tmp/duckdb.zip"
  unzip -o "$duckdb_tmp/duckdb.zip" -d "$duckdb_tmp"
  sudo mv "$duckdb_tmp/duckdb" /usr/local/bin/
  rm -rf -- "$duckdb_tmp"
fi
# Smoke test: fails the bootstrap early if the binary cannot run.
duckdb -c "SELECT 'DuckDB OK';"
# Install DuckDB extensions
# NOTE(review): this runs as the bootstrap user with HOME=/root; extensions are
# presumably stored per user, so confirm which user the pipeline runs as.
duckdb -c "INSTALL httpfs; INSTALL postgres;"
echo "DuckDB extensions installed"
# --- Unbound ---
# Configure Unbound as a local caching resolver on 127.0.0.1:53, then point the
# system resolver at it.
#
# NOTE(review): the "per-slab (×4 slabs)" comment inside the config below does
# not match the unbound.conf documentation, which describes each *-cache-size as
# the size of the whole cache and *-cache-slabs as a lock-contention setting.
# If ~500MB total is really wanted the sizes need raising. Confirm; the config
# text is left unchanged here.
echo "--- Configuring Unbound ---"
sudo tee /etc/unbound/unbound.conf > /dev/null <<'UNBOUNDCONF'
server:
interface: 127.0.0.1
port: 53
access-control: 127.0.0.0/8 allow
# Performance
num-threads: 4
msg-cache-slabs: 4
rrset-cache-slabs: 4
infra-cache-slabs: 4
key-cache-slabs: 4
# Cache sizing — values are per-slab (×4 slabs), so total = 4× these values
# Only need ~500MB total — TLD/NS records stay hot via LRU, individual A records evict naturally
msg-cache-size: 32m
rrset-cache-size: 64m
key-cache-size: 16m
# Aggressive caching
cache-min-ttl: 3600
cache-max-ttl: 86400
prefetch: yes
prefetch-key: yes
# Hardening
hide-identity: yes
hide-version: yes
harden-glue: yes
harden-dnssec-stripped: yes
# Logging (minimal)
verbosity: 1
log-queries: no
# Root hints
root-hints: "/etc/unbound/root.hints"
remote-control:
control-enable: yes
control-interface: 127.0.0.1
UNBOUNDCONF
# Download root hints (must happen while the original resolver still works).
sudo curl -fsSL https://www.internic.net/domain/named.root -o /etc/unbound/root.hints
# Generate control keys for unbound-control stats.
# This is done BEFORE the daemon is started: with control-enable: yes the
# server reads its key/cert at startup, so keys created afterwards would not be
# picked up until the next restart. Kept best-effort (errors ignored), as the
# keys may already exist.
sudo unbound-control-setup 2>/dev/null || true
# Disable systemd-resolved if present (it manages resolv.conf on AL2023)
if systemctl is-active --quiet systemd-resolved 2>/dev/null; then
  sudo systemctl disable --now systemd-resolved
fi
# Set system resolver to use Unbound. Remove first so that a symlinked
# resolv.conf is replaced rather than written through.
sudo rm -f /etc/resolv.conf
echo "nameserver 127.0.0.1" | sudo tee /etc/resolv.conf > /dev/null
# Start and enable Unbound
sudo systemctl enable unbound
sudo systemctl restart unbound
echo ""
# --- Validation ---
# Print one line per installed tool. Any failing check aborts the bootstrap
# because the script runs under `set -euo pipefail`.
echo "=== Validation ==="
echo -n "Go: "; go version
# Output-format flags go before -c so they are in effect when the query runs.
echo -n "DuckDB: "; duckdb -noheader -csv -c "SELECT version();"
# sed reads all of dig's output before exiting. `head -1` could close the pipe
# early, and under pipefail the resulting SIGPIPE would abort the script.
echo -n "Unbound: "; dig +short example.com @127.0.0.1 | sed -n '1p'
echo -n "psql: "; psql --version
echo ""
# --- Database Connection ---
# Build the connection URL once, export it for the rest of this script, and
# persist the same value for ec2-user's future shells.
DB_IP="${db_private_ip}"
DATABASE_URL="postgres://everytab@$${DB_IP}:5432/everytab"
export DATABASE_URL
printf "export DATABASE_URL='%s'\n" "$DATABASE_URL" >> /home/ec2-user/.bashrc
# --- Clone Repo + Build ---
# When a repo URL is provided: clone it as ec2-user, build the three pipeline
# binaries into /home/ec2-user, then wait for PostgreSQL and apply the schema.
# Relies on DB_IP and DATABASE_URL set in the database-connection section.
REPO_URL="${repo_url}"
if [[ -n "$REPO_URL" ]]; then
  echo "--- Cloning repo ---"
  sudo -u ec2-user git clone "$REPO_URL" /home/ec2-user/everytab
  cd /home/ec2-user/everytab
  echo "--- Building Go binaries ---"
  # ec2-user's shell does not have Go on PATH yet, so it is added inline.
  sudo -u ec2-user bash -c 'export PATH=/usr/local/go/bin:$PATH && cd /home/ec2-user/everytab && go build -o /home/ec2-user/warc_parse ./pipeline/02_warc_parse/ && go build -o /home/ec2-user/icon_download ./pipeline/03_icon_download/ && go build -o /home/ec2-user/bundle_gen ./pipeline/05_bundle_gen/'
  # Wait for DB to be ready (up to 60 attempts, 5s apart), then apply schema
  echo "--- Waiting for database ---"
  db_ready=0
  for ((attempt = 1; attempt <= 60; attempt++)); do
    if pg_isready -h "$DB_IP" -q 2>/dev/null; then
      db_ready=1
      break
    fi
    sleep 5
  done
  if ((db_ready)); then
    echo "Database ready"
    # NOTE(review): psql runs without ON_ERROR_STOP, so SQL errors inside the
    # schema file do not fail this step. Confirm that is intended.
    sudo -u ec2-user psql "$DATABASE_URL" -f /home/ec2-user/everytab/pipeline/01_cc_index/schema.sql
    echo "Schema applied"
  else
    # Previously a timeout fell through silently. Report it, but keep going so
    # the rest of the bootstrap behaves as before.
    echo "WARNING: database at $DB_IP not ready after 60 attempts; schema NOT applied" >&2
  fi
else
  echo "No repo_url set — clone manually"
fi
# Final summary for the boot log: completion banner plus the connection URL.
cat <<EOF

=== Bootstrap Complete ===

DATABASE_URL=$DATABASE_URL

Ready to run the pipeline. See pipeline/README.md for usage.
EOF