From 1afbc41599aa59b175592d40a742eb84bdf00e1a Mon Sep 17 00:00:00 2001
From: Joe Lothan <joe@lothan.net>
Date: Mon, 25 May 2026 18:29:37 -0400
Subject: [PATCH] automated ec2 setup and build

---
 infra/README.md                | 78 ++++++++++++++++++++--------------
 infra/db-setup.sh              | 15 +++----
 infra/ec2-userdata.sh          | 38 +++++++++++++++--
 infra/main.tf                  | 11 +++++
 infra/terraform.tfvars.example | 10 +++--
 5 files changed, 103 insertions(+), 49 deletions(-)
diff --git a/infra/README.md b/infra/README.md
index 4c4d02b..ca759b0 100644
--- a/infra/README.md
+++ b/infra/README.md
@@ -1,5 +1,15 @@
 # Infrastructure Setup
 
+## Architecture
+
+Two EC2 instances during scanning:
+- **c5.2xlarge** (`everytab`) — compute: runs pipeline, stores icons on 1TB EBS
+- **i3.large** (`everytab-db`) — database: runs Postgres on 475GB local NVMe (100K+ IOPS)
+
+Both provisioned by Terraform with `user_data` scripts that run on first boot:
+- Compute: `ec2-userdata.sh` (Go, DuckDB, Unbound, swap)
+- Database: `db-setup.sh` (NVMe format, Postgres install + config)
+
 ## 1. Terraform
 
 ```bash
@@ -9,60 +19,66 @@ terraform init
 terraform apply
 ```
 
+This creates both instances. They auto-provision via user_data (~3 minutes).
+
 ## 2. SSH Key
 
 ```bash
 terraform output -raw ssh_private_key > everytab-key && chmod 600 everytab-key
-terraform output ssh_command  # prints the ssh command
+terraform output ssh_command     # SSH to compute instance
+terraform output ssh_command_db  # SSH to database instance
 ```
 
-## 3. Bootstrap EC2
+## 3. Verify Database is Ready
 
 ```bash
-scp -i everytab-key ec2-userdata.sh ec2-user@<IP>:~
-ssh -i everytab-key ec2-user@<IP> 'bash ~/ec2-userdata.sh'
+# Run on the compute instance (ec2-userdata.sh exports DATABASE_URL in ~/.bashrc; the DB's private IP is normally only routable inside the VPC)
+pg_isready -d "$DATABASE_URL"
 ```
 
-## 4. Clone Repo on EC2
+If not ready yet, SSH to the DB instance and check `cloud-init` logs:
+```bash
+tail -f /var/log/cloud-init-output.log
+```
+
+## 4. Clone Repo + Build on Compute Instance
 
 ```bash
+ssh -i everytab-key ec2-user@$(terraform output -raw ec2_public_ip)
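+# If repo_url was set in terraform.tfvars, user_data has already cloned and built; the steps below are only needed otherwise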
 git clone <your-repo-url> ~/everytab
 cd ~/everytab
+go build -o ~/warc_parse ./pipeline/02_warc_parse/
+go build -o ~/icon_download ./pipeline/03_icon_download/
+go build -o ~/bundle_gen ./pipeline/05_bundle_gen/
 ```
 
-## 5. Database Instance (i3.large)
-
-Spin up an i3.large in the same AZ as the compute instance. This provides 475GB local NVMe with 100K+ IOPS for Postgres — eliminates the EBS/RDS IOPS bottleneck.
+## 5. Connect to Database + Apply Schema
 
 ```bash
-# Launch i3.large (same subnet/AZ, same key pair, allow port 5432 from compute SG)
-# Then SSH in and run:
-bash ~/everytab/infra/db-setup.sh
-```
-
-This formats the NVMe, installs Postgres on it with aggressive write settings (`fsync=off`), creates the database, and applies the schema.
-
-On the **compute instance** (c5.2xlarge):
-
-```bash
-# Use the private IP printed by db-setup.sh
-echo "export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'" >> ~/.bashrc
-source ~/.bashrc
+# DATABASE_URL is already exported in ~/.bashrc by ec2-userdata.sh. If it is missing, run
+# `terraform output -raw database_url` on your local machine and export that value here.
+echo "$DATABASE_URL"
 
 # Test connectivity
 psql $DATABASE_URL -c 'SELECT 1;'
+
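+# Apply schema (only needed if user_data did not already apply it — check /var/log/cloud-init-output.log for "Schema applied")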
+psql $DATABASE_URL -f ~/everytab/pipeline/01_cc_index/schema.sql
 ```
 
-Note: the i3's local NVMe is ephemeral — data is lost on stop/terminate. Always `pg_dump` before teardown.
+## 6. Run Pipeline
+
+See `pipeline/README.md` for the full stage-by-stage guide.
 
 ## Pinning the EC2 AMI
 
-The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. If Amazon publishes a new one between applies, Terraform will want to replace your EC2 instance.
+The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. If Amazon publishes a new one between applies, Terraform will want to replace your instances.
 
 To prevent this, pin the AMI after initial creation:
 
 ```bash
-# Get the current instance's AMI
+# Get the current AMI
 aws ec2 describe-instances --filters "Name=tag:Name,Values=everytab" \
   --query "Reservations[0].Instances[0].ImageId" --output text
 
@@ -70,21 +86,19 @@ aws ec2 describe-instances --filters "Name=tag:Name,Values=everytab" \
 echo 'ec2_ami = "ami-XXXXXXXXXXXX"' >> terraform.tfvars
 ```
 
-Now `terraform apply` won't replace the instance for non-EC2 changes (like adding CloudFront logging).
-
-Remove the `ec2_ami` line from tfvars when you want a fresh instance with the latest AMI (e.g., after teardown).
+Remove the `ec2_ami` line from tfvars when you want fresh instances with the latest AMI.
 
 ## Teardown (after backup)
 
 ```bash
-# Back up the database first
-pg_dump -U everytab -Fc everytab > ~/everytab_dump.pgfc
+# Back up the database (run from compute instance)
+pg_dump $DATABASE_URL -Fc > ~/everytab_dump.pgfc
 
-# Back up icons
+# Back up icons to homelab
 rsync -avP ~/icons/ homelab:/backups/everytab/icons/
 ```
 
-Switch to serving-only mode (destroys EC2, icons bucket):
+Switch to serving-only mode (destroys both EC2 instances):
 
 ```bash
 terraform apply -var="scanning=false"
@@ -95,3 +109,5 @@ Full destroy (including the live site):
 ```bash
 terraform destroy
 ```
+
+**IMPORTANT:** The i3's local NVMe is ephemeral — all data is lost on stop/terminate. Always pg_dump before teardown.
diff --git a/infra/db-setup.sh b/infra/db-setup.sh
index 0874d28..5341024 100755
--- a/infra/db-setup.sh
+++ b/infra/db-setup.sh
@@ -122,16 +122,11 @@ echo "--- Creating everytab database ---"
 sudo -u postgres psql -p 5432 -c "CREATE USER everytab;" 2>/dev/null || true
 sudo -u postgres psql -p 5432 -c "CREATE DATABASE everytab OWNER everytab;" 2>/dev/null || true
 
-# --- Apply schema ---
-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-SCHEMA="$SCRIPT_DIR/../pipeline/01_cc_index/schema.sql"
-if [ -f "$SCHEMA" ]; then
-    echo "--- Applying schema ---"
-    psql -U everytab -h localhost -d everytab -f "$SCHEMA"
-else
-    echo "Warning: schema.sql not found at $SCHEMA"
-    echo "Copy it over and run: psql -U everytab -h localhost -d everytab -f schema.sql"
-fi
+# --- Schema ---
+# Schema is applied from the compute instance over the network:
+#   psql $DATABASE_URL -f pipeline/01_cc_index/schema.sql
+echo "--- Schema ---"
+echo "Apply schema from compute instance: psql \$DATABASE_URL -f pipeline/01_cc_index/schema.sql"
 
 # --- Validate ---
 echo ""
diff --git a/infra/ec2-userdata.sh b/infra/ec2-userdata.sh
index bbf9fa9..56bce25 100755
--- a/infra/ec2-userdata.sh
+++ b/infra/ec2-userdata.sh
@@ -136,11 +136,41 @@ echo -n "DuckDB: "; duckdb -c "SELECT version();" -noheader -csv
 echo -n "Unbound: "; dig +short example.com @127.0.0.1 | head -1
 echo -n "psql: "; psql --version
 
+echo ""
+
+# --- Database Connection ---
+DB_IP="${db_private_ip}"
+export DATABASE_URL="postgres://everytab@$${DB_IP}:5432/everytab"
+echo "export DATABASE_URL='postgres://everytab@$${DB_IP}:5432/everytab'" >> /home/ec2-user/.bashrc
+
+# --- Clone Repo + Build ---
+REPO_URL="${repo_url}"
+if [ -n "$REPO_URL" ]; then
+    echo "--- Cloning repo ---"
+    sudo -u ec2-user git clone "$REPO_URL" /home/ec2-user/everytab
+    cd /home/ec2-user/everytab
+
+    echo "--- Building Go binaries ---"
+    sudo -u ec2-user bash -c 'export PATH=$PATH:/usr/local/go/bin && cd ~/everytab && go build -o ~/warc_parse ./pipeline/02_warc_parse/ && go build -o ~/icon_download ./pipeline/03_icon_download/ && go build -o ~/bundle_gen ./pipeline/05_bundle_gen/'
+
+    # Poll the DB for up to 5 min (60 tries, 5s apart), then apply the schema once; failures go to stderr
+    echo "--- Waiting for database ---"
+    for i in $(seq 1 60); do
+        if pg_isready -h "$DB_IP" -q 2>/dev/null; then
+            echo "Database ready"
+            sudo -u ec2-user psql "$DATABASE_URL" -f /home/ec2-user/everytab/pipeline/01_cc_index/schema.sql \
+                && echo "Schema applied" || echo "ERROR: schema apply failed — see psql output above" >&2
+            break
+        fi
+        if [ "$i" -eq 60 ]; then echo "WARNING: database not ready after 5 minutes — schema NOT applied" >&2; else sleep 5; fi
+    done
+else
+    echo "No repo_url set — clone manually"
+fi
+
 echo ""
 echo "=== Bootstrap Complete ==="
 echo ""
-echo "Next: set up your database connection string."
-echo "  export DATABASE_URL='postgres://everytab:PASSWORD@RDS_ENDPOINT:5432/everytab'"
+echo "DATABASE_URL=$DATABASE_URL"
 echo ""
-echo "Test connection:"
-echo "  psql \$DATABASE_URL -c 'SELECT 1;'"
+echo "Ready to run the pipeline. See pipeline/README.md for usage."
diff --git a/infra/main.tf b/infra/main.tf
index a05f082..1b3d7dd 100644
--- a/infra/main.tf
+++ b/infra/main.tf
@@ -66,6 +66,12 @@ variable "domain" {
   default     = "everytab.site"
 }
 
+variable "repo_url" {
+  description = "Public Git repo URL for the pipeline code; leave empty to skip the automatic clone, build and schema apply on first boot"
+  type        = string
+  default     = ""
+}
+
 # --- Data sources ---
 
 data "aws_ami" "al2023" {
@@ -338,6 +344,7 @@ resource "aws_instance" "db" {
   key_name             = aws_key_pair.ec2[0].key_name
   vpc_security_group_ids = [aws_security_group.db[0].id]
   subnet_id            = var.subnet_ids[0]
+  user_data            = file("${path.module}/db-setup.sh")
 
   tags = {
     Name = "everytab-db"
@@ -354,6 +361,10 @@ resource "aws_instance" "main" {
   vpc_security_group_ids = [aws_security_group.ec2[0].id]
   subnet_id            = var.subnet_ids[0]
   iam_instance_profile = aws_iam_instance_profile.ec2[0].name
+  user_data = templatefile("${path.module}/ec2-userdata.sh", {
+    db_private_ip = aws_instance.db[0].private_ip
+    repo_url      = var.repo_url
+  })
 
   root_block_device {
     volume_size = 1000
diff --git a/infra/terraform.tfvars.example b/infra/terraform.tfvars.example
index eae77ab..8e7e383 100644
--- a/infra/terraform.tfvars.example
+++ b/infra/terraform.tfvars.example
@@ -1,10 +1,12 @@
 # Copy to terraform.tfvars and fill in your values
 vpc_id     = "vpc-0abc123def456"
-subnet_ids = ["subnet-0abc123", "subnet-0def456"]  # 2+ subnets in different AZs
-db_password = "change-me-to-something-secure"
+subnet_ids = ["subnet-0abc123", "subnet-0def456"]  # At least 1 subnet
 ssh_cidr    = "203.0.113.50/32"  # Your home IP
+repo_url    = "https://github.com/youruser/everytab.git"  # Public repo — enables auto clone+build
 
 # Optional overrides:
-# region           = "us-east-1"
-# ec2_instance_type = "c5.xlarge"
+# region            = "us-east-1"
+# ec2_instance_type = "c5.2xlarge"
+# db_instance_type  = "i3.large"
 # scanning          = true
+# ec2_ami           = "ami-XXXXXXXXXXXX"  # Pin to prevent instance replacement