From 1afbc41599aa59b175592d40a742eb84bdf00e1a Mon Sep 17 00:00:00 2001 From: Joe Lothan Date: Mon, 25 May 2026 18:29:37 -0400 Subject: [PATCH] automated ec2 setup and build --- infra/README.md | 78 ++++++++++++++++++++-------------- infra/db-setup.sh | 15 +++---- infra/ec2-userdata.sh | 38 +++++++++++++++-- infra/main.tf | 11 +++++ infra/terraform.tfvars.example | 10 +++-- 5 files changed, 103 insertions(+), 49 deletions(-) diff --git a/infra/README.md b/infra/README.md index 4c4d02b..ca759b0 100644 --- a/infra/README.md +++ b/infra/README.md @@ -1,5 +1,15 @@ # Infrastructure Setup +## Architecture + +Two EC2 instances during scanning: +- **c5.2xlarge** (`everytab`) — compute: runs pipeline, stores icons on 1TB EBS +- **i3.large** (`everytab-db`) — database: runs Postgres on 475GB local NVMe (100K+ IOPS) + +Both provisioned by Terraform with `user_data` scripts that run on first boot: +- Compute: `ec2-userdata.sh` (Go, DuckDB, Unbound, swap) +- Database: `db-setup.sh` (NVMe format, Postgres install + config) + ## 1. Terraform ```bash @@ -9,60 +19,66 @@ terraform init terraform apply ``` +This creates both instances. They auto-provision via user_data (~3 minutes). + ## 2. SSH Key ```bash terraform output -raw ssh_private_key > everytab-key && chmod 600 everytab-key -terraform output ssh_command # prints the ssh command +terraform output ssh_command # SSH to compute instance +terraform output ssh_command_db # SSH to database instance ``` -## 3. Bootstrap EC2 +## 3. Verify Database is Ready ```bash -scp -i everytab-key ec2-userdata.sh ec2-user@:~ -ssh -i everytab-key ec2-user@ 'bash ~/ec2-userdata.sh' +# From your local machine or the compute instance +pg_isready -h $(terraform output -raw db_private_ip) ``` -## 4. Clone Repo on EC2 +If not ready yet, SSH to the DB instance and check `cloud-init` logs: +```bash +tail -f /var/log/cloud-init-output.log +``` + +## 4. 
Clone Repo + Build on Compute Instance ```bash +ssh -i everytab-key ec2-user@$(terraform output -raw ec2_public_ip) + git clone https://github.com/youruser/everytab.git ~/everytab cd ~/everytab +go build -o ~/warc_parse ./pipeline/02_warc_parse/ +go build -o ~/icon_download ./pipeline/03_icon_download/ +go build -o ~/bundle_gen ./pipeline/05_bundle_gen/ ``` -## 5. Database Instance (i3.large) - -Spin up an i3.large in the same AZ as the compute instance. This provides 475GB local NVMe with 100K+ IOPS for Postgres — eliminates the EBS/RDS IOPS bottleneck. +## 5. Connect to Database + Apply Schema ```bash -# Launch i3.large (same subnet/AZ, same key pair, allow port 5432 from compute SG) -# Then SSH in and run: -bash ~/everytab/infra/db-setup.sh -``` - -This formats the NVMe, installs Postgres on it with aggressive write settings (`fsync=off`), creates the database, and applies the schema. - -On the **compute instance** (c5.2xlarge): - -```bash -# Use the private IP printed by db-setup.sh -echo "export DATABASE_URL='postgres://everytab@DB_PRIVATE_IP:5432/everytab'" >> ~/.bashrc -source ~/.bashrc +# Get the connection string +export DATABASE_URL=$(terraform output -raw database_url) +echo "export DATABASE_URL='$DATABASE_URL'" >> ~/.bashrc # Test connectivity psql $DATABASE_URL -c 'SELECT 1;' + +# Apply schema +psql $DATABASE_URL -f ~/everytab/pipeline/01_cc_index/schema.sql ``` -Note: the i3's local NVMe is ephemeral — data is lost on stop/terminate. Always `pg_dump` before teardown. +## 6. Run Pipeline + +See `pipeline/README.md` for the full stage-by-stage guide. ## Pinning the EC2 AMI -The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. If Amazon publishes a new one between applies, Terraform will want to replace your EC2 instance. +The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. If Amazon publishes a new one between applies, Terraform will want to replace your instances. 
To prevent this, pin the AMI after initial creation: ```bash -# Get the current instance's AMI +# Get the current AMI aws ec2 describe-instances --filters "Name=tag:Name,Values=everytab" \ --query "Reservations[0].Instances[0].ImageId" --output text @@ -70,21 +86,19 @@ aws ec2 describe-instances --filters "Name=tag:Name,Values=everytab" \ echo 'ec2_ami = "ami-XXXXXXXXXXXX"' >> terraform.tfvars ``` -Now `terraform apply` won't replace the instance for non-EC2 changes (like adding CloudFront logging). - -Remove the `ec2_ami` line from tfvars when you want a fresh instance with the latest AMI (e.g., after teardown). +Remove the `ec2_ami` line from tfvars when you want fresh instances with the latest AMI. ## Teardown (after backup) ```bash -# Back up the database first -pg_dump -U everytab -Fc everytab > ~/everytab_dump.pgfc +# Back up the database (run from compute instance) +pg_dump $DATABASE_URL -Fc > ~/everytab_dump.pgfc -# Back up icons +# Back up icons to homelab rsync -avP ~/icons/ homelab:/backups/everytab/icons/ ``` -Switch to serving-only mode (destroys EC2, icons bucket): +Switch to serving-only mode (destroys both EC2 instances): ```bash terraform apply -var="scanning=false" @@ -95,3 +109,5 @@ Full destroy (including the live site): ```bash terraform destroy ``` + +**IMPORTANT:** The i3's local NVMe is ephemeral — all data is lost on stop/terminate. Always pg_dump before teardown. 
diff --git a/infra/db-setup.sh b/infra/db-setup.sh index 0874d28..5341024 100755 --- a/infra/db-setup.sh +++ b/infra/db-setup.sh @@ -122,16 +122,11 @@ echo "--- Creating everytab database ---" sudo -u postgres psql -p 5432 -c "CREATE USER everytab;" 2>/dev/null || true sudo -u postgres psql -p 5432 -c "CREATE DATABASE everytab OWNER everytab;" 2>/dev/null || true -# --- Apply schema --- -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -SCHEMA="$SCRIPT_DIR/../pipeline/01_cc_index/schema.sql" -if [ -f "$SCHEMA" ]; then - echo "--- Applying schema ---" - psql -U everytab -h localhost -d everytab -f "$SCHEMA" -else - echo "Warning: schema.sql not found at $SCHEMA" - echo "Copy it over and run: psql -U everytab -h localhost -d everytab -f schema.sql" -fi +# --- Schema --- +# Schema is applied from the compute instance over the network: +# psql $DATABASE_URL -f pipeline/01_cc_index/schema.sql +echo "--- Schema ---" +echo "Apply schema from compute instance: psql \$DATABASE_URL -f pipeline/01_cc_index/schema.sql" # --- Validate --- echo "" diff --git a/infra/ec2-userdata.sh b/infra/ec2-userdata.sh index bbf9fa9..56bce25 100755 --- a/infra/ec2-userdata.sh +++ b/infra/ec2-userdata.sh @@ -136,11 +136,41 @@ echo -n "DuckDB: "; duckdb -c "SELECT version();" -noheader -csv echo -n "Unbound: "; dig +short example.com @127.0.0.1 | head -1 echo -n "psql: "; psql --version +echo "" + +# --- Database Connection --- +DB_IP="${db_private_ip}" +export DATABASE_URL="postgres://everytab@$${DB_IP}:5432/everytab" +echo "export DATABASE_URL='postgres://everytab@$${DB_IP}:5432/everytab'" >> /home/ec2-user/.bashrc + +# --- Clone Repo + Build --- +REPO_URL="${repo_url}" +if [ -n "$REPO_URL" ]; then + echo "--- Cloning repo ---" + sudo -u ec2-user git clone "$REPO_URL" /home/ec2-user/everytab + cd /home/ec2-user/everytab + + echo "--- Building Go binaries ---" + sudo -u ec2-user bash -c 'export PATH=$PATH:/usr/local/go/bin && cd ~/everytab && go build -o ~/warc_parse ./pipeline/02_warc_parse/ && go 
build -o ~/icon_download ./pipeline/03_icon_download/ && go build -o ~/bundle_gen ./pipeline/05_bundle_gen/' + + # Wait for DB to be ready, then apply schema + echo "--- Waiting for database ---" + for i in $(seq 1 60); do + if pg_isready -h "$DB_IP" -q 2>/dev/null; then + echo "Database ready" + sudo -u ec2-user psql "$DATABASE_URL" -f /home/ec2-user/everytab/pipeline/01_cc_index/schema.sql + echo "Schema applied" + break + fi + sleep 5 + done +else + echo "No repo_url set — clone manually" +fi + echo "" echo "=== Bootstrap Complete ===" echo "" -echo "Next: set up your database connection string." -echo " export DATABASE_URL='postgres://everytab:PASSWORD@RDS_ENDPOINT:5432/everytab'" +echo "DATABASE_URL=$DATABASE_URL" echo "" -echo "Test connection:" -echo " psql \$DATABASE_URL -c 'SELECT 1;'" +echo "Ready to run the pipeline. See pipeline/README.md for usage." diff --git a/infra/main.tf b/infra/main.tf index a05f082..1b3d7dd 100644 --- a/infra/main.tf +++ b/infra/main.tf @@ -66,6 +66,12 @@ variable "domain" { default = "everytab.site" } +variable "repo_url" { + description = "Git repo URL for the pipeline code (public)" + type = string + default = "" +} + # --- Data sources --- data "aws_ami" "al2023" { @@ -338,6 +344,7 @@ resource "aws_instance" "db" { key_name = aws_key_pair.ec2[0].key_name vpc_security_group_ids = [aws_security_group.db[0].id] subnet_id = var.subnet_ids[0] + user_data = file("${path.module}/db-setup.sh") tags = { Name = "everytab-db" @@ -354,6 +361,10 @@ resource "aws_instance" "main" { vpc_security_group_ids = [aws_security_group.ec2[0].id] subnet_id = var.subnet_ids[0] iam_instance_profile = aws_iam_instance_profile.ec2[0].name + user_data = templatefile("${path.module}/ec2-userdata.sh", { + db_private_ip = aws_instance.db[0].private_ip + repo_url = var.repo_url + }) root_block_device { volume_size = 1000 diff --git a/infra/terraform.tfvars.example b/infra/terraform.tfvars.example index eae77ab..8e7e383 100644 --- 
a/infra/terraform.tfvars.example +++ b/infra/terraform.tfvars.example @@ -1,10 +1,12 @@ # Copy to terraform.tfvars and fill in your values vpc_id = "vpc-0abc123def456" -subnet_ids = ["subnet-0abc123", "subnet-0def456"] # 2+ subnets in different AZs -db_password = "change-me-to-something-secure" +subnet_ids = ["subnet-0abc123", "subnet-0def456"] # At least 1 subnet ssh_cidr = "203.0.113.50/32" # Your home IP +repo_url = "https://github.com/youruser/everytab.git" # Public repo — enables auto clone+build # Optional overrides: -# region = "us-east-1" -# ec2_instance_type = "c5.xlarge" +# region = "us-east-1" +# ec2_instance_type = "c5.2xlarge" +# db_instance_type = "i3.large" # scanning = true +# ec2_ami = "ami-XXXXXXXXXXXX" # Pin to prevent instance replacement