From fcf203e1d81809e9fdfc99fa8438ab982a88e16f Mon Sep 17 00:00:00 2001 From: Joe Lothan Date: Sun, 17 May 2026 16:07:50 -0400 Subject: [PATCH] added infra setup with terraform --- .gitignore | 10 ++ PLAN.md | 127 +++++++--------- go.mod | 3 + infra/README.md | 61 ++++++++ infra/ec2-userdata.sh | 124 ++++++++++++++++ infra/main.tf | 253 ++++++++++++++++++++++++++++++++ infra/terraform.tfvars.example | 10 ++ pipeline/01_cc_index/schema.sql | 42 ++++++ 8 files changed, 556 insertions(+), 74 deletions(-) create mode 100644 .gitignore create mode 100644 go.mod create mode 100644 infra/README.md create mode 100755 infra/ec2-userdata.sh create mode 100644 infra/main.tf create mode 100644 infra/terraform.tfvars.example create mode 100644 pipeline/01_cc_index/schema.sql diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c22efac --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +stats/ +*.env +.env.* +infra/everytab-key +infra/everytab-key.pub +infra/.terraform/ +infra/.terraform.lock.hcl +infra/terraform.tfstate +infra/terraform.tfstate.backup +infra/terraform.tfvars diff --git a/PLAN.md b/PLAN.md index b4abc29..2c4d1a7 100644 --- a/PLAN.md +++ b/PLAN.md @@ -6,95 +6,55 @@ Each step has a clear deliverable and validation criteria. 
Steps are sequential --- -## Phase 0: Project Setup & AWS Infrastructure +## Phase 0: Project Setup & AWS Infrastructure [COMPLETED] -### Step 0.1: Repository Structure - -Create the project layout: +### Step 0.1: Repository Structure [COMPLETED] ``` everytab/ ├── design.md ├── ARCHITECTURE.md ├── PLAN.md -├── infra/ # AWS CLI scripts for setup/teardown -│ ├── setup.sh # Create RDS, S3 buckets, security groups -│ ├── teardown.sh # Delete non-permanent resources -│ └── ec2-userdata.sh # EC2 bootstrap (install Go, DuckDB, Unbound) +├── infra/ +│ ├── main.tf # Terraform: all AWS resources +│ ├── terraform.tfvars.example +│ ├── ec2-userdata.sh # EC2 bootstrap (Go, DuckDB, Unbound) +│ └── README.md # Setup steps ├── pipeline/ -│ ├── 01_cc_index/ # DuckDB query scripts -│ ├── 02_warc_parse/ # Go program -│ ├── 03_icon_download/# Go program -│ ├── 04_best_icon/ # SQL script -│ ├── 05_bundle_gen/ # Go program -│ └── 06_frontend/ # Build script, templates +│ ├── 01_cc_index/ +│ │ └── schema.sql # Postgres table definitions +│ ├── 02_warc_parse/ +│ ├── 03_icon_download/ +│ ├── 04_best_icon/ +│ ├── 05_bundle_gen/ +│ └── 06_frontend/ ├── frontend/ -│ ├── index.html -│ └── site.js -├── stats/ # Stats output from each stage (gitignored) -└── go.mod # Shared Go module for pipeline programs +├── stats/ # gitignored +└── go.mod ``` -**Done when:** Repo structure exists, `go.mod` initialized, `.gitignore` covers stats/ and any local config. +### Step 0.2: AWS Infrastructure (Terraform) [COMPLETED] -### Step 0.2: AWS Infrastructure (Manual CLI) +Infrastructure managed via `infra/main.tf`. 
Single file, uses `var.scanning` bool to switch phases: +- `terraform apply` — creates all scanning resources (EC2, RDS, S3 icons, S3 site, IAM, security groups) +- `terraform apply -var="scanning=false"` — destroys scanning resources, keeps site bucket +- `terraform destroy` — removes everything -Create resources using AWS CLI commands in `infra/setup.sh`: +Resources created: +- S3 `everytab-icons` (private), S3 `everytab-site` (for CloudFront later) +- RDS Postgres 16, db.t3.medium, 20GB gp3 +- EC2 c5.xlarge, Amazon Linux 2023, 50GB gp3 +- Security groups (SSH from home IP, RDS from EC2 only) +- IAM role + instance profile (S3 access only) +- SSH key (Terraform-managed ed25519) -1. **S3 buckets:** - - `everytab-icons` (private, no public access) - - `everytab-site` (private, accessed via CloudFront OAC) +### Step 0.3: EC2 Environment Setup [COMPLETED] -2. **RDS Postgres:** - - `db.t3.medium`, 20GB storage (expandable), Postgres 16 - - In a VPC, security group allows inbound 5432 from EC2 security group - - No public access (EC2 connects within VPC) - - No multi-AZ (dev, not production) - - Set a strong password, store in a local `.env` (gitignored) - -3. **EC2 instance:** - - `c5.xlarge` (4 vCPU, 8GB RAM) — enough for Go concurrency + Unbound cache - - Amazon Linux 2023 or Ubuntu 24.04 - - Security group: allow SSH (from your IP), allow outbound all - - Same VPC/subnet as RDS - - Key pair for SSH access - -4. **CloudFront distribution:** - - Origin: `everytab-site` S3 bucket (OAC) - - Default cache behavior: cache everything, Brotli+Gzip compression - - Can set up now or defer to Phase 2 - -5. **IAM role for EC2:** - - S3 read/write to both buckets - - Attach as instance profile - -**Validation:** SSH into EC2, confirm `psql` can connect to RDS, confirm `aws s3 ls` shows both buckets. - -**Done when:** All resources exist, EC2 can reach RDS and S3. - -### Step 0.3: EC2 Environment Setup - -Bootstrap script (`infra/ec2-userdata.sh` or run manually): - -1. 
Install Go (latest stable, 1.22+) -2. Install DuckDB CLI -3. Install Unbound, configure as recursive resolver: - - `/etc/unbound/unbound.conf`: recursive mode, no forwarding, listen on 127.0.0.1 - - High cache: `msg-cache-size: 512m`, `rrset-cache-size: 1g` - - `cache-min-ttl: 3600` - - `prefetch: yes` - - `num-threads: 4` -4. Set `/etc/resolv.conf` → `nameserver 127.0.0.1` -5. Install `psql` client, `pg_dump` -6. Confirm DuckDB httpfs extension works: `INSTALL httpfs; LOAD httpfs;` - -**Validation:** -- `go version` works -- `duckdb -c "INSTALL httpfs; LOAD httpfs; SELECT 1;"` works -- `dig example.com @127.0.0.1` resolves (Unbound working) -- `psql $DATABASE_URL -c "SELECT 1;"` connects to RDS - -**Done when:** EC2 is a working development environment for all pipeline stages. +Bootstrap via `infra/ec2-userdata.sh`: +- Go 1.22+, DuckDB (httpfs + postgres extensions), Unbound (recursive resolver), psql, tmux +- Unbound configured as system resolver (systemd-resolved disabled) +- DATABASE_URL in .bashrc +- Schema applied: hosts + icons tables with indexes --- @@ -715,3 +675,22 @@ On completion, each program prints a summary line and writes its stats JSON (wit - **Postgres connection limits:** RDS db.t3.medium has max_connections ≈ 80. With 1000 goroutines, we need connection pooling (pgx pool handles this). Set pool max to ~40 connections. - **S3 eventual consistency:** After uploading an icon, a HEAD request might not find it immediately. For dedup checks, handle "not found" gracefully (just upload again — idempotent since key is content hash). - **CloudFront caching:** After deploying new bundles, invalidate `/*` or set short TTL during development. For production, use long TTLs (bundles are immutable between crawls). + +--- + +## Progress Log + +### Phase 0 — Completed 2026-05-17 + +**Changes from original plan:** +- Replaced shell scripts (`setup.sh`, `teardown.sh`) with Terraform (`infra/main.tf`). 
Single file, `var.scanning` bool switches between scanning and serving phases. +- SSH key is Terraform-managed (no passphrase, stored in state) rather than manually generated. +- CloudFront distribution deferred — not created in Phase 0, will add to Terraform when frontend is ready. +- Added `infra/README.md` with terse setup steps for future replication. + +**Lessons learned:** +- Shell scripts with `2>/dev/null || echo "already exists"` swallow real errors. Terraform's declarative model avoids this entirely — errors are always surfaced. +- RDS requires a DB subnet group (2+ subnets in different AZs). The original shell script didn't create one, causing a silent failure. Terraform handles this dependency automatically. +- Amazon Linux 2023 uses `systemd-resolved` which manages `/etc/resolv.conf`. Must disable it before pointing resolv.conf at Unbound. `chattr +i` doesn't work on the symlink. +- AWS EC2 key pairs created via API don't support passphrases. Use `tls_private_key` in Terraform or generate locally with `ssh-keygen` + import. +- When an AWS key pair name already exists from a previous run, Terraform may not regenerate it. Use `-replace` to force recreation of the key + instance together. diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..9c09b1e --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/joe/everytab + +go 1.25.9 diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..e1e76da --- /dev/null +++ b/infra/README.md @@ -0,0 +1,61 @@ +# Infrastructure Setup + +## 1. Terraform + +```bash +cd infra +cp terraform.tfvars.example terraform.tfvars # fill in your values +terraform init +terraform apply +``` + +## 2. SSH Key + +```bash +terraform output -raw ssh_private_key > everytab-key && chmod 600 everytab-key +terraform output ssh_command # prints the ssh command +``` + +## 3. 
Bootstrap EC2
+
+```bash
+scp -i everytab-key ec2-userdata.sh ec2-user@EC2_PUBLIC_IP:~
+ssh -i everytab-key ec2-user@EC2_PUBLIC_IP 'bash ~/ec2-userdata.sh'
+```
+
+## 4. Clone Repo on EC2
+
+```bash
+git clone REPO_URL ~/everytab
+cd ~/everytab
+```
+
+## 5. Database Setup
+
+On the EC2 instance:
+
+```bash
+# Add to .bashrc (get the URL from: terraform output -raw database_url)
+echo "export DATABASE_URL='postgres://everytab:PASS@ENDPOINT:5432/everytab'" >> ~/.bashrc
+source ~/.bashrc
+
+# Test connection
+psql "$DATABASE_URL" -c 'SELECT 1;'
+
+# Create schema
+psql "$DATABASE_URL" -f ~/everytab/pipeline/01_cc_index/schema.sql
+```
+
+## Teardown (after backup)
+
+Switch to serving-only mode (destroys EC2, RDS, icons bucket):
+
+```bash
+terraform apply -var="scanning=false"
+```
+
+Full destroy (including the live site):
+
+```bash
+terraform destroy
+```
diff --git a/infra/ec2-userdata.sh b/infra/ec2-userdata.sh
new file mode 100755
index 0000000..510c342
--- /dev/null
+++ b/infra/ec2-userdata.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# EveryTab EC2 Bootstrap
+# Run this on the EC2 instance after first SSH connection.
+# Installs: Go, DuckDB, Unbound, psql, pg_dump
+
+echo "=== EveryTab EC2 Bootstrap ==="
+
+# --- System packages ---
+echo "--- Installing system packages ---"
+sudo dnf update -y
+sudo dnf install -y \
+    gcc \
+    git \
+    postgresql16 \
+    unbound \
+    jq \
+    htop \
+    tmux
+
+# --- Go ---
+echo "--- Installing Go ---"
+GO_VERSION="1.22.4"
+if ! command -v go &>/dev/null; then
+    curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" | sudo tar -C /usr/local -xz
+    echo 'export PATH=$PATH:/usr/local/go/bin:$HOME/go/bin' >> ~/.bashrc
+    export PATH=$PATH:/usr/local/go/bin:$HOME/go/bin
+fi
+go version
+
+# --- DuckDB ---
+echo "--- Installing DuckDB ---"
+DUCKDB_VERSION="1.1.0"
+if ! command -v duckdb &>/dev/null; then
+    curl -fsSL "https://github.com/duckdb/duckdb/releases/download/v${DUCKDB_VERSION}/duckdb_cli-linux-amd64.zip" -o /tmp/duckdb.zip
+    cd /tmp && unzip -o duckdb.zip && sudo mv duckdb /usr/local/bin/ && cd -
+fi
+duckdb -c "SELECT 'DuckDB OK';"
+
+# Install DuckDB extensions
+duckdb -c "INSTALL httpfs; INSTALL postgres;"
+echo "DuckDB extensions installed"
+
+# --- Unbound ---
+echo "--- Configuring Unbound ---"
+sudo tee /etc/unbound/unbound.conf > /dev/null <<'UNBOUNDCONF'
+server:
+    interface: 127.0.0.1
+    port: 53
+    access-control: 127.0.0.0/8 allow
+
+    # Performance
+    num-threads: 4
+    msg-cache-slabs: 4
+    rrset-cache-slabs: 4
+    infra-cache-slabs: 4
+    key-cache-slabs: 4
+
+    # Cache sizing (use available RAM)
+    msg-cache-size: 512m
+    rrset-cache-size: 1g
+    key-cache-size: 256m
+
+    # Aggressive caching
+    cache-min-ttl: 3600
+    cache-max-ttl: 86400
+    prefetch: yes
+    prefetch-key: yes
+
+    # Hardening
+    hide-identity: yes
+    hide-version: yes
+    harden-glue: yes
+    harden-dnssec-stripped: yes
+
+    # Logging (minimal)
+    verbosity: 1
+    log-queries: no
+
+    # Root hints
+    root-hints: "/etc/unbound/root.hints"
+
+remote-control:
+    control-enable: yes
+    control-interface: 127.0.0.1
+UNBOUNDCONF
+
+# Download root hints
+sudo curl -fsSL https://www.internic.net/domain/named.root -o /etc/unbound/root.hints
+
+# Disable systemd-resolved if present (it manages resolv.conf on AL2023)
+if systemctl is-active --quiet systemd-resolved 2>/dev/null; then
+    sudo systemctl disable --now systemd-resolved
+fi
+
+# Set system resolver to use Unbound
+sudo rm -f /etc/resolv.conf
+echo "nameserver 127.0.0.1" | sudo tee /etc/resolv.conf > /dev/null
+
+# Generate control keys BEFORE the first start (control-enable: yes loads them at startup); warn instead of hiding a failure
+sudo unbound-control-setup || echo "WARN: unbound-control-setup failed; unbound-control stats will be unavailable" >&2
+
+# Start and enable Unbound
+sudo systemctl enable unbound
+sudo systemctl restart unbound
+
+echo ""
+
+# --- Validation ---
+echo "=== Validation ==="
+echo -n "Go: "; go version
+echo -n "DuckDB: "; duckdb -c "SELECT version();" -noheader -csv
+echo -n "Unbound: "; dig +short example.com @127.0.0.1 | head -1
+echo -n "psql: "; psql --version
+
+echo ""
+echo "=== Bootstrap Complete ==="
+echo ""
+echo "Next: set up your database connection string."
+echo " export DATABASE_URL='postgres://everytab:PASSWORD@RDS_ENDPOINT:5432/everytab'"
+echo ""
+echo "Test connection:"
+echo " psql \$DATABASE_URL -c 'SELECT 1;'"
diff --git a/infra/main.tf b/infra/main.tf
new file mode 100644
index 0000000..d368eb8
--- /dev/null
+++ b/infra/main.tf
@@ -0,0 +1,253 @@
+terraform {
+  required_version = ">= 1.5"
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+    tls = {
+      source  = "hashicorp/tls"
+      version = "~> 4.0"
+    }
+  }
+}
+
+provider "aws" {
+  region = var.region
+}
+
+# --- Variables ---
+
+variable "region" {
+  default = "us-east-1"
+}
+
+variable "vpc_id" {
+  description = "VPC ID to deploy into"
+  type        = string
+}
+
+variable "subnet_ids" {
+  description = "At least 2 subnet IDs in different AZs (required for RDS subnet group)"
+  type        = list(string)
+}
+
+variable "db_password" {
+  description = "Postgres master password"
+  type        = string
+  sensitive   = true
+}
+
+variable "ssh_cidr" {
+  description = "CIDR block for SSH access (e.g., 203.0.113.50/32)"
+  type        = string
+}
+
+variable "ec2_instance_type" {
+  default = "c5.xlarge"
+}
+
+variable "scanning" {
+  description = "Set to true during scanning phase, false for serving-only (tears down EC2, RDS, icons bucket)"
+  type        = bool
+  default     = true
+}
+
+# --- Data sources ---
+
+data "aws_ami" "al2023" {
+  most_recent = true
+  owners      = ["amazon"]
+  filter {
+    name   = "name"
+    values = ["al2023-ami-2023*-x86_64"]
+  }
+  filter {
+    name   = "state"
+    values = ["available"]
+  }
+}
+
+# --- SSH Key ---
+
+resource "tls_private_key" "ec2" {
+  count     = var.scanning ? 1 : 0
+  algorithm = "ED25519"
+}
+
+resource "aws_key_pair" "ec2" {
+  count      = var.scanning ? 1 : 0
+  key_name   = "everytab-key"
+  public_key = tls_private_key.ec2[0].public_key_openssh
+}
+
+# --- Security Groups ---
+
+resource "aws_security_group" "ec2" {
+  count       = var.scanning ? 1 : 0
+  name        = "everytab-ec2"
+  description = "EveryTab EC2 instance"
+  vpc_id      = var.vpc_id
+
+  ingress {
+    from_port   = 22
+    to_port     = 22
+    protocol    = "tcp"
+    cidr_blocks = [var.ssh_cidr]
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+}
+
+resource "aws_security_group" "rds" {
+  count       = var.scanning ? 1 : 0
+  name        = "everytab-rds"
+  description = "EveryTab RDS instance"
+  vpc_id      = var.vpc_id
+
+  ingress {
+    from_port       = 5432
+    to_port         = 5432
+    protocol        = "tcp"
+    security_groups = [aws_security_group.ec2[0].id]
+  }
+}
+
+# --- IAM ---
+
+resource "aws_iam_role" "ec2" {
+  count = var.scanning ? 1 : 0
+  name  = "everytab-ec2-role"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Effect    = "Allow"
+      Principal = { Service = "ec2.amazonaws.com" }
+      Action    = "sts:AssumeRole"
+    }]
+  })
+}
+
+resource "aws_iam_role_policy" "s3_access" {
+  count = var.scanning ? 1 : 0
+  name  = "everytab-s3-access"
+  role  = aws_iam_role.ec2[0].id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Effect = "Allow"
+      Action = ["s3:GetObject", "s3:PutObject", "s3:DeleteObject", "s3:ListBucket"] # HEAD requests are authorized by s3:GetObject; "s3:HeadObject" is not an IAM action
+      Resource = [
+        aws_s3_bucket.icons[0].arn,
+        "${aws_s3_bucket.icons[0].arn}/*",
+        aws_s3_bucket.site.arn,
+        "${aws_s3_bucket.site.arn}/*",
+      ]
+    }]
+  })
+}
+
+resource "aws_iam_instance_profile" "ec2" {
+  count = var.scanning ? 1 : 0
+  name  = "everytab-ec2-profile"
+  role  = aws_iam_role.ec2[0].name
+}
+
+# --- S3 ---
+
+resource "aws_s3_bucket" "icons" {
+  count  = var.scanning ? 1 : 0
+  bucket = "everytab-icons"
+}
+
+resource "aws_s3_bucket_public_access_block" "icons" {
+  count                   = var.scanning ? 1 : 0
+  bucket                  = aws_s3_bucket.icons[0].id
+  block_public_acls       = true
+  block_public_policy     = true
+  ignore_public_acls      = true
+  restrict_public_buckets = true
+}
+
+resource "aws_s3_bucket" "site" {
+  bucket = "everytab-site"
+}
+
+# --- RDS ---
+
+resource "aws_db_subnet_group" "main" {
+  count      = var.scanning ? 1 : 0
+  name       = "everytab"
+  subnet_ids = var.subnet_ids
+}
+
+resource "aws_db_instance" "main" {
+  count                   = var.scanning ? 1 : 0
+  identifier              = "everytab"
+  engine                  = "postgres"
+  engine_version          = "16"
+  instance_class          = "db.t3.medium"
+  allocated_storage       = 20
+  storage_type            = "gp3"
+  db_name                 = "everytab"
+  username                = "everytab"
+  password                = var.db_password
+  db_subnet_group_name    = aws_db_subnet_group.main[0].name
+  vpc_security_group_ids  = [aws_security_group.rds[0].id]
+  publicly_accessible     = false
+  multi_az                = false
+  backup_retention_period = 0
+  skip_final_snapshot     = true
+}
+
+# --- EC2 ---
+
+resource "aws_instance" "main" {
+  count                  = var.scanning ? 1 : 0
+  ami                    = data.aws_ami.al2023.id
+  instance_type          = var.ec2_instance_type
+  key_name               = aws_key_pair.ec2[0].key_name
+  vpc_security_group_ids = [aws_security_group.ec2[0].id]
+  subnet_id              = var.subnet_ids[0]
+  iam_instance_profile   = aws_iam_instance_profile.ec2[0].name
+
+  root_block_device {
+    volume_size = 50
+    volume_type = "gp3"
+  }
+
+  tags = {
+    Name = "everytab"
+  }
+}
+
+# --- Outputs ---
+
+output "ec2_public_ip" {
+  value = var.scanning ? aws_instance.main[0].public_ip : null
+}
+
+output "rds_endpoint" {
+  value = var.scanning ? aws_db_instance.main[0].endpoint : null
+}
+
+output "database_url" {
+  value     = var.scanning ? "postgres://everytab:${urlencode(var.db_password)}@${aws_db_instance.main[0].endpoint}/everytab" : null # password is percent-encoded so characters such as @ / : ? # cannot break the URI
+  sensitive = true
+}
+
+output "ssh_private_key" {
+  value     = var.scanning ? tls_private_key.ec2[0].private_key_openssh : null
+  sensitive = true
+}
+
+output "ssh_command" {
+  value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.main[0].public_ip}" : null
+}
diff --git a/infra/terraform.tfvars.example b/infra/terraform.tfvars.example
new file mode 100644
index 0000000..eae77ab
--- /dev/null
+++ b/infra/terraform.tfvars.example
@@ -0,0 +1,10 @@
+# Copy to terraform.tfvars and fill in your values
+vpc_id      = "vpc-0abc123def456"
+subnet_ids  = ["subnet-0abc123", "subnet-0def456"] # 2+ subnets in different AZs
+db_password = "change-me-to-something-secure"
+ssh_cidr    = "203.0.113.50/32" # Your home IP
+
+# Optional overrides:
+# region = "us-east-1"
+# ec2_instance_type = "c5.xlarge"
+# scanning = true
diff --git a/pipeline/01_cc_index/schema.sql b/pipeline/01_cc_index/schema.sql
new file mode 100644
index 0000000..e10bd0f
--- /dev/null
+++ b/pipeline/01_cc_index/schema.sql
@@ -0,0 +1,42 @@
+-- EveryTab database schema
+-- Run once after RDS is available:
+-- psql $DATABASE_URL -f schema.sql
+
+CREATE TABLE IF NOT EXISTS hosts (
+    id                 SERIAL PRIMARY KEY,
+    hostname           TEXT NOT NULL UNIQUE,
+    protocol           TEXT NOT NULL,
+    crawl_id           TEXT NOT NULL,
+    warc_filename      TEXT NOT NULL,
+    warc_record_offset BIGINT NOT NULL,
+    warc_record_length INT NOT NULL,
+    html_title         TEXT,
+    iframe_allowed     BOOLEAN,
+    best_icon_s3_key   TEXT,
+    parsed             BOOLEAN NOT NULL DEFAULT FALSE -- NOT NULL: a NULL would match neither parsed = FALSE nor the idx_hosts_unparsed predicate
+);
+
+CREATE TABLE IF NOT EXISTS icons (
+    id           SERIAL PRIMARY KEY,
+    host_id      INT NOT NULL REFERENCES hosts(id),
+    url          TEXT NOT NULL,
+    source       TEXT NOT NULL,
+    rel_type     TEXT,
+    rel_sizes    TEXT,
+    content_type TEXT,
+    width        INT,
+    height       INT,
+    file_size    INT,
+    s3_key       TEXT,
+    scan_state   TEXT NOT NULL DEFAULT 'unscanned', -- NOT NULL: a NULL state would never match scan_state = 'unscanned'
+    error        TEXT
+);
+
+-- Partial index: only unscanned icons (shrinks as work completes)
+CREATE INDEX IF NOT EXISTS idx_icons_unscanned ON icons(id) WHERE scan_state = 'unscanned';
+
+-- For best-icon selection join
+CREATE INDEX IF NOT EXISTS idx_icons_host_id ON icons(host_id);
+
+-- For WARC parsing cursor
+CREATE INDEX IF NOT EXISTS idx_hosts_unparsed ON hosts(id) WHERE parsed = FALSE;