From bf8b932cdccdc677df69a2acb98a72416b32bd85 Mon Sep 17 00:00:00 2001
From: Joe Lothan <joe@lothan.net>
Date: Mon, 25 May 2026 18:17:07 -0400
Subject: [PATCH] switched from rds to i3 ec2 for nvme disk read/write speeds

---
 infra/README.md    |  35 +++++++---
 infra/db-setup.sh  | 157 +++++++++++++++++++++++++++++++++++++++++++++
 infra/main.tf      |  78 +++++++++++++---------
 pipeline/README.md |  11 ++--
 4 files changed, 233 insertions(+), 48 deletions(-)
 create mode 100755 infra/db-setup.sh
diff --git a/infra/README.md b/infra/README.md
index 3aeef60..4c4d02b 100644
--- a/infra/README.md
+++ b/infra/README.md
@@ -30,22 +30,31 @@ git clone <your-repo-url> ~/everytab
 cd ~/everytab
 ```
 
-## 5. Database Setup
+## 5. Database Instance (i3.large)
 
-On the EC2 instance:
+Spin up an i3.large in the same AZ as the compute instance. This provides 475GB local NVMe with 100K+ IOPS for Postgres — eliminates the EBS/RDS IOPS bottleneck.
 
 ```bash
-# Add to .bashrc (get the URL from: terraform output -raw database_url)
-echo "export DATABASE_URL='postgres://everytab:PASS@ENDPOINT:5432/everytab'" >> ~/.bashrc
+# Launch i3.large (same subnet/AZ, same key pair, allow port 5432 from compute SG)
+# Then SSH in and run:
+bash ~/everytab/infra/db-setup.sh
+```
+
+This formats the NVMe, installs Postgres on it with aggressive write settings (`fsync=off`), creates the database, and applies the schema.
+
+On the **compute instance** (c5.2xlarge):
+
+```bash
+# Use the private IP printed by db-setup.sh (also available via: terraform output -raw database_url)
+echo "export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'" >> ~/.bashrc
 source ~/.bashrc
 
-# Test connection
+# Test connectivity
 psql $DATABASE_URL -c 'SELECT 1;'
-
-# Create schema
-psql $DATABASE_URL -f ~/everytab/pipeline/01_cc_index/schema.sql
 ```
 
+Note: the i3's local NVMe is ephemeral — data is lost on stop/terminate. Always `pg_dump` before teardown.
+
 ## Pinning the EC2 AMI
 
 The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. If Amazon publishes a new one between applies, Terraform will want to replace your EC2 instance.
@@ -67,7 +76,15 @@ Remove the `ec2_ami` line from tfvars when you want a fresh instance with the la
 
 ## Teardown (after backup)
 
-Switch to serving-only mode (destroys EC2, RDS, icons bucket):
+```bash
+# Back up the database first, then copy the dump off EC2 (teardown destroys the instances)
+pg_dump -U everytab -Fc everytab > ~/everytab_dump.pgfc
+
+# Back up icons
+rsync -avP ~/icons/ homelab:/backups/everytab/icons/
+```
+
+Switch to serving-only mode (destroys EC2, icons bucket):
 
 ```bash
 terraform apply -var="scanning=false"
diff --git a/infra/db-setup.sh b/infra/db-setup.sh
new file mode 100755
index 0000000..0874d28
--- /dev/null
+++ b/infra/db-setup.sh
@@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# EveryTab Postgres Setup — for dedicated i3 database instance
+# Run on the i3.large EC2 instance (475GB local NVMe).
+# Configures Postgres to use the NVMe for data storage.
+# Optimized for write-heavy bulk loads — fsync disabled, large buffers.
+# Data is ephemeral — back up with pg_dump before terminating the instance.
+
+echo "=== EveryTab Postgres Setup (i3 NVMe) ==="
+
+# --- Format and mount the NVMe drive ---
+echo "--- Setting up NVMe storage ---"
+NVME_DEV="${NVME_DEV:-/dev/nvme1n1}"  # export NVME_DEV to override the default name
+NVME_MOUNT="/data"
+
+# Test for a live mount: a bare directory would put Postgres on the root volume.
+if ! mountpoint -q "$NVME_MOUNT"; then
+    if [ ! -b "$NVME_DEV" ]; then
+        # Fall back to the first disk whose model marks it as instance storage.
+        NVME_DEV=$(lsblk -dpno NAME,MODEL | awk '/Instance Storage/ && !seen++ {print $1}')
+        if [ -z "$NVME_DEV" ]; then
+            echo "ERROR: Could not find NVMe instance store device"
+            echo "Run 'lsblk' and set NVME_DEV manually"
+            exit 1
+        fi
+    fi
+    echo "Using NVMe device: $NVME_DEV"
+    # Format only a blank device so a re-run after reboot keeps existing data.
+    sudo blkid -p "$NVME_DEV" >/dev/null 2>&1 || sudo mkfs.xfs -f "$NVME_DEV"
+    sudo mkdir -p "$NVME_MOUNT"
+    sudo mount "$NVME_DEV" "$NVME_MOUNT"
+    sudo chown ec2-user:ec2-user "$NVME_MOUNT"
+    echo "Mounted $NVME_DEV at $NVME_MOUNT"
+else
+    echo "NVMe already mounted at $NVME_MOUNT"
+fi
+
+# --- Install Postgres ---
+echo "--- Installing PostgreSQL 16 ---"
+if ! command -v initdb &>/dev/null; then  # probe a server binary; pg_isready is a client tool
+    sudo dnf install -y postgresql16-server
+fi
+
+# --- Init database on NVMe (PG_VERSION marks a completed initdb) ---
+echo "--- Initializing database on NVMe ---"
+PG_DATA="$NVME_MOUNT/pgdata"
+if ! sudo test -f "$PG_DATA/PG_VERSION"; then  # sudo: the data dir is private to postgres
+    sudo mkdir -p "$PG_DATA"
+    sudo chown postgres:postgres "$PG_DATA"
+    sudo -u postgres /usr/bin/initdb -D "$PG_DATA"
+fi
+
+# --- Configure for pipeline workload ---
+echo "--- Configuring for bulk load performance ---"
+PRIVATE_IP=$(hostname -I | awk '{print $1}')
+
+sudo tee "$PG_DATA/postgresql.conf" > /dev/null <<EOF
+# Connection — listen on private IP for compute instance
+listen_addresses = 'localhost,$PRIVATE_IP'
+port = 5432
+max_connections = 100
+
+# Memory — i3.large has 15.25GB RAM, Postgres gets most of it
+shared_buffers = 8GB
+work_mem = 512MB
+maintenance_work_mem = 2GB
+effective_cache_size = 12GB
+
+# Write performance — data is reproducible, maximize speed over durability
+fsync = off
+synchronous_commit = off
+full_page_writes = off
+
+# WAL — minimal logging since no replication needed
+wal_level = minimal
+max_wal_senders = 0
+max_wal_size = 8GB
+checkpoint_timeout = 30min
+checkpoint_completion_target = 0.9
+
+# Autovacuum
+autovacuum = on
+autovacuum_naptime = 60s
+EOF
+
+# VPC clients (10.0.0.0/8, 172.16.0.0/12) get trust for the everytab role/db only, never the postgres superuser
+sudo tee "$PG_DATA/pg_hba.conf" > /dev/null <<'EOF'
+# Local connections
+local   all   all                 trust
+host    all   all   127.0.0.1/32  trust
+host    all   all   ::1/128       trust
+# VPC connections (from compute instance) - everytab role and database only
+host    everytab   everytab   10.0.0.0/8    trust
+host    everytab   everytab   172.16.0.0/12 trust
+EOF
+
+# --- Start with custom data directory ---
+echo "--- Starting PostgreSQL ---"
+sudo tee /etc/systemd/system/postgresql-everytab.service > /dev/null <<EOF
+[Unit]
+Description=PostgreSQL for EveryTab (NVMe)
+After=network.target
+
+[Service]
+Type=forking
+User=postgres
+ExecStart=/usr/bin/pg_ctl start -D $PG_DATA -l $PG_DATA/pg.log
+ExecStop=/usr/bin/pg_ctl stop -D $PG_DATA
+ExecReload=/usr/bin/pg_ctl reload -D $PG_DATA
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+sudo systemctl daemon-reload
+sudo systemctl enable postgresql-everytab
+sudo systemctl restart postgresql-everytab
+
+# --- Create database (check-then-create: reruns are no-ops, real failures abort the script) ---
+echo "--- Creating everytab database ---"
+sudo -u postgres psql -p 5432 -tAc "SELECT 1 FROM pg_roles WHERE rolname='everytab'" | grep -q 1 || sudo -u postgres psql -p 5432 -c "CREATE USER everytab;"
+sudo -u postgres psql -p 5432 -tAc "SELECT 1 FROM pg_database WHERE datname='everytab'" | grep -q 1 || sudo -u postgres psql -p 5432 -c "CREATE DATABASE everytab OWNER everytab;"
+
+# --- Apply schema ---
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SCHEMA="$SCRIPT_DIR/../pipeline/01_cc_index/schema.sql"
+if [ -f "$SCHEMA" ]; then
+    echo "--- Applying schema ---"
+    psql -U everytab -h localhost -d everytab -f "$SCHEMA"
+else
+    echo "Warning: schema.sql not found at $SCHEMA"
+    echo "Copy it over and run: psql -U everytab -h localhost -d everytab -f schema.sql"
+fi
+
+# --- Validate ---
+echo ""
+echo "=== Validation ==="
+pg_isready -h localhost
+psql -U everytab -h localhost -d everytab -c "SELECT 'Postgres OK';" -t -A
+echo "NVMe disk usage:"
+df -h "$NVME_MOUNT"
+
+echo ""
+echo "=== Setup Complete ==="
+echo ""
+echo "Private IP: $PRIVATE_IP"
+echo ""
+echo "Connection string (from compute instance):"
+echo "  export DATABASE_URL='postgres://everytab@${PRIVATE_IP}:5432/everytab'"
+echo ""
+echo "Connection string (local):"
+echo "  export DATABASE_URL='postgres://everytab@localhost:5432/everytab'"
+echo ""
+echo "IMPORTANT: Ensure the compute instance's security group allows"
+echo "outbound traffic to this instance on port 5432, and this instance's"
+echo "security group allows inbound on 5432 from the compute instance."
diff --git a/infra/main.tf b/infra/main.tf
index ac543d4..a05f082 100644
--- a/infra/main.tf
+++ b/infra/main.tf
@@ -28,13 +28,14 @@ variable "vpc_id" {
 }
 
 variable "subnet_ids" {
-  description = "At least 2 subnet IDs in different AZs (required for RDS subnet group)"
+  description = "Subnet IDs — both EC2 instances are placed in subnet_ids[0] (same AZ for low latency)"
   type        = list(string)
 }
 
 variable "db_password" {
-  description = "Postgres master password"
+  description = "Unused — kept for tfvars compatibility. Local Postgres uses trust auth."
   type        = string
+  default     = ""
   sensitive   = true
 }
 
@@ -54,7 +55,7 @@ variable "ec2_ami" {
 }
 
 variable "scanning" {
-  description = "Set to true during scanning phase, false for serving-only (tears down EC2, RDS, icons bucket)"
+  description = "Set to true during scanning phase, false for serving-only (tears down EC2 instances)"
   type        = bool
   default     = true
 }
@@ -116,18 +117,32 @@ resource "aws_security_group" "ec2" {
   }
 }
 
-resource "aws_security_group" "rds" {
+resource "aws_security_group" "db" {
   count       = var.scanning ? 1 : 0
-  name        = "everytab-rds"
-  description = "EveryTab RDS instance"
+  name        = "everytab-db"
+  description = "EveryTab DB instance (Postgres on NVMe)"
   vpc_id      = var.vpc_id
 
+  ingress {
+    from_port       = 22
+    to_port         = 22
+    protocol        = "tcp"
+    cidr_blocks     = [var.ssh_cidr]
+  }
+
   ingress {
     from_port       = 5432
     to_port         = 5432
     protocol        = "tcp"
     security_groups = [aws_security_group.ec2[0].id]
   }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
 }
 
 # --- IAM ---
@@ -310,31 +325,23 @@ resource "aws_s3_bucket_policy" "site" {
   })
 }
 
-# --- RDS ---
+# --- DB Instance (i3.large with local NVMe for Postgres) ---
 
-resource "aws_db_subnet_group" "main" {
-  count      = var.scanning ? 1 : 0
-  name       = "everytab"
-  subnet_ids = var.subnet_ids
+variable "db_instance_type" {
+  default = "i3.large"
 }
 
-resource "aws_db_instance" "main" {
-  count                  = var.scanning ? 1 : 0
-  identifier             = "everytab"
-  engine                 = "postgres"
-  engine_version         = "16"
-  instance_class         = "db.t3.medium"
-  allocated_storage      = 20
-  storage_type           = "gp3"
-  db_name                = "everytab"
-  username               = "everytab"
-  password               = var.db_password
-  db_subnet_group_name   = aws_db_subnet_group.main[0].name
-  vpc_security_group_ids = [aws_security_group.rds[0].id]
-  publicly_accessible    = false
-  multi_az               = false
-  backup_retention_period = 0
-  skip_final_snapshot    = true
+resource "aws_instance" "db" {
+  count                = var.scanning ? 1 : 0
+  ami                  = var.ec2_ami != "" ? var.ec2_ami : data.aws_ami.al2023.id
+  instance_type        = var.db_instance_type
+  key_name             = aws_key_pair.ec2[0].key_name
+  vpc_security_group_ids = [aws_security_group.db[0].id]
+  subnet_id            = var.subnet_ids[0]
+
+  tags = {
+    Name = "everytab-db"
+  }
 }
 
 # --- EC2 ---
@@ -364,13 +371,16 @@ output "ec2_public_ip" {
   value = var.scanning ? aws_instance.main[0].public_ip : null
 }
 
-output "rds_endpoint" {
-  value = var.scanning ? aws_db_instance.main[0].endpoint : null
+output "db_private_ip" {
+  value = var.scanning ? aws_instance.db[0].private_ip : null
+}
+
+output "db_public_ip" {
+  value = var.scanning ? aws_instance.db[0].public_ip : null
 }
 
 output "database_url" {
-  value     = var.scanning ? "postgres://everytab:${var.db_password}@${aws_db_instance.main[0].endpoint}/everytab" : null
-  sensitive = true
+  value = var.scanning ? "postgres://everytab@${aws_instance.db[0].private_ip}:5432/everytab" : null
 }
 
 output "ssh_private_key" {
@@ -382,6 +392,10 @@ output "ssh_command" {
   value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.main[0].public_ip}" : null
 }
 
+output "ssh_command_db" {
+  value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.db[0].public_ip}" : null
+}
+
 output "cloudfront_domain" {
   value = aws_cloudfront_distribution.site.domain_name
 }
diff --git a/pipeline/README.md b/pipeline/README.md
index 354a0fd..fa0e43a 100644
--- a/pipeline/README.md
+++ b/pipeline/README.md
@@ -7,11 +7,8 @@ Between stages, run the sanity checks to confirm data looks right before proceed
 ## Prerequisites
 
 ```bash
-# Database URL in environment
-export DATABASE_URL='postgres://everytab:PASS@RDS_ENDPOINT:5432/everytab'
-
-# Schema created
-psql $DATABASE_URL -f pipeline/01_cc_index/schema.sql
+# Postgres on i3 instance (run infra/db-setup.sh on the i3 first)
+export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'
 
 # Go binaries built on EC2
 cd ~/everytab
@@ -39,10 +36,10 @@ Fetches WARC records from CC's S3, extracts titles, icons, and iframe headers.
 
 ## Stage 3: Icon Download
 
-Downloads favicons from the live web, validates, downloads to disk.
+Downloads favicons from the live web, validates, writes to local disk.
 
 ```bash
-./icon_download --db "$DATABASE_URL" --log-file icon_download.log --icons-dir icons/ --log-errors-only
+GOMEMLIMIT=12GiB ./icon_download --db "$DATABASE_URL" --log-file icon_download.log --icons-dir ~/icons --log-errors-only
 ```
 
 ## Stage 4: Best Icon Selection