From bf8b932cdccdc677df69a2acb98a72416b32bd85 Mon Sep 17 00:00:00 2001 From: Joe Lothan Date: Mon, 25 May 2026 18:17:07 -0400 Subject: [PATCH] switched from rds to i3 ec2 for nvme disk read/write speeds --- infra/README.md | 35 +++++++--- infra/db-setup.sh | 157 +++++++++++++++++++++++++++++++++++++++++++++ infra/main.tf | 78 +++++++++++++--------- pipeline/README.md | 11 ++-- 4 files changed, 233 insertions(+), 48 deletions(-) create mode 100755 infra/db-setup.sh diff --git a/infra/README.md b/infra/README.md index 3aeef60..4c4d02b 100644 --- a/infra/README.md +++ b/infra/README.md @@ -30,22 +30,31 @@ git clone ~/everytab cd ~/everytab ``` -## 5. Database Setup +## 5. Database Instance (i3.large) -On the EC2 instance: +Spin up an i3.large in the same AZ as the compute instance. This provides 475GB local NVMe with 100K+ IOPS for Postgres — eliminates the EBS/RDS IOPS bottleneck. ```bash -# Add to .bashrc (get the URL from: terraform output -raw database_url) -echo "export DATABASE_URL='postgres://everytab:PASS@ENDPOINT:5432/everytab'" >> ~/.bashrc +# Launch i3.large (same subnet/AZ, same key pair, allow port 5432 from compute SG) +# Then SSH in and run: +bash ~/everytab/infra/db-setup.sh +``` + +This formats the NVMe, installs Postgres on it with aggressive write settings (`fsync=off`), creates the database, and applies the schema. + +On the **compute instance** (c5.2xlarge): + +```bash +# Use the private IP printed by db-setup.sh +echo "export DATABASE_URL='postgres://everytab@DB_PRIVATE_IP:5432/everytab'" >> ~/.bashrc source ~/.bashrc -# Test connection +# Test connectivity psql $DATABASE_URL -c 'SELECT 1;' - -# Create schema -psql $DATABASE_URL -f ~/everytab/pipeline/01_cc_index/schema.sql ``` +Note: the i3's local NVMe is ephemeral — data is lost on stop/terminate. Always `pg_dump` before teardown. + ## Pinning the EC2 AMI The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. 
If Amazon publishes a new one between applies, Terraform will want to replace your EC2 instance. @@ -67,7 +76,15 @@ Remove the `ec2_ami` line from tfvars when you want a fresh instance with the la ## Teardown (after backup) -Switch to serving-only mode (destroys EC2, RDS, icons bucket): +```bash +# Back up the database first +pg_dump -U everytab -Fc everytab > ~/everytab_dump.pgfc + +# Back up icons +rsync -avP ~/icons/ homelab:/backups/everytab/icons/ +``` + +Switch to serving-only mode (destroys EC2, icons bucket): ```bash terraform apply -var="scanning=false" diff --git a/infra/db-setup.sh b/infra/db-setup.sh new file mode 100755 index 0000000..0874d28 --- /dev/null +++ b/infra/db-setup.sh @@ -0,0 +1,157 @@ +#!/usr/bin/env bash +set -euo pipefail + +# EveryTab Postgres Setup — for dedicated i3 database instance +# Run on the i3.large EC2 instance (475GB local NVMe). +# Configures Postgres to use the NVMe for data storage. +# Optimized for write-heavy bulk loads — fsync disabled, large buffers. +# Data is ephemeral — back up with pg_dump before terminating the instance. + +echo "=== EveryTab Postgres Setup (i3 NVMe) ===" + +# --- Format and mount the NVMe drive --- +echo "--- Setting up NVMe storage ---" +NVME_DEV="/dev/nvme1n1" +NVME_MOUNT="/data" + +if [ ! -d "$NVME_MOUNT" ]; then + # Find the NVMe instance store (not the root EBS) + # i3.large has one 475GB NVMe at /dev/nvme1n1 or similar + if [ ! 
-b "$NVME_DEV" ]; then + # Try finding it + NVME_DEV=$(lsblk -dpno NAME,SIZE | grep -v "$(lsblk -dpno NAME /)" | head -1 | awk '{print $1}') + if [ -z "$NVME_DEV" ]; then + echo "ERROR: Could not find NVMe instance store device" + echo "Run 'lsblk' and set NVME_DEV manually" + exit 1 + fi + fi + echo "Using NVMe device: $NVME_DEV" + sudo mkfs.xfs -f "$NVME_DEV" + sudo mkdir -p "$NVME_MOUNT" + sudo mount "$NVME_DEV" "$NVME_MOUNT" + sudo chown ec2-user:ec2-user "$NVME_MOUNT" + echo "Mounted $NVME_DEV at $NVME_MOUNT" +else + echo "NVMe already mounted at $NVME_MOUNT" +fi + +# --- Install Postgres --- +echo "--- Installing PostgreSQL 16 ---" +if ! command -v pg_isready &>/dev/null; then + sudo dnf install -y postgresql16-server +fi + +# --- Init database on NVMe --- +echo "--- Initializing database on NVMe ---" +PG_DATA="$NVME_MOUNT/pgdata" +if [ ! -d "$PG_DATA" ]; then + sudo mkdir -p "$PG_DATA" + sudo chown postgres:postgres "$PG_DATA" + sudo -u postgres /usr/bin/initdb -D "$PG_DATA" +fi + +# --- Configure for pipeline workload --- +echo "--- Configuring for bulk load performance ---" +PRIVATE_IP=$(hostname -I | awk '{print $1}') + +sudo tee "$PG_DATA/postgresql.conf" > /dev/null < /dev/null <<'EOF' +# Local connections +local all all trust +host all all 127.0.0.1/32 trust +host all all ::1/128 trust +# VPC connections (from compute instance) +host all all 10.0.0.0/8 trust +host all all 172.16.0.0/12 trust +EOF + +# --- Start with custom data directory --- +echo "--- Starting PostgreSQL ---" +sudo tee /etc/systemd/system/postgresql-everytab.service > /dev/null </dev/null || true +sudo -u postgres psql -p 5432 -c "CREATE DATABASE everytab OWNER everytab;" 2>/dev/null || true + +# --- Apply schema --- +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SCHEMA="$SCRIPT_DIR/../pipeline/01_cc_index/schema.sql" +if [ -f "$SCHEMA" ]; then + echo "--- Applying schema ---" + psql -U everytab -h localhost -d everytab -f "$SCHEMA" +else + echo "Warning: schema.sql not found at 
$SCHEMA" + echo "Copy it over and run: psql -U everytab -h localhost -d everytab -f schema.sql" +fi + +# --- Validate --- +echo "" +echo "=== Validation ===" +pg_isready -h localhost +psql -U everytab -h localhost -d everytab -c "SELECT 'Postgres OK';" -t -A +echo "NVMe disk usage:" +df -h "$NVME_MOUNT" + +echo "" +echo "=== Setup Complete ===" +echo "" +echo "Private IP: $PRIVATE_IP" +echo "" +echo "Connection string (from compute instance):" +echo " export DATABASE_URL='postgres://everytab@${PRIVATE_IP}:5432/everytab'" +echo "" +echo "Connection string (local):" +echo " export DATABASE_URL='postgres://everytab@localhost:5432/everytab'" +echo "" +echo "IMPORTANT: Ensure the compute instance's security group allows" +echo "outbound traffic to this instance on port 5432, and this instance's" +echo "security group allows inbound on 5432 from the compute instance." diff --git a/infra/main.tf b/infra/main.tf index ac543d4..a05f082 100644 --- a/infra/main.tf +++ b/infra/main.tf @@ -28,13 +28,14 @@ variable "vpc_id" { } variable "subnet_ids" { - description = "At least 2 subnet IDs in different AZs (required for RDS subnet group)" + description = "Subnet IDs — both EC2 instances are placed in subnet_ids[0] (same AZ for low latency)" type = list(string) } variable "db_password" { - description = "Postgres master password" + description = "Unused — kept for tfvars compatibility. Local Postgres uses trust auth." type = string + default = "" sensitive = true } @@ -54,7 +55,7 @@ variable "ec2_ami" { } variable "scanning" { - description = "Set to true during scanning phase, false for serving-only (tears down EC2, RDS, icons bucket)" + description = "Set to true during scanning phase, false for serving-only (tears down EC2 instances)" type = bool default = true } @@ -116,18 +117,32 @@ resource "aws_security_group" "ec2" { } } -resource "aws_security_group" "rds" { +resource "aws_security_group" "db" { count = var.scanning ? 
1 : 0 - name = "everytab-rds" - description = "EveryTab RDS instance" + name = "everytab-db" + description = "EveryTab DB instance (Postgres on NVMe)" vpc_id = var.vpc_id + ingress { + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = [var.ssh_cidr] + } + ingress { from_port = 5432 to_port = 5432 protocol = "tcp" security_groups = [aws_security_group.ec2[0].id] } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } } # --- IAM --- @@ -310,31 +325,23 @@ resource "aws_s3_bucket_policy" "site" { }) } -# --- RDS --- +# --- DB Instance (i3.large with local NVMe for Postgres) --- -resource "aws_db_subnet_group" "main" { - count = var.scanning ? 1 : 0 - name = "everytab" - subnet_ids = var.subnet_ids +variable "db_instance_type" { + default = "i3.large" } -resource "aws_db_instance" "main" { - count = var.scanning ? 1 : 0 - identifier = "everytab" - engine = "postgres" - engine_version = "16" - instance_class = "db.t3.medium" - allocated_storage = 20 - storage_type = "gp3" - db_name = "everytab" - username = "everytab" - password = var.db_password - db_subnet_group_name = aws_db_subnet_group.main[0].name - vpc_security_group_ids = [aws_security_group.rds[0].id] - publicly_accessible = false - multi_az = false - backup_retention_period = 0 - skip_final_snapshot = true +resource "aws_instance" "db" { + count = var.scanning ? 1 : 0 + ami = var.ec2_ami != "" ? var.ec2_ami : data.aws_ami.al2023.id + instance_type = var.db_instance_type + key_name = aws_key_pair.ec2[0].key_name + vpc_security_group_ids = [aws_security_group.db[0].id] + subnet_id = var.subnet_ids[0] + + tags = { + Name = "everytab-db" + } } # --- EC2 --- @@ -364,13 +371,16 @@ output "ec2_public_ip" { value = var.scanning ? aws_instance.main[0].public_ip : null } -output "rds_endpoint" { - value = var.scanning ? aws_db_instance.main[0].endpoint : null +output "db_private_ip" { + value = var.scanning ? 
aws_instance.db[0].private_ip : null +} + +output "db_public_ip" { + value = var.scanning ? aws_instance.db[0].public_ip : null } output "database_url" { - value = var.scanning ? "postgres://everytab:${var.db_password}@${aws_db_instance.main[0].endpoint}/everytab" : null - sensitive = true + value = var.scanning ? "postgres://everytab@${aws_instance.db[0].private_ip}:5432/everytab" : null } output "ssh_private_key" { @@ -382,6 +392,10 @@ output "ssh_command" { value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.main[0].public_ip}" : null } +output "ssh_command_db" { + value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.db[0].public_ip}" : null +} + output "cloudfront_domain" { value = aws_cloudfront_distribution.site.domain_name } diff --git a/pipeline/README.md b/pipeline/README.md index 354a0fd..fa0e43a 100644 --- a/pipeline/README.md +++ b/pipeline/README.md @@ -7,11 +7,8 @@ Between stages, run the sanity checks to confirm data looks right before proceed ## Prerequisites ```bash -# Database URL in environment -export DATABASE_URL='postgres://everytab:PASS@RDS_ENDPOINT:5432/everytab' - -# Schema created -psql $DATABASE_URL -f pipeline/01_cc_index/schema.sql +# Postgres on i3 instance (run infra/db-setup.sh on the i3 first) +export DATABASE_URL='postgres://everytab@DB_PRIVATE_IP:5432/everytab' # Go binaries built on EC2 cd ~/everytab @@ -39,10 +36,10 @@ Fetches WARC records from CC's S3, extracts titles, icons, and iframe headers. ## Stage 3: Icon Download -Downloads favicons from the live web, validates, downloads to disk. +Downloads favicons from the live web, validates, writes to local disk. ```bash -./icon_download --db "$DATABASE_URL" --log-file icon_download.log --icons-dir icons/ --log-errors-only +GOMEMLIMIT=12GiB ./icon_download --db "$DATABASE_URL" --log-file icon_download.log --icons-dir ~/icons --log-errors-only ``` ## Stage 4: Best Icon Selection