switched from rds to i3 ec2 for nvme disk read/write speeds
This commit is contained in:
parent
c93d1736fe
commit
bf8b932cdc
4 changed files with 233 additions and 48 deletions
|
|
@ -30,22 +30,31 @@ git clone <your-repo-url> ~/everytab
|
|||
cd ~/everytab
|
||||
```
|
||||
|
||||
## 5. Database Setup
|
||||
## 5. Database Instance (i3.large)
|
||||
|
||||
On the EC2 instance:
|
||||
Spin up an i3.large in the same AZ as the compute instance. This provides 475GB local NVMe with 100K+ IOPS for Postgres — eliminates the EBS/RDS IOPS bottleneck.
|
||||
|
||||
```bash
|
||||
# Add to .bashrc (get the URL from: terraform output -raw database_url)
|
||||
echo "export DATABASE_URL='postgres://everytab:PASS@ENDPOINT:5432/everytab'" >> ~/.bashrc
|
||||
# Launch i3.large (same subnet/AZ, same key pair, allow port 5432 from compute SG)
|
||||
# Then SSH in and run:
|
||||
bash ~/everytab/infra/db-setup.sh
|
||||
```
|
||||
|
||||
This formats the NVMe, installs Postgres on it with aggressive write settings (`fsync=off`), creates the database, and applies the schema.
|
||||
|
||||
On the **compute instance** (c5.2xlarge):
|
||||
|
||||
```bash
|
||||
# Use the private IP printed by db-setup.sh
|
||||
echo "export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'" >> ~/.bashrc
|
||||
source ~/.bashrc
|
||||
|
||||
# Test connection
|
||||
# Test connectivity
|
||||
psql $DATABASE_URL -c 'SELECT 1;'
|
||||
|
||||
# Create schema
|
||||
psql $DATABASE_URL -f ~/everytab/pipeline/01_cc_index/schema.sql
|
||||
```
|
||||
|
||||
Note: the i3's local NVMe is ephemeral — data is lost on stop/terminate. Always `pg_dump` before teardown.
|
||||
|
||||
## Pinning the EC2 AMI
|
||||
|
||||
The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. If Amazon publishes a new one between applies, Terraform will want to replace your EC2 instance.
|
||||
|
|
@ -67,7 +76,15 @@ Remove the `ec2_ami` line from tfvars when you want a fresh instance with the la
|
|||
|
||||
## Teardown (after backup)
|
||||
|
||||
Switch to serving-only mode (destroys EC2, RDS, icons bucket):
|
||||
```bash
|
||||
# Back up the database first (run on the DB instance; from the compute instance add -h <i3-private-ip>)
|
||||
pg_dump -U everytab -Fc everytab > ~/everytab_dump.pgfc
|
||||
|
||||
# Back up icons
|
||||
rsync -avP ~/icons/ homelab:/backups/everytab/icons/
|
||||
```
|
||||
|
||||
Switch to serving-only mode (destroys EC2, icons bucket):
|
||||
|
||||
```bash
|
||||
terraform apply -var="scanning=false"
|
||||
|
|
|
|||
157
infra/db-setup.sh
Executable file
157
infra/db-setup.sh
Executable file
|
|
@ -0,0 +1,157 @@
|
|||
#!/usr/bin/env bash
#
# EveryTab Postgres Setup — dedicated i3 database instance.
#
# Where to run: on the i3.large EC2 instance (475GB local NVMe).
# What it does: places the Postgres data directory on the NVMe and tunes the
#               server for write-heavy bulk loads (fsync disabled, large
#               buffers).
# Caveat:       the data is ephemeral — back it up with pg_dump before
#               terminating the instance.

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -o errexit -o nounset -o pipefail

echo "=== EveryTab Postgres Setup (i3 NVMe) ==="
|
||||
|
||||
# --- Format and mount the NVMe drive ---
# Ensures the instance-store NVMe is mounted at $NVME_MOUNT.
#   NVME_DEV    block device to use. May be preset in the environment;
#               otherwise /dev/nvme1n1 is tried and, if that is not a block
#               device, the first unmounted whole disk other than the root
#               disk is selected.
#   NVME_MOUNT  mount point for the data volume.
# A device that already carries a filesystem is mounted as-is rather than
# reformatted, so re-running the script does not wipe existing data.
echo "--- Setting up NVMe storage ---"
NVME_DEV="${NVME_DEV:-/dev/nvme1n1}"
NVME_MOUNT="/data"

# Test for an active mount, not merely the directory: this script never writes
# /etc/fstab, so the directory can exist while nothing is mounted on it.
if mountpoint -q "$NVME_MOUNT"; then
  echo "NVMe already mounted at $NVME_MOUNT"
else
  if [ ! -b "$NVME_DEV" ]; then
    # The default name is not present (the store may show up as
    # "/dev/nvme1n1 or similar"), so search for it. Resolve the disk backing
    # the root filesystem first so it can be excluded from the candidates.
    root_part=$(findmnt -no SOURCE / || true)
    root_disk=""
    if [ -n "$root_part" ]; then
      # PKNAME is the parent disk of a partition; it is empty when the root
      # filesystem sits directly on a whole disk.
      root_disk=$(lsblk -ndpo PKNAME "$root_part" 2>/dev/null || true)
    fi
    root_disk="${root_disk:-$root_part}"
    if [ -z "$root_disk" ]; then
      echo "ERROR: Could not determine the root disk; refusing to guess" >&2
      echo "Run 'lsblk' and set NVME_DEV manually (NVME_DEV=/dev/<device> $0)" >&2
      exit 1
    fi

    # First whole disk that is not the root disk and has nothing mounted.
    NVME_DEV=$(
      lsblk -dnpo NAME,TYPE,MOUNTPOINT \
        | awk -v root="$root_disk" \
            '!picked && $2 == "disk" && $1 != root && $3 == "" { print $1; picked = 1 }'
    ) || NVME_DEV=""
    if [ -z "$NVME_DEV" ]; then
      echo "ERROR: Could not find NVMe instance store device" >&2
      echo "Run 'lsblk' and set NVME_DEV manually (NVME_DEV=/dev/<device> $0)" >&2
      exit 1
    fi
  fi

  echo "Using NVMe device: $NVME_DEV"
  # blkid -p probes the device directly; it succeeds only when a filesystem
  # (or other known signature) is already present.
  if sudo blkid -p "$NVME_DEV" > /dev/null 2>&1; then
    echo "Existing filesystem detected on $NVME_DEV — mounting without reformatting"
  else
    sudo mkfs.xfs -f "$NVME_DEV"
  fi
  sudo mkdir -p "$NVME_MOUNT"
  sudo mount "$NVME_DEV" "$NVME_MOUNT"
  sudo chown ec2-user:ec2-user "$NVME_MOUNT"
  echo "Mounted $NVME_DEV at $NVME_MOUNT"
fi
|
||||
|
||||
# --- Install Postgres ---
# Installs the PostgreSQL 16 server package unless the server binaries this
# script invokes later (/usr/bin/initdb and /usr/bin/pg_ctl) are already
# present. Probing for pg_isready is not sufficient: it is a client utility
# and may be installed without the server.
echo "--- Installing PostgreSQL 16 ---"
if [ ! -x /usr/bin/initdb ] || [ ! -x /usr/bin/pg_ctl ]; then
  sudo dnf install -y postgresql16-server
fi
|
||||
|
||||
# --- Init database on NVMe ---
# Creates the cluster under $NVME_MOUNT/pgdata if one is not there yet.
# PG_DATA is reused by the configuration and service steps further down.
echo "--- Initializing database on NVMe ---"
PG_DATA="$NVME_MOUNT/pgdata"

# Key the check on PG_VERSION (written by initdb) rather than on the directory
# itself, so a run that created the directory but failed before or during
# initdb is retried instead of being mistaken for a finished cluster.
# 'sudo test' is used because the directory is owned by postgres and may not
# be readable by the invoking user.
if ! sudo test -f "$PG_DATA/PG_VERSION"; then
  sudo mkdir -p "$PG_DATA"
  sudo chown postgres:postgres "$PG_DATA"
  sudo -u postgres /usr/bin/initdb -D "$PG_DATA"
fi
|
||||
|
||||
# --- Configure for pipeline workload ---
# Overwrites $PG_DATA/postgresql.conf (the heredoc replaces the whole file)
# with settings that trade durability for bulk-load speed. PRIVATE_IP is the
# first address reported by 'hostname -I'; it is written into
# listen_addresses and printed again in the final summary.
echo "--- Configuring for bulk load performance ---"
PRIVATE_IP=$(hostname -I | awk '{print $1}')

# An empty address would produce listen_addresses = 'localhost,' and an
# unusable connection string, so stop here with a clear message instead.
if [ -z "$PRIVATE_IP" ]; then
  echo "ERROR: Could not determine this instance's private IP (hostname -I returned nothing)" >&2
  exit 1
fi

sudo tee "$PG_DATA/postgresql.conf" > /dev/null <<EOF
# Connection — listen on private IP for compute instance
listen_addresses = 'localhost,$PRIVATE_IP'
port = 5432
max_connections = 100

# Memory — i3.large has 15.25GB RAM, Postgres gets most of it
shared_buffers = 8GB
work_mem = 512MB
maintenance_work_mem = 2GB
effective_cache_size = 12GB

# Write performance — data is reproducible, maximize speed over durability
fsync = off
synchronous_commit = off
full_page_writes = off

# WAL — minimal logging since no replication needed
wal_level = minimal
max_wal_senders = 0
max_wal_size = 8GB
checkpoint_timeout = 30min
checkpoint_completion_target = 0.9

# Autovacuum
autovacuum = on
autovacuum_naptime = 60s
EOF
|
||||
|
||||
# Client authentication: replace pg_hba.conf wholesale. Local connections and
# the private ranges 10.0.0.0/8 and 172.16.0.0/12 (which cover most VPC CIDRs)
# are accepted with trust auth, i.e. without a password.
# The quoted delimiter keeps the heredoc literal (no shell expansion).
cat <<'EOF' | sudo tee "$PG_DATA/pg_hba.conf" > /dev/null
# Local connections
local all all trust
host all all 127.0.0.1/32 trust
host all all ::1/128 trust
# VPC connections (from compute instance)
host all all 10.0.0.0/8 trust
host all all 172.16.0.0/12 trust
EOF
|
||||
|
||||
# --- Start with custom data directory ---
# Installs a dedicated systemd unit that runs Postgres from $PG_DATA via
# pg_ctl, enables it at boot and (re)starts it so the configuration written
# above takes effect. $PG_DATA is expanded now, while the unit file is
# written, so the unit contains absolute paths.
echo "--- Starting PostgreSQL ---"
sudo tee /etc/systemd/system/postgresql-everytab.service > /dev/null <<EOF
[Unit]
Description=PostgreSQL for EveryTab (NVMe)
# The server binds a specific private IP (listen_addresses), so wait until the
# network is actually configured rather than merely until networking started.
After=network-online.target
Wants=network-online.target

[Service]
Type=forking
User=postgres
ExecStart=/usr/bin/pg_ctl start -D $PG_DATA -l $PG_DATA/pg.log
ExecStop=/usr/bin/pg_ctl stop -D $PG_DATA
ExecReload=/usr/bin/pg_ctl reload -D $PG_DATA

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable postgresql-everytab
sudo systemctl restart postgresql-everytab
|
||||
|
||||
# --- Create database ---
# Creates the 'everytab' role and the 'everytab' database owned by it.
# Existence is checked explicitly so the step can be re-run safely while any
# real failure (server down, permission problem) still aborts the script with
# psql's own error message, instead of being hidden.
echo "--- Creating everytab database ---"

role_exists=$(sudo -u postgres psql -p 5432 -tAc \
  "SELECT 1 FROM pg_roles WHERE rolname = 'everytab'")
if [ "$role_exists" != "1" ]; then
  sudo -u postgres psql -p 5432 -c "CREATE USER everytab;"
fi

db_exists=$(sudo -u postgres psql -p 5432 -tAc \
  "SELECT 1 FROM pg_database WHERE datname = 'everytab'")
if [ "$db_exists" != "1" ]; then
  sudo -u postgres psql -p 5432 -c "CREATE DATABASE everytab OWNER everytab;"
fi
|
||||
|
||||
# --- Apply schema ---
# Loads pipeline/01_cc_index/schema.sql, located relative to this script, into
# the everytab database. A missing schema file only produces a warning with
# the command to run by hand; it does not stop the script.
script_path=$(dirname "$0")
SCRIPT_DIR=$(cd "$script_path" && pwd)
SCHEMA="$SCRIPT_DIR/../pipeline/01_cc_index/schema.sql"

if [ ! -f "$SCHEMA" ]; then
  echo "Warning: schema.sql not found at $SCHEMA"
  echo "Copy it over and run: psql -U everytab -h localhost -d everytab -f schema.sql"
else
  echo "--- Applying schema ---"
  psql -U everytab -h localhost -d everytab -f "$SCHEMA"
fi
|
||||
|
||||
# --- Validate ---
# Confirms the server accepts connections over TCP on localhost as the
# everytab role, shows disk usage of the data volume, then prints the
# connection strings and a security-group reminder.
printf '\n%s\n' "=== Validation ==="
pg_isready -h localhost
psql -U everytab -h localhost -d everytab -c "SELECT 'Postgres OK';" -t -A
echo "NVMe disk usage:"
df -h "$NVME_MOUNT"

# Unquoted delimiter: $PRIVATE_IP is expanded, single quotes stay literal.
cat <<EOF

=== Setup Complete ===

Private IP: $PRIVATE_IP

Connection string (from compute instance):
 export DATABASE_URL='postgres://everytab@${PRIVATE_IP}:5432/everytab'

Connection string (local):
 export DATABASE_URL='postgres://everytab@localhost:5432/everytab'

IMPORTANT: Ensure the compute instance's security group allows
outbound traffic to this instance on port 5432, and this instance's
security group allows inbound on 5432 from the compute instance.
EOF
|
||||
|
|
@ -28,13 +28,14 @@ variable "vpc_id" {
|
|||
}
|
||||
|
||||
variable "subnet_ids" {
|
||||
description = "At least 2 subnet IDs in different AZs (required for RDS subnet group)"
|
||||
description = "Subnet IDs — both EC2 instances are placed in subnet_ids[0] (same AZ for low latency)"
|
||||
type = list(string)
|
||||
}
|
||||
|
||||
variable "db_password" {
|
||||
description = "Postgres master password"
|
||||
description = "Unused — kept for tfvars compatibility. Local Postgres uses trust auth."
|
||||
type = string
|
||||
default = ""
|
||||
sensitive = true
|
||||
}
|
||||
|
||||
|
|
@ -54,7 +55,7 @@ variable "ec2_ami" {
|
|||
}
|
||||
|
||||
variable "scanning" {
|
||||
description = "Set to true during scanning phase, false for serving-only (tears down EC2, RDS, icons bucket)"
|
||||
description = "Set to true during scanning phase, false for serving-only (tears down EC2 instances)"
|
||||
type = bool
|
||||
default = true
|
||||
}
|
||||
|
|
@ -116,18 +117,32 @@ resource "aws_security_group" "ec2" {
|
|||
}
|
||||
}
|
||||
|
||||
resource "aws_security_group" "rds" {
|
||||
resource "aws_security_group" "db" {
|
||||
count = var.scanning ? 1 : 0
|
||||
name = "everytab-rds"
|
||||
description = "EveryTab RDS instance"
|
||||
name = "everytab-db"
|
||||
description = "EveryTab DB instance (Postgres on NVMe)"
|
||||
vpc_id = var.vpc_id
|
||||
|
||||
ingress {
|
||||
from_port = 22
|
||||
to_port = 22
|
||||
protocol = "tcp"
|
||||
cidr_blocks = [var.ssh_cidr]
|
||||
}
|
||||
|
||||
ingress {
|
||||
from_port = 5432
|
||||
to_port = 5432
|
||||
protocol = "tcp"
|
||||
security_groups = [aws_security_group.ec2[0].id]
|
||||
}
|
||||
|
||||
egress {
|
||||
from_port = 0
|
||||
to_port = 0
|
||||
protocol = "-1"
|
||||
cidr_blocks = ["0.0.0.0/0"]
|
||||
}
|
||||
}
|
||||
|
||||
# --- IAM ---
|
||||
|
|
@ -310,31 +325,23 @@ resource "aws_s3_bucket_policy" "site" {
|
|||
})
|
||||
}
|
||||
|
||||
# --- RDS ---
|
||||
# --- DB Instance (i3.large with local NVMe for Postgres) ---
|
||||
|
||||
resource "aws_db_subnet_group" "main" {
|
||||
count = var.scanning ? 1 : 0
|
||||
name = "everytab"
|
||||
subnet_ids = var.subnet_ids
|
||||
variable "db_instance_type" {
|
||||
default = "i3.large"
|
||||
}
|
||||
|
||||
resource "aws_db_instance" "main" {
|
||||
resource "aws_instance" "db" {
|
||||
count = var.scanning ? 1 : 0
|
||||
identifier = "everytab"
|
||||
engine = "postgres"
|
||||
engine_version = "16"
|
||||
instance_class = "db.t3.medium"
|
||||
allocated_storage = 20
|
||||
storage_type = "gp3"
|
||||
db_name = "everytab"
|
||||
username = "everytab"
|
||||
password = var.db_password
|
||||
db_subnet_group_name = aws_db_subnet_group.main[0].name
|
||||
vpc_security_group_ids = [aws_security_group.rds[0].id]
|
||||
publicly_accessible = false
|
||||
multi_az = false
|
||||
backup_retention_period = 0
|
||||
skip_final_snapshot = true
|
||||
ami = var.ec2_ami != "" ? var.ec2_ami : data.aws_ami.al2023.id
|
||||
instance_type = var.db_instance_type
|
||||
key_name = aws_key_pair.ec2[0].key_name
|
||||
vpc_security_group_ids = [aws_security_group.db[0].id]
|
||||
subnet_id = var.subnet_ids[0]
|
||||
|
||||
tags = {
|
||||
Name = "everytab-db"
|
||||
}
|
||||
}
|
||||
|
||||
# --- EC2 ---
|
||||
|
|
@ -364,13 +371,16 @@ output "ec2_public_ip" {
|
|||
value = var.scanning ? aws_instance.main[0].public_ip : null
|
||||
}
|
||||
|
||||
output "rds_endpoint" {
|
||||
value = var.scanning ? aws_db_instance.main[0].endpoint : null
|
||||
output "db_private_ip" {
|
||||
value = var.scanning ? aws_instance.db[0].private_ip : null
|
||||
}
|
||||
|
||||
output "db_public_ip" {
|
||||
value = var.scanning ? aws_instance.db[0].public_ip : null
|
||||
}
|
||||
|
||||
output "database_url" {
|
||||
value = var.scanning ? "postgres://everytab:${var.db_password}@${aws_db_instance.main[0].endpoint}/everytab" : null
|
||||
sensitive = true
|
||||
value = var.scanning ? "postgres://everytab@${aws_instance.db[0].private_ip}:5432/everytab" : null
|
||||
}
|
||||
|
||||
output "ssh_private_key" {
|
||||
|
|
@ -382,6 +392,10 @@ output "ssh_command" {
|
|||
value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.main[0].public_ip}" : null
|
||||
}
|
||||
|
||||
output "ssh_command_db" {
|
||||
value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.db[0].public_ip}" : null
|
||||
}
|
||||
|
||||
output "cloudfront_domain" {
|
||||
value = aws_cloudfront_distribution.site.domain_name
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,11 +7,8 @@ Between stages, run the sanity checks to confirm data looks right before proceed
|
|||
## Prerequisites
|
||||
|
||||
```bash
|
||||
# Database URL in environment
|
||||
export DATABASE_URL='postgres://everytab:PASS@RDS_ENDPOINT:5432/everytab'
|
||||
|
||||
# Schema created
|
||||
psql $DATABASE_URL -f pipeline/01_cc_index/schema.sql
|
||||
# Postgres on i3 instance (run infra/db-setup.sh on the i3 first)
|
||||
export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'
|
||||
|
||||
# Go binaries built on EC2
|
||||
cd ~/everytab
|
||||
|
|
@ -39,10 +36,10 @@ Fetches WARC records from CC's S3, extracts titles, icons, and iframe headers.
|
|||
|
||||
## Stage 3: Icon Download
|
||||
|
||||
Downloads favicons from the live web, validates, downloads to disk.
|
||||
Downloads favicons from the live web, validates, writes to local disk.
|
||||
|
||||
```bash
|
||||
./icon_download --db "$DATABASE_URL" --log-file icon_download.log --icons-dir icons/ --log-errors-only
|
||||
GOMEMLIMIT=12GiB ./icon_download --db "$DATABASE_URL" --log-file icon_download.log --icons-dir ~/icons --log-errors-only
|
||||
```
|
||||
|
||||
## Stage 4: Best Icon Selection
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue