switched from rds to i3 ec2 for nvme disk read/write speeds
This commit is contained in:
parent
c93d1736fe
commit
bf8b932cdc
4 changed files with 233 additions and 48 deletions
|
|
@ -30,22 +30,31 @@ git clone <your-repo-url> ~/everytab
|
||||||
cd ~/everytab
|
cd ~/everytab
|
||||||
```
|
```
|
||||||
|
|
||||||
## 5. Database Setup
|
## 5. Database Instance (i3.large)
|
||||||
|
|
||||||
On the EC2 instance:
|
Spin up an i3.large in the same AZ as the compute instance. This provides 475GB local NVMe with 100K+ IOPS for Postgres — eliminates the EBS/RDS IOPS bottleneck.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Add to .bashrc (get the URL from: terraform output -raw database_url)
|
# Launch i3.large (same subnet/AZ, same key pair, allow port 5432 from compute SG)
|
||||||
echo "export DATABASE_URL='postgres://everytab:PASS@ENDPOINT:5432/everytab'" >> ~/.bashrc
|
# Then SSH in and run:
|
||||||
|
bash ~/everytab/infra/db-setup.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
This formats the NVMe, installs Postgres on it with aggressive write settings (`fsync=off`), creates the database, and applies the schema.
|
||||||
|
|
||||||
|
On the **compute instance** (c5.2xlarge):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Use the private IP printed by db-setup.sh
|
||||||
|
echo "export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'" >> ~/.bashrc
|
||||||
source ~/.bashrc
|
source ~/.bashrc
|
||||||
|
|
||||||
# Test connection
|
# Test connectivity
|
||||||
psql $DATABASE_URL -c 'SELECT 1;'
|
psql $DATABASE_URL -c 'SELECT 1;'
|
||||||
|
|
||||||
# Create schema
|
|
||||||
psql $DATABASE_URL -f ~/everytab/pipeline/01_cc_index/schema.sql
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Note: the i3's local NVMe is ephemeral — data is lost on stop/terminate. Always `pg_dump` before teardown.
|
||||||
|
|
||||||
## Pinning the EC2 AMI
|
## Pinning the EC2 AMI
|
||||||
|
|
||||||
The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. If Amazon publishes a new one between applies, Terraform will want to replace your EC2 instance.
|
The `data.aws_ami` lookup fetches the latest Amazon Linux 2023 AMI. If Amazon publishes a new one between applies, Terraform will want to replace your EC2 instance.
|
||||||
|
|
@ -67,7 +76,15 @@ Remove the `ec2_ami` line from tfvars when you want a fresh instance with the la
|
||||||
|
|
||||||
## Teardown (after backup)
|
## Teardown (after backup)
|
||||||
|
|
||||||
Switch to serving-only mode (destroys EC2, RDS, icons bucket):
|
```bash
|
||||||
|
# Back up the database first (run on the i3 DB instance, then copy the dump off the instance before teardown)
|
||||||
|
pg_dump -U everytab -Fc everytab > ~/everytab_dump.pgfc
|
||||||
|
|
||||||
|
# Back up icons
|
||||||
|
rsync -avP ~/icons/ homelab:/backups/everytab/icons/
|
||||||
|
```
|
||||||
|
|
||||||
|
Switch to serving-only mode (destroys the compute and DB EC2 instances and the icons bucket):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
terraform apply -var="scanning=false"
|
terraform apply -var="scanning=false"
|
||||||
|
|
|
||||||
157
infra/db-setup.sh
Executable file
157
infra/db-setup.sh
Executable file
|
|
@ -0,0 +1,157 @@
|
||||||
|
#!/usr/bin/env bash
#
# EveryTab Postgres Setup — for dedicated i3 database instance
#
# Run on the i3.large EC2 instance (475GB local NVMe).
# Mounts the NVMe instance store at /data, installs PostgreSQL 16 with its
# data directory on that volume, writes a bulk-load configuration (fsync
# disabled, large buffers), creates the everytab role and database, and
# applies the schema if it is found next to this script.
#
# Data is ephemeral — back up with pg_dump before terminating the instance.
#
# Environment:
#   NVME_DEV  Optional block device to use for the data volume. When it is
#             unset, or does not name a block device, the first disk that
#             does not hold the root filesystem is used instead.
#
# The script is meant to be re-runnable: every step checks whether its work
# has already been done before doing it again.
set -euo pipefail

readonly NVME_MOUNT="/data"
readonly PG_DATA="$NVME_MOUNT/pgdata"
NVME_DEV="${NVME_DEV:-/dev/nvme1n1}"
PRIVATE_IP=""

# Print an error to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

#######################################
# Print the first whole disk that does not back the root filesystem.
# Prints nothing when the root device cannot be determined or no other
# disk exists, so the caller can report the failure.
# Device names vary by instance family (the instance store is not always
# /dev/nvme1n1), which is why the root disk is excluded by lookup rather
# than by name.
#######################################
find_instance_store() {
  local root_src root_parent root_disk

  root_src=$(findmnt -no SOURCE /) || root_src=""
  if [[ -z "$root_src" ]]; then
    # Without knowing the root device we cannot safely pick another disk.
    return 0
  fi

  # PKNAME is the parent disk of the root partition; it is empty when the
  # root filesystem sits directly on a whole disk.
  root_parent=$(lsblk -no PKNAME "$root_src" 2>/dev/null | head -n 1) || root_parent=""
  if [[ -n "$root_parent" ]]; then
    root_disk="/dev/$root_parent"
  else
    root_disk="$root_src"
  fi

  lsblk -dpno NAME,TYPE \
    | awk -v root="$root_disk" '$2 == "disk" && $1 != root && !seen { print $1; seen = 1 }'
}

#######################################
# Format (if needed) and mount the NVMe instance store at NVME_MOUNT.
# Globals: NVME_DEV (read, possibly rewritten), NVME_MOUNT (read)
#######################################
setup_storage() {
  echo "--- Setting up NVMe storage ---"

  # Test for an actual mount, not just the directory: a leftover /data
  # directory on the root volume must not be mistaken for the NVMe drive.
  if mountpoint -q "$NVME_MOUNT"; then
    echo "NVMe already mounted at $NVME_MOUNT"
    return 0
  fi

  if [[ ! -b "$NVME_DEV" ]]; then
    # Try finding it
    NVME_DEV=$(find_instance_store) || true
    if [[ -z "$NVME_DEV" ]]; then
      echo "ERROR: Could not find NVMe instance store device" >&2
      echo "Run 'lsblk' and set NVME_DEV manually" >&2
      exit 1
    fi
  fi
  echo "Using NVMe device: $NVME_DEV"

  # Never format a device that is in use (e.g. the root disk).
  if [[ -n "$(lsblk -no MOUNTPOINT "$NVME_DEV" | tr -d '[:space:]')" ]]; then
    die "$NVME_DEV (or one of its partitions) is already mounted; refusing to format it"
  fi

  # After a plain reboot the instance store still holds the filesystem
  # created by an earlier run; reuse it instead of wiping the database.
  local fs_type
  fs_type=$(sudo blkid -o value -s TYPE "$NVME_DEV" || true)
  if [[ "$fs_type" == "xfs" ]]; then
    echo "Existing XFS filesystem found on $NVME_DEV; mounting without reformatting"
  else
    sudo mkfs.xfs -f "$NVME_DEV"
  fi

  sudo mkdir -p "$NVME_MOUNT"
  sudo mount "$NVME_DEV" "$NVME_MOUNT"
  sudo chown ec2-user:ec2-user "$NVME_MOUNT"
  echo "Mounted $NVME_DEV at $NVME_MOUNT"
}

# Install the PostgreSQL 16 server package unless a server binary exists.
install_postgres() {
  echo "--- Installing PostgreSQL 16 ---"
  # Probe a server-side binary: pg_isready ships with the client tools and
  # can be present without the server being installed.
  if ! command -v initdb &>/dev/null; then
    sudo dnf install -y postgresql16-server
  fi
}

# Create the cluster under PG_DATA unless one is already initialized there.
init_cluster() {
  echo "--- Initializing database on NVMe ---"
  # PG_VERSION only exists after a successful initdb, so a failed earlier
  # attempt is retried rather than silently skipped.
  if ! sudo test -f "$PG_DATA/PG_VERSION"; then
    sudo mkdir -p "$PG_DATA"
    sudo chown postgres:postgres "$PG_DATA"
    sudo -u postgres /usr/bin/initdb -D "$PG_DATA"
  fi
}

#######################################
# Write postgresql.conf and pg_hba.conf for the pipeline workload.
# Globals: PRIVATE_IP (written), PG_DATA (read)
#######################################
write_config() {
  echo "--- Configuring for bulk load performance ---"
  PRIVATE_IP=$(hostname -I | awk '{print $1}')
  if [[ -z "$PRIVATE_IP" ]]; then
    die "could not determine this instance's private IP (hostname -I returned nothing)"
  fi

  # NOTE(review): work_mem applies per sort/hash operation, so 512MB with
  # up to 100 connections can exceed RAM alongside 8GB of shared_buffers —
  # confirm the pipeline's real concurrency before relying on these values.
  sudo tee "$PG_DATA/postgresql.conf" > /dev/null <<EOF
# Connection — listen on private IP for compute instance
listen_addresses = 'localhost,$PRIVATE_IP'
port = 5432
max_connections = 100

# Memory — i3.large has 15.25GB RAM, Postgres gets most of it
shared_buffers = 8GB
work_mem = 512MB
maintenance_work_mem = 2GB
effective_cache_size = 12GB

# Write performance — data is reproducible, maximize speed over durability
fsync = off
synchronous_commit = off
full_page_writes = off

# WAL — minimal logging since no replication needed
wal_level = minimal
max_wal_senders = 0
max_wal_size = 8GB
checkpoint_timeout = 30min
checkpoint_completion_target = 0.9

# Autovacuum
autovacuum = on
autovacuum_naptime = 60s
EOF

  # Allow connections from VPC (10.0.0.0/8 and 172.16.0.0/12 cover most VPC CIDRs).
  # Trust auth means there is no password: access control is left entirely
  # to the security group in front of port 5432.
  sudo tee "$PG_DATA/pg_hba.conf" > /dev/null <<'EOF'
# Local connections
local all all trust
host all all 127.0.0.1/32 trust
host all all ::1/128 trust
# VPC connections (from compute instance)
host all all 10.0.0.0/8 trust
host all all 172.16.0.0/12 trust
EOF
}

# Install and (re)start a systemd unit that runs Postgres from PG_DATA.
start_service() {
  echo "--- Starting PostgreSQL ---"
  sudo tee /etc/systemd/system/postgresql-everytab.service > /dev/null <<EOF
[Unit]
Description=PostgreSQL for EveryTab (NVMe)
After=network.target

[Service]
Type=forking
User=postgres
ExecStart=/usr/bin/pg_ctl start -D $PG_DATA -l $PG_DATA/pg.log
ExecStop=/usr/bin/pg_ctl stop -D $PG_DATA
ExecReload=/usr/bin/pg_ctl reload -D $PG_DATA

[Install]
WantedBy=multi-user.target
EOF

  sudo systemctl daemon-reload
  sudo systemctl enable postgresql-everytab
  sudo systemctl restart postgresql-everytab
}

# Create the everytab role and database if they do not exist yet.
create_database() {
  echo "--- Creating everytab database ---"
  local role_exists db_exists

  # Check explicitly instead of discarding errors, so that a server that
  # failed to start is reported here rather than further down.
  role_exists=$(sudo -u postgres psql -p 5432 -tAc \
    "SELECT 1 FROM pg_roles WHERE rolname = 'everytab'")
  if [[ "$role_exists" != "1" ]]; then
    sudo -u postgres psql -p 5432 -c "CREATE USER everytab;"
  fi

  db_exists=$(sudo -u postgres psql -p 5432 -tAc \
    "SELECT 1 FROM pg_database WHERE datname = 'everytab'")
  if [[ "$db_exists" != "1" ]]; then
    sudo -u postgres psql -p 5432 -c "CREATE DATABASE everytab OWNER everytab;"
  fi
}

# Apply schema.sql from the repository checkout this script lives in.
apply_schema() {
  local script_dir schema
  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  schema="$script_dir/../pipeline/01_cc_index/schema.sql"

  if [[ -f "$schema" ]]; then
    echo "--- Applying schema ---"
    psql -U everytab -h localhost -d everytab -f "$schema"
  else
    echo "Warning: schema.sql not found at $schema"
    echo "Copy it over and run: psql -U everytab -h localhost -d everytab -f schema.sql"
  fi
}

# Check that the server answers and show how full the NVMe volume is.
validate() {
  echo ""
  echo "=== Validation ==="
  pg_isready -h localhost
  psql -U everytab -h localhost -d everytab -c "SELECT 'Postgres OK';" -t -A
  echo "NVMe disk usage:"
  df -h "$NVME_MOUNT"
}

# Print connection strings and the security-group reminder.
print_summary() {
  echo ""
  echo "=== Setup Complete ==="
  echo ""
  echo "Private IP: $PRIVATE_IP"
  echo ""
  echo "Connection string (from compute instance):"
  echo "  export DATABASE_URL='postgres://everytab@${PRIVATE_IP}:5432/everytab'"
  echo ""
  echo "Connection string (local):"
  echo "  export DATABASE_URL='postgres://everytab@localhost:5432/everytab'"
  echo ""
  echo "IMPORTANT: Ensure the compute instance's security group allows"
  echo "outbound traffic to this instance on port 5432, and this instance's"
  echo "security group allows inbound on 5432 from the compute instance."
}

main() {
  echo "=== EveryTab Postgres Setup (i3 NVMe) ==="
  setup_storage
  install_postgres
  init_cluster
  write_config
  start_service
  create_database
  apply_schema
  validate
  print_summary
}

main "$@"
|
||||||
|
|
@ -28,13 +28,14 @@ variable "vpc_id" {
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "subnet_ids" {
|
variable "subnet_ids" {
|
||||||
description = "At least 2 subnet IDs in different AZs (required for RDS subnet group)"
|
description = "Subnet IDs — both EC2 instances are placed in subnet_ids[0] (same AZ for low latency)"
|
||||||
type = list(string)
|
type = list(string)
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "db_password" {
|
variable "db_password" {
|
||||||
description = "Postgres master password"
|
description = "Unused — kept for tfvars compatibility. Local Postgres uses trust auth."
|
||||||
type = string
|
type = string
|
||||||
|
default = ""
|
||||||
sensitive = true
|
sensitive = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -54,7 +55,7 @@ variable "ec2_ami" {
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "scanning" {
|
variable "scanning" {
|
||||||
description = "Set to true during scanning phase, false for serving-only (tears down EC2, RDS, icons bucket)"
|
description = "Set to true during scanning phase, false for serving-only (tears down EC2 instances)"
|
||||||
type = bool
|
type = bool
|
||||||
default = true
|
default = true
|
||||||
}
|
}
|
||||||
|
|
@ -116,18 +117,32 @@ resource "aws_security_group" "ec2" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group" "rds" {
|
resource "aws_security_group" "db" {
|
||||||
count = var.scanning ? 1 : 0
|
count = var.scanning ? 1 : 0
|
||||||
name = "everytab-rds"
|
name = "everytab-db"
|
||||||
description = "EveryTab RDS instance"
|
description = "EveryTab DB instance (Postgres on NVMe)"
|
||||||
vpc_id = var.vpc_id
|
vpc_id = var.vpc_id
|
||||||
|
|
||||||
|
ingress {
|
||||||
|
from_port = 22
|
||||||
|
to_port = 22
|
||||||
|
protocol = "tcp"
|
||||||
|
cidr_blocks = [var.ssh_cidr]
|
||||||
|
}
|
||||||
|
|
||||||
ingress {
|
ingress {
|
||||||
from_port = 5432
|
from_port = 5432
|
||||||
to_port = 5432
|
to_port = 5432
|
||||||
protocol = "tcp"
|
protocol = "tcp"
|
||||||
security_groups = [aws_security_group.ec2[0].id]
|
security_groups = [aws_security_group.ec2[0].id]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
egress {
|
||||||
|
from_port = 0
|
||||||
|
to_port = 0
|
||||||
|
protocol = "-1"
|
||||||
|
cidr_blocks = ["0.0.0.0/0"]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- IAM ---
|
# --- IAM ---
|
||||||
|
|
@ -310,31 +325,23 @@ resource "aws_s3_bucket_policy" "site" {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- RDS ---
|
# --- DB Instance (i3.large with local NVMe for Postgres) ---
|
||||||
|
|
||||||
resource "aws_db_subnet_group" "main" {
|
variable "db_instance_type" {
|
||||||
count = var.scanning ? 1 : 0
|
default = "i3.large"
|
||||||
name = "everytab"
|
|
||||||
subnet_ids = var.subnet_ids
|
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_db_instance" "main" {
|
resource "aws_instance" "db" {
|
||||||
count = var.scanning ? 1 : 0
|
count = var.scanning ? 1 : 0
|
||||||
identifier = "everytab"
|
ami = var.ec2_ami != "" ? var.ec2_ami : data.aws_ami.al2023.id
|
||||||
engine = "postgres"
|
instance_type = var.db_instance_type
|
||||||
engine_version = "16"
|
key_name = aws_key_pair.ec2[0].key_name
|
||||||
instance_class = "db.t3.medium"
|
vpc_security_group_ids = [aws_security_group.db[0].id]
|
||||||
allocated_storage = 20
|
subnet_id = var.subnet_ids[0]
|
||||||
storage_type = "gp3"
|
|
||||||
db_name = "everytab"
|
tags = {
|
||||||
username = "everytab"
|
Name = "everytab-db"
|
||||||
password = var.db_password
|
}
|
||||||
db_subnet_group_name = aws_db_subnet_group.main[0].name
|
|
||||||
vpc_security_group_ids = [aws_security_group.rds[0].id]
|
|
||||||
publicly_accessible = false
|
|
||||||
multi_az = false
|
|
||||||
backup_retention_period = 0
|
|
||||||
skip_final_snapshot = true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- EC2 ---
|
# --- EC2 ---
|
||||||
|
|
@ -364,13 +371,16 @@ output "ec2_public_ip" {
|
||||||
value = var.scanning ? aws_instance.main[0].public_ip : null
|
value = var.scanning ? aws_instance.main[0].public_ip : null
|
||||||
}
|
}
|
||||||
|
|
||||||
output "rds_endpoint" {
|
output "db_private_ip" {
|
||||||
value = var.scanning ? aws_db_instance.main[0].endpoint : null
|
value = var.scanning ? aws_instance.db[0].private_ip : null
|
||||||
|
}
|
||||||
|
|
||||||
|
output "db_public_ip" {
|
||||||
|
value = var.scanning ? aws_instance.db[0].public_ip : null
|
||||||
}
|
}
|
||||||
|
|
||||||
output "database_url" {
|
output "database_url" {
|
||||||
value = var.scanning ? "postgres://everytab:${var.db_password}@${aws_db_instance.main[0].endpoint}/everytab" : null
|
value = var.scanning ? "postgres://everytab@${aws_instance.db[0].private_ip}:5432/everytab" : null
|
||||||
sensitive = true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
output "ssh_private_key" {
|
output "ssh_private_key" {
|
||||||
|
|
@ -382,6 +392,10 @@ output "ssh_command" {
|
||||||
value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.main[0].public_ip}" : null
|
value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.main[0].public_ip}" : null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
output "ssh_command_db" {
|
||||||
|
value = var.scanning ? "ssh -i everytab-key ec2-user@${aws_instance.db[0].public_ip}" : null
|
||||||
|
}
|
||||||
|
|
||||||
output "cloudfront_domain" {
|
output "cloudfront_domain" {
|
||||||
value = aws_cloudfront_distribution.site.domain_name
|
value = aws_cloudfront_distribution.site.domain_name
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,11 +7,8 @@ Between stages, run the sanity checks to confirm data looks right before proceed
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Database URL in environment
|
# Postgres on i3 instance (run infra/db-setup.sh on the i3 first)
|
||||||
export DATABASE_URL='postgres://everytab:PASS@RDS_ENDPOINT:5432/everytab'
|
export DATABASE_URL='postgres://everytab@<i3-private-ip>:5432/everytab'
|
||||||
|
|
||||||
# Schema created
|
|
||||||
psql $DATABASE_URL -f pipeline/01_cc_index/schema.sql
|
|
||||||
|
|
||||||
# Go binaries built on EC2
|
# Go binaries built on EC2
|
||||||
cd ~/everytab
|
cd ~/everytab
|
||||||
|
|
@ -39,10 +36,10 @@ Fetches WARC records from CC's S3, extracts titles, icons, and iframe headers.
|
||||||
|
|
||||||
## Stage 3: Icon Download
|
## Stage 3: Icon Download
|
||||||
|
|
||||||
Downloads favicons from the live web, validates, downloads to disk.
|
Downloads favicons from the live web, validates them, and writes them to local disk.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./icon_download --db "$DATABASE_URL" --log-file icon_download.log --icons-dir icons/ --log-errors-only
|
GOMEMLIMIT=12GiB ./icon_download --db "$DATABASE_URL" --log-file icon_download.log --icons-dir ~/icons --log-errors-only
|
||||||
```
|
```
|
||||||
|
|
||||||
## Stage 4: Best Icon Selection
|
## Stage 4: Best Icon Selection
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue