From db81015e0b16bfa59ce37ea4411b56a02241411e Mon Sep 17 00:00:00 2001 From: Joe Lothan Date: Sun, 17 May 2026 19:12:25 -0400 Subject: [PATCH] added query.sh to read the cc-index from s3 parquet files and dump it into our psql db --- ARCHITECTURE.md | 2 +- PLAN.md | 44 +++++----- pipeline/01_cc_index/query.sh | 156 ++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 23 deletions(-) create mode 100755 pipeline/01_cc_index/query.sh diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 5b8115e..8912e2a 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -171,7 +171,7 @@ WHERE url_path = '/' AND fetch_status = 200 AND url_query IS NULL AND url_protocol IN ('http', 'https') - AND url_port IN (80, 443) + AND url_port IS NULL ``` **Deduplication:** Per hostname, prefer `https` over `http`. Result is one row per unique hostname. diff --git a/PLAN.md b/PLAN.md index 2c4d1a7..61a9b7c 100644 --- a/PLAN.md +++ b/PLAN.md @@ -102,32 +102,15 @@ CREATE INDEX idx_icons_host_id ON icons(host_id); **Done when:** Tables exist in RDS, schema matches ARCHITECTURE.md. -### Step 1.2: DuckDB CC-Index Query (100K limit) +### Step 1.2: DuckDB CC-Index Query (100K limit) [COMPLETED] -Write `pipeline/01_cc_index/query.sql` (or a shell script wrapping DuckDB CLI). +Script: `pipeline/01_cc_index/query.sh` -The script: -1. Connects DuckDB to RDS via the postgres extension -2. Queries the CC-Index parquet files via httpfs (latest crawl) -3. Filters per ARCHITECTURE.md criteria -4. Deduplicates per hostname (prefer https) -5. Limits to 100,000 rows for dev -6. Inserts directly into the hosts table +Uses DuckDB with `aws` extension (credential chain) to read parquet directly from `s3://commoncrawl/.../*.parquet` glob, with the `postgres` extension to write results into RDS. Auto-detects latest crawl ID from the CC API. -Key considerations: -- Find the latest crawl index path (e.g., `s3://commoncrawl/cc-index/collections/CC-MAIN-2026-05/indexes/cdx-00*.parquet` — verify actual path structure) -- DuckDB postgres extension: `INSTALL postgres; LOAD postgres; ATTACH 'dbname=... host=... ...' AS pg (TYPE POSTGRES);` -- The dedup logic: partition by hostname, order by protocol (https first), take first row -- Add `LIMIT 100000` for dev, remove for full run -- Time the query — if httpfs takes >1hr, switch to downloading parquet first +Deduplication via `GROUP BY url_host_name` with `first(... ORDER BY ...)` aggregates (hash aggregation — more memory-efficient than window functions). -**Validation:** -- `SELECT COUNT(*) FROM hosts;` returns ~100,000 -- `SELECT protocol, COUNT(*) FROM hosts GROUP BY protocol;` shows mostly https -- `SELECT * FROM hosts LIMIT 5;` shows sane data (real hostnames, valid WARC paths) -- Spot-check: pick a few hostnames, verify they're real websites - -**Stats to emit:** `stats/01_cc_index.json` — includes: started_at, finished_at, duration_seconds, total_domains, https_count, http_count. +**Result:** 100K hosts, 77% https / 23% http, completed in 692s. **Done when:** 100K hosts in the database with valid WARC coordinates. @@ -694,3 +677,20 @@ On completion, each program prints a summary line and writes its stats JSON (wit - Amazon Linux 2023 uses `systemd-resolved` which manages `/etc/resolv.conf`. Must disable it before pointing resolv.conf at Unbound. `chattr +i` doesn't work on the symlink. - AWS EC2 key pairs created via API don't support passphrases. Use `tls_private_key` in Terraform or generate locally with `ssh-keygen` + import. - When an AWS key pair name already exists from a previous run, Terraform may not regenerate it. Use `-replace` to force recreation of the key + instance together. + +### Phase 1 (Steps 1.1-1.2) — Completed 2026-05-17 + +**Changes from original plan:** +- Used DuckDB `aws` extension with `CREDENTIAL_CHAIN` instead of httpfs anonymous access. The commoncrawl S3 bucket requires authenticated requests. +- IAM role needed explicit `s3:GetObject` and `s3:ListBucket` on `arn:aws:s3:::commoncrawl/*` — the bucket doesn't allow cross-account access based on bucket policy alone. +- Used `GROUP BY` with `first(... ORDER BY ...)` instead of `ROW_NUMBER()` window function. More memory-efficient (hash aggregation vs sort), cleaner syntax. +- DuckDB can glob `s3://.../subset=warc/*.parquet` directly (300 files) — no need to fetch a file list or download parquet locally. +- Dropped the `url_port IN (80, 443)` filter — CC stores standard ports as NULL, not 80/443. Replaced with `url_port IS NULL`. + +**Lessons learned:** +- DuckDB URL-encodes `=` in S3 paths (e.g., `crawl%3DCC-MAIN-2026-17`) but S3 decodes it correctly. The real issue was always IAM permissions, not path encoding. +- The `commoncrawl` S3 bucket requires valid AWS credentials for both GetObject and ListBucket. Anonymous access (unsigned requests) does not work. Any valid IAM identity works as long as their policy allows it. +- DuckDB's LIMIT can interact unexpectedly with GROUP BY — the optimizer may stop reading input early once it has enough groups. This wasn't our issue (it was the port filter) but worth noting for future queries. +- CC-Index stores `url_port` as NULL for standard ports (80/443), not as the integer. Always check actual column values before writing filters. +- c5.xlarge (8GB) is tight for this query — uses 6.4GB + swap. For the full 30M run, use c5.2xlarge (16GB). +- Query takes ~692s (11.5 min) for 100K output rows reading all 300 parquet files. Full run without LIMIT will be similar duration but more memory for the hash table. diff --git a/pipeline/01_cc_index/query.sh b/pipeline/01_cc_index/query.sh new file mode 100755 index 0000000..466b8bf --- /dev/null +++ b/pipeline/01_cc_index/query.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Query Common Crawl columnar index and populate the hosts table. +# Uses DuckDB with S3 (credential chain) to read parquet files directly, +# and the postgres extension to write results into RDS. + +usage() { + cat <<'EOF' +Usage: ./query.sh --db-url DATABASE_URL [OPTIONS] + +Required: + --db-url URL Postgres connection string + +Optional: + --crawl ID Common Crawl crawl ID (default: latest) + --limit N Max rows to insert (default: 100000, use 0 for no limit) + --dry-run Print the query without executing + --help Show this help message + +Example: + ./query.sh --db-url "$DATABASE_URL" + ./query.sh --db-url "$DATABASE_URL" --limit 0 # full 30M scan + ./query.sh --db-url "$DATABASE_URL" --dry-run +EOF + exit 0 +} + +# Defaults +DB_URL="" +CRAWL="" +LIMIT=100000 +DRY_RUN=false + +# Parse args +if [ $# -eq 0 ]; then usage; fi + +while [ $# -gt 0 ]; do + case "$1" in + --help) usage ;; + --db-url) DB_URL="$2"; shift 2 ;; + --crawl) CRAWL="$2"; shift 2 ;; + --limit) LIMIT="$2"; shift 2 ;; + --dry-run) DRY_RUN=true; shift ;; + *) echo "Unknown option: $1"; usage ;; + esac +done + +if [ -z "$DB_URL" ]; then + echo "ERROR: --db-url is required" + exit 1 +fi + +# Auto-detect latest crawl if not specified +if [ -z "$CRAWL" ]; then + echo "Fetching latest crawl ID..." + CRAWL=$(curl -s https://index.commoncrawl.org/collinfo.json | jq -r '.[0].id' | sed 's/-index$//') + echo "Using latest crawl: $CRAWL" +fi + +# Build the limit clause +LIMIT_CLAUSE="" +if [ "$LIMIT" -gt 0 ] 2>/dev/null; then + LIMIT_CLAUSE="LIMIT ${LIMIT}" +fi + +S3_PATH="s3://commoncrawl/cc-index/table/cc-main/warc/crawl=${CRAWL}/subset=warc/*.parquet" + +QUERY=" +INSTALL aws; +LOAD aws; +INSTALL httpfs; +LOAD httpfs; +CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN); + +INSTALL postgres; +LOAD postgres; +ATTACH '${DB_URL}' AS pg (TYPE POSTGRES); + +INSERT INTO pg.public.hosts (hostname, protocol, crawl_id, warc_filename, warc_record_offset, warc_record_length) +SELECT + url_host_name AS hostname, + first(url_protocol ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS protocol, + '${CRAWL}' AS crawl_id, + first(warc_filename ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_filename, + first(warc_record_offset ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_offset, + first(warc_record_length ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_length +FROM read_parquet('${S3_PATH}') +WHERE url_path = '/' + AND content_mime_type = 'text/html' + AND fetch_status = 200 + AND url_query IS NULL + AND url_protocol IN ('http', 'https') + AND url_port IS NULL + AND url_host_name IS NOT NULL + AND url_host_name != '' +GROUP BY url_host_name +${LIMIT_CLAUSE}; +" + +if $DRY_RUN; then + echo "=== DRY RUN ===" + echo "Crawl: $CRAWL" + echo "S3 path: $S3_PATH" + echo "Limit: ${LIMIT} (0 = no limit)" + echo "" + echo "Query:" + echo "$QUERY" + exit 0 +fi + +echo "=== CC-Index Query ===" +echo "Crawl: $CRAWL" +echo "S3 path: $S3_PATH" +echo "Limit: ${LIMIT} (0 = no limit)" +echo "" +echo "Starting query..." +echo "" + +START_TIME=$(date +%s) + +duckdb -c "$QUERY" + +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +echo "" +echo "Query completed in ${DURATION}s" +echo "" + +# --- Stats --- +echo "--- Validation ---" +STATS=$(psql "$DB_URL" -t -A -c " +SELECT json_build_object( + 'total_hosts', (SELECT COUNT(*) FROM hosts), + 'https_count', (SELECT COUNT(*) FROM hosts WHERE protocol = 'https'), + 'http_count', (SELECT COUNT(*) FROM hosts WHERE protocol = 'http') +); +") +echo "$STATS" | jq . + +STATS_FILE="stats/01_cc_index.json" +mkdir -p stats +psql "$DB_URL" -t -A -c " +SELECT json_build_object( + 'started_at', '$(date -d @$START_TIME -Iseconds)', + 'finished_at', '$(date -d @$END_TIME -Iseconds)', + 'duration_seconds', ${DURATION}, + 'crawl_id', '${CRAWL}', + 'limit_applied', ${LIMIT}, + 'total_hosts', (SELECT COUNT(*) FROM hosts), + 'https_count', (SELECT COUNT(*) FROM hosts WHERE protocol = 'https'), + 'http_count', (SELECT COUNT(*) FROM hosts WHERE protocol = 'http') +); +" > "$STATS_FILE" +echo "Stats written to $STATS_FILE"