From 9308b5e0394ed6aec7398123feea3b4f0a81e960 Mon Sep 17 00:00:00 2001
From: Joe Lothan <joe@lothan.net>
Date: Wed, 20 May 2026 08:14:22 -0400
Subject: [PATCH] download cc-index first with aws cli instead of streaming it

---
 pipeline/01_cc_index/query.sh | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/pipeline/01_cc_index/query.sh b/pipeline/01_cc_index/query.sh
index 466b8bf..c485b48 100755
--- a/pipeline/01_cc_index/query.sh
+++ b/pipeline/01_cc_index/query.sh
@@ -2,8 +2,8 @@
 set -euo pipefail
 
 # Query Common Crawl columnar index and populate the hosts table.
-# Uses DuckDB with S3 (credential chain) to read parquet files directly,
-# and the postgres extension to write results into RDS.
+# Downloads the CC-Index parquet files locally first (avoids S3 rate limits),
+# then queries them with DuckDB and writes the results into RDS via its postgres extension.
 
 usage() {
     cat <<'EOF'
@@ -64,15 +64,10 @@ if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
     LIMIT_CLAUSE="LIMIT ${LIMIT}"
 fi
 
-S3_PATH="s3://commoncrawl/cc-index/table/cc-main/warc/crawl=${CRAWL}/subset=warc/*.parquet"
+S3_PATH="s3://commoncrawl/cc-index/table/cc-main/warc/crawl=${CRAWL}/subset=warc/"
+LOCAL_INDEX="/tmp/cc-index/${CRAWL}"  # one directory per crawl, so files left by a failed run for another crawl are never reused
 
 QUERY="
-INSTALL aws;
-LOAD aws;
-INSTALL httpfs;
-LOAD httpfs;
-CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN);
-
 INSTALL postgres;
 LOAD postgres;
 ATTACH '${DB_URL}' AS pg (TYPE POSTGRES);
@@ -85,7 +80,7 @@ SELECT
     first(warc_filename ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_filename,
     first(warc_record_offset ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_offset,
     first(warc_record_length ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_length
-FROM read_parquet('${S3_PATH}')
+FROM read_parquet('${LOCAL_INDEX}/*.parquet')
 WHERE url_path = '/'
   AND content_mime_type = 'text/html'
   AND fetch_status = 200
@@ -113,6 +108,18 @@ echo "=== CC-Index Query ==="
 echo "Crawl: $CRAWL"
 echo "S3 path: $S3_PATH"
 echo "Limit: ${LIMIT} (0 = no limit)"
+echo ""
+
+# Mirror the CC-Index parquet files locally (avoids S3 rate limits during query).
+# Always sync instead of trusting whatever is already on disk: sync transfers only
+# missing or changed files, so a complete copy is reused, a partial copy left by an
+# interrupted run is completed, and --delete drops files belonging to another crawl
+# that the *.parquet glob in the query would otherwise pick up.
+echo "Syncing CC-Index parquet files to $LOCAL_INDEX..."
+mkdir -p "$LOCAL_INDEX"
+aws s3 sync "$S3_PATH" "$LOCAL_INDEX/" --delete --only-show-errors
+echo "Local CC-Index holds $(find "$LOCAL_INDEX" -maxdepth 1 -name '*.parquet' | wc -l) parquet files"
+
 echo ""
 echo "Starting query..."
 echo ""
@@ -154,3 +161,9 @@ SELECT json_build_object(
 );
 " > "$STATS_FILE"
 echo "Stats written to $STATS_FILE"
+
+# Remove the local CC-Index copy once the query and stats have been written.
+# A run that aborts earlier under `set -e` does not reach this point, so its files stay on disk.
+printf '\n%s\n' "Cleaning up $LOCAL_INDEX..."
+rm -rf -- "$LOCAL_INDEX"
+printf '%s\n' "Done"