From 9308b5e0394ed6aec7398123feea3b4f0a81e960 Mon Sep 17 00:00:00 2001
From: Joe Lothan
Date: Wed, 20 May 2026 08:14:22 -0400
Subject: [PATCH] download cc-index first with aws cli instead of streaming it

---
Notes (text between '---' and the diff is ignored by git am):
- LOCAL_INDEX is keyed by crawl, so files cached for one crawl are never
  queried for another.
- A marker file is written only after `aws s3 sync` succeeds and at least
  one parquet file is present; an interrupted download is resumed on the
  next run instead of being treated as a complete cache.
- The parquet count uses find instead of parsing ls, and the script exits
  with an error when nothing was downloaded.
- The final cleanup uses ${LOCAL_INDEX:?} to guard against an empty path.

 pipeline/01_cc_index/query.sh | 43 +++++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/pipeline/01_cc_index/query.sh b/pipeline/01_cc_index/query.sh
index 466b8bf..c485b48 100755
--- a/pipeline/01_cc_index/query.sh
+++ b/pipeline/01_cc_index/query.sh
@@ -2,8 +2,8 @@
 set -euo pipefail
 
 # Query Common Crawl columnar index and populate the hosts table.
-# Uses DuckDB with S3 (credential chain) to read parquet files directly,
-# and the postgres extension to write results into RDS.
+# Downloads CC-Index parquet files locally first (avoids S3 rate limits),
+# then uses DuckDB to query locally and write results into RDS.
 
 usage() {
   cat <<'EOF'
@@ -64,15 +64,11 @@ if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
   LIMIT_CLAUSE="LIMIT ${LIMIT}"
 fi
 
-S3_PATH="s3://commoncrawl/cc-index/table/cc-main/warc/crawl=${CRAWL}/subset=warc/*.parquet"
+S3_PATH="s3://commoncrawl/cc-index/table/cc-main/warc/crawl=${CRAWL}/subset=warc/"
+# Keyed by crawl so files cached for one crawl are never queried for another.
+LOCAL_INDEX="/tmp/cc-index/${CRAWL}"
 
 QUERY="
-INSTALL aws;
-LOAD aws;
-INSTALL httpfs;
-LOAD httpfs;
-CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN);
-
 INSTALL postgres;
 LOAD postgres;
 ATTACH '${DB_URL}' AS pg (TYPE POSTGRES);
@@ -85,7 +81,7 @@ SELECT
     first(warc_filename ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_filename,
     first(warc_record_offset ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_offset,
     first(warc_record_length ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_length
-FROM read_parquet('${S3_PATH}')
+FROM read_parquet('${LOCAL_INDEX}/*.parquet')
 WHERE url_path = '/'
   AND content_mime_type = 'text/html'
   AND fetch_status = 200
@@ -113,6 +109,27 @@ echo "=== CC-Index Query ==="
 echo "Crawl: $CRAWL"
 echo "S3 path: $S3_PATH"
 echo "Limit: ${LIMIT} (0 = no limit)"
+echo ""
+
+# Download CC-Index parquet files locally (avoids S3 rate limits during query).
+# The marker file is written only after a successful, non-empty sync, so an
+# interrupted download is resumed instead of being mistaken for a full cache.
+INDEX_MARKER="$LOCAL_INDEX/.download-complete"
+if [ -f "$INDEX_MARKER" ]; then
+  echo "Using cached CC-Index at $LOCAL_INDEX"
+else
+  echo "Downloading CC-Index parquet files..."
+  mkdir -p "$LOCAL_INDEX"
+  aws s3 sync "$S3_PATH" "$LOCAL_INDEX/" --quiet
+  PARQUET_COUNT=$(find "$LOCAL_INDEX" -maxdepth 1 -type f -name '*.parquet' | wc -l | tr -d '[:space:]')
+  if [ "$PARQUET_COUNT" -eq 0 ]; then
+    echo "ERROR: no parquet files downloaded from $S3_PATH" >&2
+    exit 1
+  fi
+  touch "$INDEX_MARKER"
+  echo "Downloaded $PARQUET_COUNT parquet files"
+fi
+
 echo ""
 echo "Starting query..."
 echo ""
@@ -154,3 +171,9 @@ SELECT json_build_object(
 );
 " > "$STATS_FILE"
 echo "Stats written to $STATS_FILE"
+
+# Clean up local CC-Index files
+echo ""
+echo "Cleaning up $LOCAL_INDEX..."
+rm -rf -- "${LOCAL_INDEX:?}"
+echo "Done"