download cc-index first with aws cli instead of streaming it
This commit is contained in:
parent
564919c5cc
commit
9308b5e039
1 changed file with 23 additions and 10 deletions
|
|
@@ -2,8 +2,8 @@
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# Query Common Crawl columnar index and populate the hosts table.
|
# Query Common Crawl columnar index and populate the hosts table.
|
||||||
# Uses DuckDB with S3 (credential chain) to read parquet files directly,
|
# Downloads CC-Index parquet files locally first (avoids S3 rate limits),
|
||||||
# and the postgres extension to write results into RDS.
|
# then uses DuckDB to query locally and write results into RDS.
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
cat <<'EOF'
|
cat <<'EOF'
|
||||||
|
|
@@ -64,15 +64,10 @@ if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
|
||||||
LIMIT_CLAUSE="LIMIT ${LIMIT}"
|
LIMIT_CLAUSE="LIMIT ${LIMIT}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
S3_PATH="s3://commoncrawl/cc-index/table/cc-main/warc/crawl=${CRAWL}/subset=warc/*.parquet"
|
S3_PATH="s3://commoncrawl/cc-index/table/cc-main/warc/crawl=${CRAWL}/subset=warc/"
|
||||||
|
LOCAL_INDEX="/tmp/cc-index"
|
||||||
|
|
||||||
QUERY="
|
QUERY="
|
||||||
INSTALL aws;
|
|
||||||
LOAD aws;
|
|
||||||
INSTALL httpfs;
|
|
||||||
LOAD httpfs;
|
|
||||||
CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN);
|
|
||||||
|
|
||||||
INSTALL postgres;
|
INSTALL postgres;
|
||||||
LOAD postgres;
|
LOAD postgres;
|
||||||
ATTACH '${DB_URL}' AS pg (TYPE POSTGRES);
|
ATTACH '${DB_URL}' AS pg (TYPE POSTGRES);
|
||||||
|
|
@@ -85,7 +80,7 @@ SELECT
|
||||||
first(warc_filename ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_filename,
|
first(warc_filename ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_filename,
|
||||||
first(warc_record_offset ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_offset,
|
first(warc_record_offset ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_offset,
|
||||||
first(warc_record_length ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_length
|
first(warc_record_length ORDER BY CASE WHEN url_protocol = 'https' THEN 0 ELSE 1 END) AS warc_record_length
|
||||||
FROM read_parquet('${S3_PATH}')
|
FROM read_parquet('${LOCAL_INDEX}/*.parquet')
|
||||||
WHERE url_path = '/'
|
WHERE url_path = '/'
|
||||||
AND content_mime_type = 'text/html'
|
AND content_mime_type = 'text/html'
|
||||||
AND fetch_status = 200
|
AND fetch_status = 200
|
||||||
|
|
@@ -113,6 +108,18 @@ echo "=== CC-Index Query ==="
|
||||||
echo "Crawl: $CRAWL"
|
echo "Crawl: $CRAWL"
|
||||||
echo "S3 path: $S3_PATH"
|
echo "S3 path: $S3_PATH"
|
||||||
echo "Limit: ${LIMIT} (0 = no limit)"
|
echo "Limit: ${LIMIT} (0 = no limit)"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Download CC-Index parquet files locally (avoids S3 rate limits during query)
|
||||||
|
if [ -d "$LOCAL_INDEX" ] && [ "$(ls -A "$LOCAL_INDEX"/*.parquet 2>/dev/null)" ]; then
|
||||||
|
echo "Using cached CC-Index at $LOCAL_INDEX"
|
||||||
|
else
|
||||||
|
echo "Downloading CC-Index parquet files..."
|
||||||
|
mkdir -p "$LOCAL_INDEX"
|
||||||
|
aws s3 sync "$S3_PATH" "$LOCAL_INDEX/" --quiet
|
||||||
|
echo "Downloaded $(ls "$LOCAL_INDEX"/*.parquet | wc -l) parquet files"
|
||||||
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Starting query..."
|
echo "Starting query..."
|
||||||
echo ""
|
echo ""
|
||||||
|
|
@@ -154,3 +161,9 @@ SELECT json_build_object(
|
||||||
);
|
);
|
||||||
" > "$STATS_FILE"
|
" > "$STATS_FILE"
|
||||||
echo "Stats written to $STATS_FILE"
|
echo "Stats written to $STATS_FILE"
|
||||||
|
|
||||||
|
# Clean up local CC-Index files
|
||||||
|
echo ""
|
||||||
|
echo "Cleaning up $LOCAL_INDEX..."
|
||||||
|
rm -rf "$LOCAL_INDEX"
|
||||||
|
echo "Done"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue