added infra setup with terraform
This commit is contained in:
parent
64ae58494b
commit
fcf203e1d8
8 changed files with 556 additions and 74 deletions
42
pipeline/01_cc_index/schema.sql
Normal file
42
pipeline/01_cc_index/schema.sql
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
-- EveryTab database schema
|
||||
-- Run once after RDS is available:
|
||||
--   psql "$DATABASE_URL" -f schema.sql
|
||||
|
||||
-- hosts: one row per hostname (enforced by the UNIQUE constraint), holding
-- the location of its WARC record (crawl_id, warc_filename, offset, length)
-- and nullable metadata columns.
--
-- NOTE: CREATE TABLE IF NOT EXISTS is a no-op when the table already exists,
-- so a database created from an earlier version of this file keeps its old
-- column definitions. Tighten such a database separately with:
--   ALTER TABLE hosts ALTER COLUMN parsed SET NOT NULL;
CREATE TABLE IF NOT EXISTS hosts (
    id                  SERIAL  PRIMARY KEY,
    hostname            TEXT    NOT NULL UNIQUE,
    protocol            TEXT    NOT NULL,
    crawl_id            TEXT    NOT NULL,
    warc_filename       TEXT    NOT NULL,
    warc_record_offset  BIGINT  NOT NULL,
    warc_record_length  INT     NOT NULL,
    html_title          TEXT,
    iframe_allowed      BOOLEAN,
    best_icon_s3_key    TEXT,
    -- NOT NULL so every row is definitely parsed or unparsed: a NULL here
    -- would satisfy neither "parsed = FALSE" nor "parsed = TRUE" and the row
    -- would silently drop out of any query or partial index filtering on it.
    parsed              BOOLEAN NOT NULL DEFAULT FALSE
);
|
||||
|
||||
-- icons: icon candidates, each tied to one hosts row through host_id.
-- Only host_id, url and source are required; the remaining descriptive
-- columns (rel_*, content_type, width, height, file_size, s3_key, error)
-- are nullable.
--
-- NOTE: CREATE TABLE IF NOT EXISTS is a no-op when the table already exists,
-- so a database created from an earlier version of this file keeps its old
-- column definitions. Tighten such a database separately with:
--   ALTER TABLE icons ALTER COLUMN scan_state SET NOT NULL;
CREATE TABLE IF NOT EXISTS icons (
    id            SERIAL PRIMARY KEY,
    host_id       INT    NOT NULL REFERENCES hosts(id),
    url           TEXT   NOT NULL,
    source        TEXT   NOT NULL,
    rel_type      TEXT,
    rel_sizes     TEXT,
    content_type  TEXT,
    width         INT,
    height        INT,
    file_size     INT,
    s3_key        TEXT,
    -- NOT NULL so a row can never be in an undefined state: a NULL would not
    -- satisfy "scan_state = 'unscanned'" and the row would silently drop out
    -- of any query or partial index filtering on that value.
    scan_state    TEXT   NOT NULL DEFAULT 'unscanned',
    error         TEXT
);
|
||||
|
||||
-- Partial index over icons.id restricted to rows whose scan_state is
-- 'unscanned'; rows leave the index once their scan_state changes, so it
-- shrinks as work completes.
CREATE INDEX IF NOT EXISTS idx_icons_unscanned
    ON icons (id)
    WHERE scan_state = 'unscanned';
|
||||
|
||||
-- Index on the icons -> hosts foreign key column; used by the best-icon
-- selection join.
CREATE INDEX IF NOT EXISTS idx_icons_host_id
    ON icons (host_id);
|
||||
|
||||
-- Partial index over hosts.id restricted to rows where parsed = FALSE;
-- used by the WARC parsing cursor.
CREATE INDEX IF NOT EXISTS idx_hosts_unparsed
    ON hosts (id)
    WHERE parsed = FALSE;
|
||||
Loading…
Add table
Add a link
Reference in a new issue