-- File: everytab/pipeline/01_cc_index/schema.sql

-- EveryTab database schema
-- Run once after RDS is available:
-- psql $DATABASE_URL -f schema.sql
-- hosts: one row per hostname (enforced by the UNIQUE constraint), together
-- with a pointer to a WARC record and per-host processing state.
-- NOTE(review): the per-column notes below are inferred from column names and
-- from the index comments in this file; confirm against the pipeline code.
--
-- Because of IF NOT EXISTS, this statement is skipped when the table already
-- exists; databases created from an older revision of this file need an
-- explicit ALTER TABLE ... SET NOT NULL to pick up the constraints on
-- parsed and random_order.
CREATE TABLE IF NOT EXISTS hosts (
    id                  SERIAL PRIMARY KEY,
    hostname            TEXT NOT NULL UNIQUE,
    protocol            TEXT NOT NULL,
    crawl_id            TEXT NOT NULL,

    -- Location of the host's record: which WARC file, and the offset/length
    -- of the record inside it (presumably in bytes; verify against the reader).
    warc_filename       TEXT NOT NULL,
    warc_record_offset  BIGINT NOT NULL,
    warc_record_length  INT NOT NULL,

    -- Nullable: not supplied at insert time; presumably filled in by later stages.
    html_title          TEXT,
    iframe_allowed      BOOLEAN,
    best_icon_hash      TEXT,
    icon_downloaded_at  TIMESTAMPTZ,

    -- NOT NULL: idx_hosts_unparsed is a partial index on "parsed = FALSE", and
    -- NULL never satisfies that predicate, so a NULL here would silently drop
    -- the host out of the parsing queue.
    parsed              BOOLEAN NOT NULL DEFAULT FALSE,

    -- Random sort key assigned once at insert time (random() is evaluated per
    -- row). NOT NULL so that range comparisons over this column cannot skip rows.
    random_order        DOUBLE PRECISION NOT NULL DEFAULT random()
);
-- icons: icon records, each attached to exactly one host through host_id.
-- NOTE(review): the per-column notes below are inferred from column names and
-- from the index comments in this file; confirm against the pipeline code.
--
-- Because of IF NOT EXISTS, this statement is skipped when the table already
-- exists; databases created from an older revision of this file need an
-- explicit ALTER TABLE ... SET NOT NULL to pick up the constraint on scan_state.
CREATE TABLE IF NOT EXISTS icons (
    id             SERIAL PRIMARY KEY,

    -- No ON DELETE clause, so the default (NO ACTION) applies: a host cannot
    -- be deleted while icons still reference it.
    host_id        INT NOT NULL REFERENCES hosts(id),

    url            TEXT NOT NULL,
    source         TEXT NOT NULL,

    -- Nullable descriptive attributes of the icon.
    rel_type       TEXT,
    rel_sizes      TEXT,
    content_type   TEXT,
    width          INT,
    height         INT,
    file_size      INT,
    icon_hash      TEXT,

    -- NOT NULL: idx_icons_unscanned is a partial index on
    -- "scan_state = 'unscanned'", and NULL never satisfies that predicate, so a
    -- NULL here would leave the icon permanently outside the scan queue.
    scan_state     TEXT NOT NULL DEFAULT 'unscanned',

    error          TEXT,
    downloaded_at  TIMESTAMPTZ
);
-- Icons still waiting to be scanned. The index is partial: it holds only rows
-- whose scan_state is 'unscanned', so it gets smaller as scanning progresses.
CREATE INDEX IF NOT EXISTS idx_icons_unscanned
    ON icons (id)
    WHERE scan_state = 'unscanned';

-- Lookup of icons by owning host; supports the join used for best-icon selection.
CREATE INDEX IF NOT EXISTS idx_icons_host_id
    ON icons (host_id);

-- Hosts not yet parsed, ordered by id; backs the WARC parsing cursor.
CREATE INDEX IF NOT EXISTS idx_hosts_unparsed
    ON hosts (id)
    WHERE parsed = FALSE;

-- Lets bundle generation page through hosts in random_order order.
CREATE INDEX IF NOT EXISTS idx_hosts_random
    ON hosts (random_order);