-- EveryTab database schema.
-- Run once after the RDS instance is available:
--   psql "$DATABASE_URL" -f schema.sql

-- hosts: one row per hostname (hostname is UNIQUE).
-- NOTE: CREATE TABLE IF NOT EXISTS does not alter an existing table; a database
-- created from an older version of this file needs an explicit
-- ALTER TABLE hosts ALTER COLUMN parsed SET NOT NULL to pick up that constraint.
CREATE TABLE IF NOT EXISTS hosts (
    id                  SERIAL  PRIMARY KEY,
    hostname            TEXT    NOT NULL UNIQUE,
    protocol            TEXT    NOT NULL,
    crawl_id            TEXT    NOT NULL,

    -- Where the host's record lives inside a WARC file.
    -- NOTE(review): offset/length are presumably byte counts -- confirm.
    warc_filename       TEXT    NOT NULL,
    warc_record_offset  BIGINT  NOT NULL,
    warc_record_length  INT     NOT NULL,

    -- Nullable columns.
    -- NOTE(review): presumably populated only once the record has been parsed -- confirm.
    html_title          TEXT,
    iframe_allowed      BOOLEAN,
    best_icon_s3_key    TEXT,

    -- NOT NULL is required here: a NULL value never satisfies `parsed = FALSE`,
    -- so such a row would be absent from the partial index on unparsed hosts
    -- and silently skipped by any query using that predicate.
    parsed              BOOLEAN NOT NULL DEFAULT FALSE
);

-- icons: icon candidates, each belonging to exactly one row in hosts
-- (host_id is a NOT NULL foreign key to hosts.id).
-- NOTE: CREATE TABLE IF NOT EXISTS does not alter an existing table; a database
-- created from an older version of this file needs an explicit
-- ALTER TABLE icons ALTER COLUMN scan_state SET NOT NULL to pick up that constraint.
CREATE TABLE IF NOT EXISTS icons (
    id            SERIAL PRIMARY KEY,
    host_id       INT    NOT NULL REFERENCES hosts(id),
    url           TEXT   NOT NULL,
    source        TEXT   NOT NULL,

    -- Optional metadata about the icon.
    -- NOTE(review): rel_type/rel_sizes presumably mirror the <link rel=... sizes=...>
    -- attributes the icon was discovered through -- confirm.
    rel_type      TEXT,
    rel_sizes     TEXT,
    content_type  TEXT,
    width         INT,
    height        INT,
    file_size     INT,
    s3_key        TEXT,

    -- NOT NULL is required here: a NULL value never satisfies
    -- `scan_state = 'unscanned'`, so such a row would be absent from the partial
    -- index on unscanned icons and never picked up by queries using that predicate.
    scan_state    TEXT   NOT NULL DEFAULT 'unscanned',
    error         TEXT
);

-- Partial index restricted to icons whose scan_state is still 'unscanned';
-- it gets smaller as icons move out of that state.
CREATE INDEX IF NOT EXISTS idx_icons_unscanned
    ON icons (id)
    WHERE scan_state = 'unscanned';

-- Lets icons be looked up by their host; used by the best-icon selection join.
CREATE INDEX IF NOT EXISTS idx_icons_host_id
    ON icons (host_id);

-- Partial index restricted to hosts that have not been parsed yet;
-- serves the WARC parsing cursor.
CREATE INDEX IF NOT EXISTS idx_hosts_unparsed
    ON hosts (id)
    WHERE parsed = FALSE;