-- EveryTab database schema
-- Run once after RDS is available:
--   psql $DATABASE_URL -f schema.sql
--
-- Every statement below uses IF NOT EXISTS, so running the file again does
-- not fail on objects that are already present. IF NOT EXISTS only skips
-- creation; it does not alter an existing table or index to match the
-- definitions in this file.

-- One row per hostname (enforced by the UNIQUE constraint on hostname).
CREATE TABLE IF NOT EXISTS hosts (
    id                  SERIAL PRIMARY KEY,
    hostname            TEXT NOT NULL UNIQUE,
    protocol            TEXT NOT NULL,

    -- Crawl / WARC columns; presumably these locate the archived record the
    -- host was extracted from -- TODO confirm against the WARC parser.
    crawl_id            TEXT NOT NULL,
    warc_filename       TEXT NOT NULL,
    warc_record_offset  BIGINT NOT NULL,
    warc_record_length  INT NOT NULL,

    -- Nullable columns, presumably filled in by later processing stages.
    html_title          TEXT,
    iframe_allowed      BOOLEAN,
    best_icon_hash      TEXT,
    icon_downloaded_at  TIMESTAMPTZ,

    -- NOTE(review): parsed is nullable. A row with parsed = NULL does not
    -- satisfy "parsed = FALSE" and is therefore absent from
    -- idx_hosts_unparsed -- confirm writers never store NULL here.
    parsed              BOOLEAN DEFAULT FALSE,

    -- Each inserted row gets its own random() value; used as the sort key
    -- for idx_hosts_random.
    random_order        DOUBLE PRECISION DEFAULT random()
);

-- Icon candidates; each row belongs to exactly one host via host_id.
CREATE TABLE IF NOT EXISTS icons (
    id             SERIAL PRIMARY KEY,
    -- No ON DELETE action is specified, so deleting a host that still has
    -- icons is rejected by the foreign key.
    host_id        INT NOT NULL REFERENCES hosts(id),
    url            TEXT NOT NULL,
    source         TEXT NOT NULL,
    rel_type       TEXT,
    rel_sizes      TEXT,
    content_type   TEXT,
    width          INT,
    height         INT,
    file_size      INT,
    icon_hash      TEXT,
    -- NOTE(review): scan_state is nullable. A NULL value does not satisfy
    -- "scan_state = 'unscanned'" and is absent from idx_icons_unscanned --
    -- confirm writers never store NULL here.
    scan_state     TEXT DEFAULT 'unscanned',
    error          TEXT,
    downloaded_at  TIMESTAMPTZ
);

-- Partial index: only unscanned icons (shrinks as work completes)
CREATE INDEX IF NOT EXISTS idx_icons_unscanned
    ON icons(id)
    WHERE scan_state = 'unscanned';

-- For best-icon selection join
CREATE INDEX IF NOT EXISTS idx_icons_host_id
    ON icons(host_id);

-- For WARC parsing cursor
CREATE INDEX IF NOT EXISTS idx_hosts_unparsed
    ON hosts(id)
    WHERE parsed = FALSE;

-- For bundle generation pagination in random order
CREATE INDEX IF NOT EXISTS idx_hosts_random
    ON hosts(random_order);