CREATE TABLE kvs ( id bigserial PRIMARY KEY, t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc'), key varchar(200) NOT NULL UNIQUE, value jsonb ) ---- COMMENT ON COLUMN kvs.t_update IS 'Time of last update or insert of the entry'; ---- COMMENT ON COLUMN kvs.key IS 'Key'; ---- COMMENT ON COLUMN kvs.value IS 'Value'; ---- COMMENT ON TABLE kvs IS 'Simple key-value store'; ---- INSERT INTO kvs (key, value) VALUES ('schema_version', '1'); ---- CREATE TABLE site ( id bigserial PRIMARY KEY, canonical_url varchar(200), base_url varchar(200) NOT NULL, base_urls varchar(200)[] NOT NULL, domains varchar(100)[], ips inet[] NULL, crawl_enabled bool NOT NULL DEFAULT false, crawl_active bool NOT NULL DEFAULT false, next_full_crawl timestamp, next_feed_crawl timestamp, last_update timestamp, last_pub timestamp, pub_dates jsonb NOT NULL DEFAULT '{}'::jsonb, langs char(2)[] NOT NULL DEFAULT ARRAY[]::varchar(2)[], alt_langs jsonb NOT NULL DEFAULT '{}'::jsonb, title varchar(200), description varchar(2000), keywords varchar(50)[] NOT NULL DEFAULT ARRAY[]::varchar(50)[], linkbacks jsonb NOT NULL DEFAULT '{}'::jsonb, meta_info jsonb NOT NULL DEFAULT '{}'::jsonb, boilerplate_texts jsonb NOT NULL DEFAULT '[]'::jsonb ) ---- CREATE INDEX site__base_url ON site (base_url) ---- CREATE INDEX site__base_urls ON site (base_urls) ---- CREATE INDEX site__domains ON site (domains) ---- CREATE INDEX site__ips ON site (ips) ---- CREATE INDEX site__next_full_crawl ON site (next_full_crawl) ---- CREATE INDEX site__next_feed_crawl ON site (next_feed_crawl) ---- CREATE INDEX site__langs ON site (langs) ---- CREATE INDEX site__title ON site (title) ---- CREATE INDEX site__description ON site (description) ---- CREATE INDEX site__keywords ON site (keywords) ---- COMMENT ON COLUMN site.base_url IS 'Preferred base URLs (from column base_urls)' ---- COMMENT ON COLUMN site.base_urls IS 'Base URLs that have been found to return the same content' ---- COMMENT ON COLUMN site.domains IS 'Domains that have been found to return the same content' ---- COMMENT ON COLUMN site.ips IS 'IPv4 or IPv6 addresses of the hostnames in base_urls' ---- COMMENT ON COLUMN site.crawl_enabled IS 'Whether the site is should be indexed' ---- COMMENT ON COLUMN site.crawl_active IS 'Whether the crawl is in progress' ---- COMMENT ON COLUMN site.next_full_crawl IS 'Crawl all resources of this site again after this instant of time; do not crawl if null' ---- COMMENT ON COLUMN site.next_feed_crawl IS 'Crawl the feed resources of this site again after this instant of time; do not crawl if null' ---- COMMENT ON COLUMN site.last_update IS 'Time of last update of this site (in this database)' ---- COMMENT ON COLUMN site.last_pub IS 'Estimated time of last content publication on the site' ---- COMMENT ON COLUMN site.pub_dates IS 'Change history: map visit date to estimated publication date' ---- COMMENT ON COLUMN site.langs IS 'Languages of the site (ISO 639-1 codes)' ---- COMMENT ON COLUMN site.alt_langs IS 'Map links to alternative language versions of the site to ISO 639-1 languages codes' ---- COMMENT ON COLUMN site.title IS 'Title as obtained from title tag or meta tags' ---- COMMENT ON COLUMN site.description IS 'Description as obtained from meta tags' ---- COMMENT ON COLUMN site.keywords IS 'Keywords as obtained from meta tags' ---- COMMENT ON COLUMN site.linkbacks IS 'Map URL to type of linkback (cf. https://en.wikipedia.org/wiki/Linkback)' ---- COMMENT ON COLUMN site.meta_info IS 'Values from meta tags and other meta information' ---- COMMENT ON COLUMN site.boilerplate_texts IS 'Boilerplate texts on the startpage and other sample pages' ---- COMMENT ON TABLE site IS 'Website' ---- CREATE TABLE site_queue ( id bigserial PRIMARY KEY, src bigint NULL REFERENCES site(id) ON DELETE CASCADE, url varchar(200) NOT NULL, link_text varchar(100), t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc') ) ---- CREATE INDEX site_queue__url ON site_queue (url) ---- COMMENT ON COLUMN site_queue.src IS 'The id of the linking site; null in case of seeds or manual additions' ---- COMMENT ON COLUMN site_queue.url IS 'Base URL of site to be assessed, ending with a slash or a mandatory base path' ---- COMMENT ON COLUMN site_queue.link_text IS 'Text under the anchor tag on the source site' ---- COMMENT ON COLUMN site_queue.t_create IS 'Creation time of this entry' ---- COMMENT ON TABLE site_queue IS 'Queued site URLs' ---- CREATE TABLE site_feed ( id bigserial PRIMARY KEY, site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE, url varchar(200) NOT NULL, etag text, modified varchar(50), t_visit timestamp, t_content timestamp, version varchar(10), title varchar(200), description text, fail_count smallint NOT NULL DEFAULT 0 ) ---- CREATE INDEX site_feed__site ON site_feed (site_id) ---- CREATE INDEX site_feed__t_content ON site_feed (t_content) ---- COMMENT ON COLUMN site_feed.site_id IS 'Id of the site on which this feed was found' ---- COMMENT ON COLUMN site_feed.url IS 'URL of the feed' ---- COMMENT ON COLUMN site_feed.etag IS 'Etag obtained when requesting the feed' ---- COMMENT ON COLUMN site_feed.modified IS 'Last-Modified HTTP header value obtained when requesting the feed' ---- COMMENT ON COLUMN site_feed.t_visit IS 'Time of last retrieval of the feed; null before first retrival' ---- COMMENT ON COLUMN site_feed.t_content IS 'Time of last content update; null before first retrieval' ---- COMMENT ON COLUMN site_feed.version IS 'Version of the feed; null before first retrival' ---- COMMENT ON COLUMN site_feed.title IS 'Title of the feed; null before first retrival' ---- COMMENT ON COLUMN site_feed.description IS 'Description of the feed; null before first retrival' ---- COMMENT ON COLUMN site_feed.fail_count IS 'Number of failed retrievals after last successful retrieval; zero before first retrival' ---- CREATE TABLE site_link ( id bigserial PRIMARY KEY, src bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE, dst bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE, t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc'), link_text varchar(100) ) ---- ALTER TABLE site_link ADD CONSTRAINT site_link_edge UNIQUE (src, dst) ---- CREATE INDEX site_link__src ON site_link (src) ---- CREATE INDEX site_link__dst ON site_link (dst) ---- COMMENT ON COLUMN site_link.src IS 'Source site' ---- COMMENT ON COLUMN site_link.dst IS 'Destination site' ---- COMMENT ON COLUMN site_link.t_create IS 'Time of creation of this entry' ---- COMMENT ON COLUMN site_link.link_text IS 'Text under the anchor tag on the source site' ---- COMMENT ON TABLE site_link IS 'Cross-site link' ---- CREATE TABLE resource ( id bigserial PRIMARY KEY, simhash bigint, content_type varchar(50), last_change timestamp, text_len int, lang char(2), title varchar(200), summary varchar(2000) ) ---- COMMENT ON COLUMN resource.simhash IS 'Simhash of the text content of the resource' ---- COMMENT ON COLUMN resource.content_type IS 'Content type extracted from Content-Type HTTP header' ---- COMMENT ON COLUMN resource.last_change IS 'Estimated time of the last update of this resource' ---- COMMENT ON COLUMN resource.text_len IS 'Length of the extracted text in characters' ---- COMMENT ON COLUMN resource.lang IS 'Language ISO 639-1 code' ---- COMMENT ON COLUMN resource.title IS 'Title of the resource (used for feed resources)' ---- COMMENT ON COLUMN resource.summary IS 'Content summary of the resource (used for feed resources)' ---- COMMENT ON TABLE resource IS 'Text resource (may be reachable by more than one path of a site)' ---- CREATE TABLE site_path ( id bigserial PRIMARY KEY, site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE, path varchar(400) NOT NULL, last_visit timestamp, filtered bool NOT NULL DEFAULT false, ok_count smallint NOT NULL DEFAULT 0, canonical bool, resource_id bigint REFERENCES resource(id) ON DELETE CASCADE ) ---- ALTER TABLE site_path ADD CONSTRAINT site_path__unique UNIQUE (site_id, path) ---- CREATE INDEX site_path__site_path ON site_path (site_id, path) ---- CREATE INDEX site_path__resource ON site_path (resource_id) ---- COMMENT ON COLUMN site_path.site_id IS 'Site id' ---- COMMENT ON COLUMN site_path.path IS 'Path' ---- COMMENT ON COLUMN site_path.last_visit IS 'Time of last retrieval of the resource; null before first retrival' ---- COMMENT ON COLUMN site_path.filtered IS 'Whether the path was filtered' ---- COMMENT ON COLUMN site_path.ok_count IS 'Increased by 1 for every successful retrieval of the resource and decreased by 1 for every failed' ---- COMMENT ON COLUMN site_path.canonical IS 'Whether the path is the canonical one for the resource; null before first retrival' ---- COMMENT ON COLUMN site_path.resource_id IS 'Resource id; null before first retrieval' ---- COMMENT ON TABLE site_path IS 'Paths of a site pointing to text resources' ---- CREATE TABLE crawl ( id bigserial PRIMARY KEY, site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE, is_full bool NOT NULL DEFAULT false, t_begin timestamp, t_end timestamp, n_resources int NOT NULL DEFAULT 0, n_resources_new int NOT NULL DEFAULT 0 ) ---- CREATE INDEX crawl__site ON crawl (site_id) ---- CREATE INDEX crawl__t_begin ON crawl (t_begin) ---- COMMENT ON COLUMN crawl.site_id IS 'Site that is being crawled' ---- COMMENT ON COLUMN crawl.is_full IS 'Whether the crawl is a full crawl; if not it is a feed crawl' ---- COMMENT ON COLUMN crawl.t_begin IS 'Begin time of the crawl' ---- COMMENT ON COLUMN crawl.t_end IS 'End time of the crawl; if t_end is null resuming a crawl will fetch all resources with last_visit before t_begin' ---- COMMENT ON COLUMN crawl.n_resources IS 'Number of resources that were fetched during the crawl' ---- COMMENT ON COLUMN crawl.n_resources_new IS 'Number of new resources found during the crawl' ---- COMMENT ON TABLE resource IS 'Crawl of resources on a site' ---- CREATE TYPE site_annotation_type AS ENUM ('whitelist', 'blacklist', 'suggestion', 'review', 'audience', 'location', 'themes', 'timescale') ---- COMMENT ON TYPE site_annotation_type IS 'Type of site annotation' ---- CREATE TABLE site_annotation ( id bigserial PRIMARY KEY, site_id bigint REFERENCES site(id) ON DELETE SET NULL, base_url varchar(200) NOT NULL, ann_type site_annotation_type NOT NULL, ann_content JSONB, t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc') ) ---- CREATE INDEX site_annotation__site ON site_annotation (site_id) ---- CREATE INDEX site_annotation__base_url ON site_annotation (base_url) ---- COMMENT ON COLUMN site_annotation.site_id IS 'Site that is being annotated' ---- COMMENT ON COLUMN site_annotation.base_url IS 'Base URL of the site being annotated' ---- COMMENT ON COLUMN site_annotation.ann_type IS 'Annotation type' ---- COMMENT ON COLUMN site_annotation.ann_content IS 'Annotation content' ---- COMMENT ON COLUMN site_annotation.t_update IS 'Time of last update' ---- COMMENT ON TABLE site_annotation IS 'Manual annotations on a site'