atextcrawler/src/atextcrawler/migrations/1.sql

300 lines
11 KiB
SQL

CREATE TABLE kvs (
id bigserial PRIMARY KEY,
t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
key varchar(200) NOT NULL UNIQUE,
value jsonb
)
----
COMMENT ON COLUMN kvs.t_update IS 'Time of last update or insert of the entry';
----
COMMENT ON COLUMN kvs.key IS 'Key';
----
COMMENT ON COLUMN kvs.value IS 'Value';
----
COMMENT ON TABLE kvs IS 'Simple key-value store';
----
INSERT INTO kvs (key, value) VALUES ('schema_version', '1');
----
CREATE TABLE site (
id bigserial PRIMARY KEY,
canonical_url varchar(200),
base_url varchar(200) NOT NULL,
base_urls varchar(200)[] NOT NULL,
domains varchar(100)[],
ips inet[] NULL,
crawl_enabled bool NOT NULL DEFAULT false,
crawl_active bool NOT NULL DEFAULT false,
next_full_crawl timestamp,
next_feed_crawl timestamp,
last_update timestamp,
last_pub timestamp,
pub_dates jsonb NOT NULL DEFAULT '{}'::jsonb,
langs char(2)[] NOT NULL DEFAULT ARRAY[]::varchar(2)[],
alt_langs jsonb NOT NULL DEFAULT '{}'::jsonb,
title varchar(200),
description varchar(2000),
keywords varchar(50)[] NOT NULL DEFAULT ARRAY[]::varchar(50)[],
linkbacks jsonb NOT NULL DEFAULT '{}'::jsonb,
meta_info jsonb NOT NULL DEFAULT '{}'::jsonb,
boilerplate_texts jsonb NOT NULL DEFAULT '[]'::jsonb
)
----
CREATE INDEX site__base_url ON site (base_url)
----
CREATE INDEX site__base_urls ON site (base_urls)
----
CREATE INDEX site__domains ON site (domains)
----
CREATE INDEX site__ips ON site (ips)
----
CREATE INDEX site__next_full_crawl ON site (next_full_crawl)
----
CREATE INDEX site__next_feed_crawl ON site (next_feed_crawl)
----
CREATE INDEX site__langs ON site (langs)
----
CREATE INDEX site__title ON site (title)
----
CREATE INDEX site__description ON site (description)
----
CREATE INDEX site__keywords ON site (keywords)
----
COMMENT ON COLUMN site.base_url IS 'Preferred base URLs (from column base_urls)'
----
COMMENT ON COLUMN site.base_urls IS 'Base URLs that have been found to return the same content'
----
COMMENT ON COLUMN site.domains IS 'Domains that have been found to return the same content'
----
COMMENT ON COLUMN site.ips IS 'IPv4 or IPv6 addresses of the hostnames in base_urls'
----
COMMENT ON COLUMN site.crawl_enabled IS 'Whether the site is should be indexed'
----
COMMENT ON COLUMN site.crawl_active IS 'Whether the crawl is in progress'
----
COMMENT ON COLUMN site.next_full_crawl IS 'Crawl all resources of this site again after this instant of time; do not crawl if null'
----
COMMENT ON COLUMN site.next_feed_crawl IS 'Crawl the feed resources of this site again after this instant of time; do not crawl if null'
----
COMMENT ON COLUMN site.last_update IS 'Time of last update of this site (in this database)'
----
COMMENT ON COLUMN site.last_pub IS 'Estimated time of last content publication on the site'
----
COMMENT ON COLUMN site.pub_dates IS 'Change history: map visit date to estimated publication date'
----
COMMENT ON COLUMN site.langs IS 'Languages of the site (ISO 639-1 codes)'
----
COMMENT ON COLUMN site.alt_langs IS 'Map links to alternative language versions of the site to ISO 639-1 languages codes'
----
COMMENT ON COLUMN site.title IS 'Title as obtained from title tag or meta tags'
----
COMMENT ON COLUMN site.description IS 'Description as obtained from meta tags'
----
COMMENT ON COLUMN site.keywords IS 'Keywords as obtained from meta tags'
----
COMMENT ON COLUMN site.linkbacks IS 'Map URL to type of linkback (cf. https://en.wikipedia.org/wiki/Linkback)'
----
COMMENT ON COLUMN site.meta_info IS 'Values from meta tags and other meta information'
----
COMMENT ON COLUMN site.boilerplate_texts IS 'Boilerplate texts on the startpage and other sample pages'
----
COMMENT ON TABLE site IS 'Website'
----
CREATE TABLE site_queue (
id bigserial PRIMARY KEY,
src bigint NULL REFERENCES site(id) ON DELETE CASCADE,
url varchar(200) NOT NULL,
link_text varchar(100),
t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc')
)
----
CREATE INDEX site_queue__url ON site_queue (url)
----
COMMENT ON COLUMN site_queue.src IS 'The id of the linking site; null in case of seeds or manual additions'
----
COMMENT ON COLUMN site_queue.url IS 'Base URL of site to be assessed, ending with a slash or a mandatory base path'
----
COMMENT ON COLUMN site_queue.link_text IS 'Text under the anchor tag on the source site'
----
COMMENT ON COLUMN site_queue.t_create IS 'Creation time of this entry'
----
COMMENT ON TABLE site_queue IS 'Queued site URLs'
----
CREATE TABLE site_feed (
id bigserial PRIMARY KEY,
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
url varchar(200) NOT NULL,
etag text,
modified varchar(50),
t_visit timestamp,
t_content timestamp,
version varchar(10),
title varchar(200),
description text,
fail_count smallint NOT NULL DEFAULT 0
)
----
CREATE INDEX site_feed__site ON site_feed (site_id)
----
CREATE INDEX site_feed__t_content ON site_feed (t_content)
----
COMMENT ON COLUMN site_feed.site_id IS 'Id of the site on which this feed was found'
----
COMMENT ON COLUMN site_feed.url IS 'URL of the feed'
----
COMMENT ON COLUMN site_feed.etag IS 'Etag obtained when requesting the feed'
----
COMMENT ON COLUMN site_feed.modified IS 'Last-Modified HTTP header value obtained when requesting the feed'
----
COMMENT ON COLUMN site_feed.t_visit IS 'Time of last retrieval of the feed; null before first retrival'
----
COMMENT ON COLUMN site_feed.t_content IS 'Time of last content update; null before first retrieval'
----
COMMENT ON COLUMN site_feed.version IS 'Version of the feed; null before first retrival'
----
COMMENT ON COLUMN site_feed.title IS 'Title of the feed; null before first retrival'
----
COMMENT ON COLUMN site_feed.description IS 'Description of the feed; null before first retrival'
----
COMMENT ON COLUMN site_feed.fail_count IS 'Number of failed retrievals after last successful retrieval; zero before first retrival'
----
CREATE TABLE site_link (
id bigserial PRIMARY KEY,
src bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
dst bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
link_text varchar(100)
)
----
ALTER TABLE site_link ADD CONSTRAINT site_link_edge UNIQUE (src, dst)
----
CREATE INDEX site_link__src ON site_link (src)
----
CREATE INDEX site_link__dst ON site_link (dst)
----
COMMENT ON COLUMN site_link.src IS 'Source site'
----
COMMENT ON COLUMN site_link.dst IS 'Destination site'
----
COMMENT ON COLUMN site_link.t_create IS 'Time of creation of this entry'
----
COMMENT ON COLUMN site_link.link_text IS 'Text under the anchor tag on the source site'
----
COMMENT ON TABLE site_link IS 'Cross-site link'
----
CREATE TABLE resource (
id bigserial PRIMARY KEY,
simhash bigint,
content_type varchar(50),
last_change timestamp,
text_len int,
lang char(2),
title varchar(200),
summary varchar(2000)
)
----
COMMENT ON COLUMN resource.simhash IS 'Simhash of the text content of the resource'
----
COMMENT ON COLUMN resource.content_type IS 'Content type extracted from Content-Type HTTP header'
----
COMMENT ON COLUMN resource.last_change IS 'Estimated time of the last update of this resource'
----
COMMENT ON COLUMN resource.text_len IS 'Length of the extracted text in characters'
----
COMMENT ON COLUMN resource.lang IS 'Language ISO 639-1 code'
----
COMMENT ON COLUMN resource.title IS 'Title of the resource (used for feed resources)'
----
COMMENT ON COLUMN resource.summary IS 'Content summary of the resource (used for feed resources)'
----
COMMENT ON TABLE resource IS 'Text resource (may be reachable by more than one path of a site)'
----
CREATE TABLE site_path (
id bigserial PRIMARY KEY,
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
path varchar(400) NOT NULL,
last_visit timestamp,
filtered bool NOT NULL DEFAULT false,
ok_count smallint NOT NULL DEFAULT 0,
canonical bool,
resource_id bigint REFERENCES resource(id) ON DELETE CASCADE
)
----
ALTER TABLE site_path ADD CONSTRAINT site_path__unique UNIQUE (site_id, path)
----
CREATE INDEX site_path__site_path ON site_path (site_id, path)
----
CREATE INDEX site_path__resource ON site_path (resource_id)
----
COMMENT ON COLUMN site_path.site_id IS 'Site id'
----
COMMENT ON COLUMN site_path.path IS 'Path'
----
COMMENT ON COLUMN site_path.last_visit IS 'Time of last retrieval of the resource; null before first retrival'
----
COMMENT ON COLUMN site_path.filtered IS 'Whether the path was filtered'
----
COMMENT ON COLUMN site_path.ok_count IS 'Increased by 1 for every successful retrieval of the resource and decreased by 1 for every failed'
----
COMMENT ON COLUMN site_path.canonical IS 'Whether the path is the canonical one for the resource; null before first retrival'
----
COMMENT ON COLUMN site_path.resource_id IS 'Resource id; null before first retrieval'
----
COMMENT ON TABLE site_path IS 'Paths of a site pointing to text resources'
----
CREATE TABLE crawl (
id bigserial PRIMARY KEY,
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
is_full bool NOT NULL DEFAULT false,
t_begin timestamp,
t_end timestamp,
n_resources int NOT NULL DEFAULT 0,
n_resources_new int NOT NULL DEFAULT 0
)
----
CREATE INDEX crawl__site ON crawl (site_id)
----
CREATE INDEX crawl__t_begin ON crawl (t_begin)
----
COMMENT ON COLUMN crawl.site_id IS 'Site that is being crawled'
----
COMMENT ON COLUMN crawl.is_full IS 'Whether the crawl is a full crawl; if not it is a feed crawl'
----
COMMENT ON COLUMN crawl.t_begin IS 'Begin time of the crawl'
----
COMMENT ON COLUMN crawl.t_end IS 'End time of the crawl; if t_end is null resuming a crawl will fetch all resources with last_visit before t_begin'
----
COMMENT ON COLUMN crawl.n_resources IS 'Number of resources that were fetched during the crawl'
----
COMMENT ON COLUMN crawl.n_resources_new IS 'Number of new resources found during the crawl'
----
COMMENT ON TABLE resource IS 'Crawl of resources on a site'
----
CREATE TYPE site_annotation_type AS ENUM ('whitelist', 'blacklist', 'suggestion', 'review', 'audience', 'location', 'themes', 'timescale')
----
COMMENT ON TYPE site_annotation_type IS 'Type of site annotation'
----
CREATE TABLE site_annotation (
id bigserial PRIMARY KEY,
site_id bigint REFERENCES site(id) ON DELETE SET NULL,
base_url varchar(200) NOT NULL,
ann_type site_annotation_type NOT NULL,
ann_content JSONB,
t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc')
)
----
CREATE INDEX site_annotation__site ON site_annotation (site_id)
----
CREATE INDEX site_annotation__base_url ON site_annotation (base_url)
----
COMMENT ON COLUMN site_annotation.site_id IS 'Site that is being annotated'
----
COMMENT ON COLUMN site_annotation.base_url IS 'Base URL of the site being annotated'
----
COMMENT ON COLUMN site_annotation.ann_type IS 'Annotation type'
----
COMMENT ON COLUMN site_annotation.ann_content IS 'Annotation content'
----
COMMENT ON COLUMN site_annotation.t_update IS 'Time of last update'
----
COMMENT ON TABLE site_annotation IS 'Manual annotations on a site'