1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
-- rows *may* be revisit records; indicated by mimetype == "warc/revisit"
-- records are implied to be 200 status (or 226 for ftp); either direct hits or
-- revisits
-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many
-- datetimes. import scripts should take efforts to reduce this sort of
-- duplication though. one row per *domain*/sha1hex pair is a good guideline.
-- all ingest result url/dt pairs should be included though.
-- any mimetype is allowed, but presumption should be that actual body is full
-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each
-- only a part of work). URLs that are parts of a fileset are allowed.
CREATE TABLE IF NOT EXISTS cdx (
url TEXT NOT NULL CHECK (octet_length(url) >= 1),
datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
-- sha1hex/cdx_sha1hex difference is intended to help with difference between
-- CDX hash (which is transport encoded body) vs. actual body. Probably need to
-- include both for all records?
sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
mimetype TEXT CHECK (octet_length(mimetype) >= 1),
-- TODO: enforce that only paths with '/' (item+file) should be included?
warc_path TEXT CHECK (octet_length(warc_path) >= 1),
warc_csize BIGINT,
warc_offset BIGINT,
row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
PRIMARY KEY(url, datetime)
);
CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
-- TODO: remove this index? not currently used
CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
-- TODO: require all fields. if mimetype unknown, should be octet-stream
CREATE TABLE IF NOT EXISTS file_meta (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
md5hex TEXT CHECK (octet_length(md5hex) = 32),
size_bytes BIGINT,
mimetype TEXT CHECK (octet_length(mimetype) >= 1)
);
CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex);
CREATE TABLE IF NOT EXISTS fatcat_file (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
file_ident TEXT CHECK (octet_length(file_ident) = 26),
first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
);
CREATE TABLE IF NOT EXISTS petabox (
item TEXT NOT NULL CHECK (octet_length(item) >= 1),
path TEXT NOT NULL CHECK (octet_length(path) >= 1),
sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
PRIMARY KEY(item, path)
);
CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
CREATE TABLE IF NOT EXISTS grobid (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
status_code INT NOT NULL,
status TEXT CHECK (octet_length(status) >= 1),
fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
-- extracted basic biblio metadata:
-- title
-- authors[]
-- full/display
-- given_name
-- surname
-- affiliation
-- year
-- journal_issn
-- journal_name
-- refs_count
metadata JSONB
);
-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
CREATE TABLE IF NOT EXISTS pdftrio (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
status_code INT NOT NULL,
status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1),
models_date DATE,
ensemble_score REAL,
bert_score REAL,
linear_score REAL,
image_score REAL
);
CREATE TABLE IF NOT EXISTS pdf_meta (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
has_page0_thumbnail BOOLEAN NOT NULL,
page_count INT CHECK (page_count >= 0),
word_count INT CHECK (word_count >= 0),
page0_height REAL CHECK (page0_height >= 0),
page0_width REAL CHECK (page0_width >= 0),
permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
pdf_created TIMESTAMP WITH TIME ZONE,
pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
metadata JSONB
-- maybe some analysis of available fields?
-- metadata JSON fields:
-- title
-- subject
-- author
-- creator
-- producer
-- CrossMarkDomains
-- doi
-- form
-- encrypted
);
CREATE TABLE IF NOT EXISTS ingest_request (
link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
ingest_request_source TEXT CHECK (octet_length(ingest_request_source) >= 1),
created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
release_stage TEXT CHECK (octet_length(release_stage) >= 1),
request JSONB,
-- request isn't required, but can stash extra fields there for import, eg:
-- ext_ids (source/source_id sometimes enough)
-- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
-- edit_extra
PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
);
CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
CREATE TABLE IF NOT EXISTS ingest_file_result (
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
hit BOOLEAN NOT NULL,
status TEXT CHECK (octet_length(terminal_url) >= 1),
terminal_url TEXT CHECK (octet_length(terminal_url) >= 1),
terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14),
terminal_status_code INT,
terminal_sha1hex TEXT CHECK (octet_length(terminal_sha1hex) = 40),
PRIMARY KEY (ingest_type, base_url)
);
CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
CREATE TABLE IF NOT EXISTS shadow (
shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
doi TEXT CHECK (octet_length(doi) >= 1),
pmid TEXT CHECK (octet_length(pmid) >= 1),
isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
PRIMARY KEY(shadow_corpus, shadow_id)
);
CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
|