1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
|
-- rows *may* be revisit records; indicated by mimetype == "warc/revisit"
-- records are implied to be 200 status (or 226 for ftp); either direct hits or
-- revisits
-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many
-- datetimes. import scripts should take efforts to reduce this sort of
-- duplication though. one row per *domain*/sha1hex pair is a good guideline.
-- all ingest result url/dt pairs should be included though.
-- any mimetype is allowed, but presumption should be that actual body is full
-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each
-- only a part of work). URLs that are parts of a fileset are allowed.
CREATE TABLE IF NOT EXISTS cdx (
url TEXT NOT NULL CHECK (octet_length(url) >= 1),
datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
-- sha1hex/cdx_sha1hex difference is intended to help with difference between
-- CDX hash (which is transport encoded body) vs. actual body. Probably need to
-- include both for all records?
sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
mimetype TEXT CHECK (octet_length(mimetype) >= 1),
-- TODO: enforce that only paths with '/' (item+file) should be included?
warc_path TEXT CHECK (octet_length(warc_path) >= 1),
warc_csize BIGINT,
warc_offset BIGINT,
row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
PRIMARY KEY(url, datetime)
);
CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
-- TODO: remove this index? not currently used
CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
-- TODO: require all fields. if mimetype unknown, should be octet-stream
CREATE TABLE IF NOT EXISTS file_meta (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
md5hex TEXT CHECK (octet_length(md5hex) = 32),
size_bytes BIGINT,
mimetype TEXT CHECK (octet_length(mimetype) >= 1)
);
CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex);
CREATE TABLE IF NOT EXISTS fatcat_file (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
file_ident TEXT CHECK (octet_length(file_ident) = 26),
first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
);
CREATE TABLE IF NOT EXISTS petabox (
item TEXT NOT NULL CHECK (octet_length(item) >= 1),
path TEXT NOT NULL CHECK (octet_length(path) >= 1),
sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
PRIMARY KEY(item, path)
);
CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
CREATE TABLE IF NOT EXISTS grobid (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
status_code INT NOT NULL,
status TEXT CHECK (octet_length(status) >= 1),
fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
-- extracted basic biblio metadata:
-- title
-- authors[]
-- full/display
-- given_name
-- surname
-- affiliation
-- year
-- journal_issn
-- journal_name
-- refs_count
metadata JSONB
);
-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
CREATE TABLE IF NOT EXISTS pdftrio (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
status_code INT NOT NULL,
status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1),
models_date DATE,
ensemble_score REAL,
bert_score REAL,
linear_score REAL,
image_score REAL
);
CREATE TABLE IF NOT EXISTS pdf_meta (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
has_page0_thumbnail BOOLEAN NOT NULL,
page_count INT CHECK (page_count >= 0),
word_count INT CHECK (word_count >= 0),
page0_height REAL CHECK (page0_height >= 0),
page0_width REAL CHECK (page0_width >= 0),
permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
pdf_created TIMESTAMP WITH TIME ZONE,
pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
metadata JSONB
-- maybe some analysis of available fields?
-- metadata JSON fields:
-- title
-- subject
-- author
-- creator
-- producer
-- CrossMarkDomains
-- doi
-- form
-- encrypted
);
CREATE TABLE IF NOT EXISTS html_meta (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
scope TEXT CHECK (octet_length(status) >= 1),
has_teixml BOOLEAN NOT NULL,
has_thumbnail BOOLEAN NOT NULL,
word_count INT CHECK (word_count >= 0),
biblio JSONB,
resources JSONB
-- biblio JSON fields are similar to fatcat release schema
-- resources JSON object is a list of objects with keys like webcapture CDX schema
);
CREATE TABLE IF NOT EXISTS ingest_request (
link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
ingest_request_source TEXT CHECK (octet_length(ingest_request_source) >= 1),
created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
release_stage TEXT CHECK (octet_length(release_stage) >= 1),
request JSONB,
-- request isn't required, but can stash extra fields there for import, eg:
-- ext_ids (source/source_id sometimes enough)
-- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
-- edit_extra
-- ingest type can be: pdf, xml, html
PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
);
CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
CREATE TABLE IF NOT EXISTS ingest_file_result (
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
hit BOOLEAN NOT NULL,
status TEXT CHECK (octet_length(status) >= 1),
terminal_url TEXT CHECK (octet_length(terminal_url) >= 1),
terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14),
terminal_status_code INT,
terminal_sha1hex TEXT CHECK (octet_length(terminal_sha1hex) = 40),
PRIMARY KEY (ingest_type, base_url)
);
CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
CREATE TABLE IF NOT EXISTS ingest_fileset_platform (
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
hit BOOLEAN NOT NULL,
status TEXT CHECK (octet_length(status) >= 1),
platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1),
platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1),
platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1),
ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1),
total_size BIGINT,
file_count BIGINT,
archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1),
archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1),
web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1),
web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14),
manifest JSONB,
-- list, similar to fatcat fileset manifest, plus extra:
-- status (str)
-- path (str)
-- size (int)
-- md5 (str)
-- sha1 (str)
-- sha256 (str)
-- mimetype (str)
-- extra (dict)
-- platform_url (str)
-- terminal_url (str)
-- terminal_dt (str)
PRIMARY KEY (ingest_type, base_url)
);
CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id);
CREATE TABLE IF NOT EXISTS shadow (
shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
doi TEXT CHECK (octet_length(doi) >= 1),
pmid TEXT CHECK (octet_length(pmid) >= 1),
isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
PRIMARY KEY(shadow_corpus, shadow_id)
);
CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
CREATE TABLE IF NOT EXISTS crossref (
doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
indexed TIMESTAMP WITH TIME ZONE NOT NULL,
record JSON NOT NULL,
PRIMARY KEY(doi)
);
CREATE TABLE IF NOT EXISTS grobid_refs (
source TEXT NOT NULL CHECK (octet_length(source) >= 1),
source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1),
source_ts TIMESTAMP WITH TIME ZONE,
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
refs_json JSONB NOT NULL,
PRIMARY KEY(source, source_id)
);
CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS
SELECT
crossref.doi as doi,
crossref.indexed as indexed,
crossref.record as record,
grobid_refs.source_ts as source_ts,
grobid_refs.refs_json as refs_json
FROM crossref
LEFT JOIN grobid_refs ON
grobid_refs.source_id = crossref.doi
AND grobid_refs.source = 'crossref';
|