aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-15 13:54:02 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-15 13:54:02 -0800
commitd06fd45e3c86cb080ad7724f3fc7575750a9cd69 (patch)
tree12862548aa14870af3c710076a6df8441f2ddb4e
parent4d0224f3e73315ef4db39643e6d4851e4a466658 (diff)
downloadsandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.tar.gz
sandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.zip
clarify ingest result schema and semantics
-rw-r--r--proposals/2019_ingest.md57
-rw-r--r--python/sandcrawler/ingest.py15
-rw-r--r--python/tests/test_ingest.py22
-rw-r--r--python/tests/test_live_wayback.py2
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql16
5 files changed, 82 insertions, 30 deletions
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
index 751532a..0b569b0 100644
--- a/proposals/2019_ingest.md
+++ b/proposals/2019_ingest.md
@@ -112,29 +112,40 @@ HTML? Or both? Let's just recrawl.
- ...
*FileIngestResult*
- - request (object): the full IngestRequest, copied
- - terminal
- - url
- - status_code
- - wayback (XXX: ?)
- - datetime
- - archive_url
- - file_meta (same schema as sandcrawler-db table)
- - size_bytes
- - md5
- - sha1
- - sha256
- - mimetype
- - cdx (same schema as sandcrawler-db table)
- - grobid (same schema as sandcrawler-db table)
- - status
- - grobid_version
- - status_code
- - xml_url
- - fatcat_release (via biblio-glutton match)
- - metadata (JSON)
- - status (slug): 'success', 'error', etc
- - hit (boolean): whether we got something that looks like what was requested
+ - `request` (object): the full IngestRequest, copied
+ - `status` (slug): 'success', 'error', etc
+ - `hit` (boolean): whether we got something that looks like what was requested
+ - `terminal` (object): last crawled resource (if any)
+ - `terminal_url` (string; formerly `url`)
+ - `terminal_dt` (string): wayback capture datetime (string)
+ - `terminal_status_code`
+ - `terminal_sha1hex`: should match true `file_meta` SHA1 (not necessarily CDX SHA1)
+ (in case of transport encoding difference)
+ - `file_meta` (object): info about the terminal file
+ - same schema as sandcrawler-db table
+ - `size_bytes`
+ - `md5hex`
+ - `sha1hex`
+ - `sha256hex`
+    - `mimetype`: if not known, `application/octet-stream`
+ - `cdx`: CDX record matching terminal resource. *MAY* be a revisit or partial
+ record (eg, if via SPNv2)
+ - same schema as sandcrawler-db table
+- `revisit_cdx` (optional): if `cdx` is a revisit record, this will be the
+  best "original" location for retrieval of the body (matching `file_meta`)
+ - same schema as sandcrawler-db table
+ - `grobid`
+ - same schema as sandcrawler-db table
+ - `status` (string)
+ - `status_code` (int)
+ - `grobid_version` (string, from metadata)
+ - `fatcat_release` (string, from metadata)
+ - `metadata` (JSON) (with `grobid_version` and `fatcat_release` removed)
+ - NOT `tei_xml` (strip from reply)
+ - NOT `file_meta` (strip from reply)
+
+In general, it is the `terminal_dt` and `terminal_url` that should be used to
+construct wayback links (eg, for insertion to fatcat), not from the `cdx`.
## New SQL Tables
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index de5e957..53c4ccf 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -126,19 +126,21 @@ class IngestFileWorker(SandcrawlerWorker):
assert result_row['hit']
existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
- if not (existing_file_meta and existing_grobid):
+ existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt'])
+ if not (existing_file_meta and existing_grobid and existing_cdx):
raise NotImplementedError("partially-exsiting records not implemented yet")
- # TODO: CDX
result = {
'hit': result_row['hit'],
'status': "existing",
'request': request,
'grobid': existing_grobid,
'file_meta': existing_file_meta,
+ 'cdx': existing_cdx,
'terminal': {
'terminal_url': result_row['terminal_url'],
'terminal_dt': result_row['terminal_dt'],
'terminal_status_code': result_row['terminal_status_code'],
+ 'terminal_sha1hex': result_row['terminal_sha1hex'],
},
}
return result
@@ -174,7 +176,9 @@ class IngestFileWorker(SandcrawlerWorker):
if result['status'] == "success":
metadata = self.grobid_client.metadata(result)
if metadata:
- result.update(metadata)
+ result['metadata'] = self.grobid_client.metadata(result)
+ result['fatcat_release'] = result['metadata'].pop('fatcat_release', None)
+ result['grobid_version'] = result['metadata'].pop('grobid_version', None)
result.pop('tei_xml', None)
result.pop('file_meta', None)
result.pop('key', None)
@@ -282,6 +286,7 @@ class IngestFileWorker(SandcrawlerWorker):
"terminal_url": resource.terminal_url,
"terminal_dt": resource.terminal_dt,
"terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta['sha1hex'],
}
# fetch must be a hit if we got this far (though not necessarily an ingest hit!)
@@ -300,7 +305,9 @@ class IngestFileWorker(SandcrawlerWorker):
if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
# protocols.io PDFs are "application/octet-stream"
- if not (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
+ if (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
+ pass
+ else:
result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
return result
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 8f96a26..050e2ea 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -83,10 +83,22 @@ def test_ingest_success(ingest_worker_pdf):
assert resp['hit'] == True
assert resp['status'] == "success"
assert resp['request'] == request
- assert resp['file_meta']['size_bytes']
- assert resp['grobid']
+ assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
+ assert type(resp['terminal']['terminal_dt']) == str
+ assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
+ assert resp['terminal']['terminal_status_code']
+ assert type(resp['file_meta']['size_bytes']) == int
+ assert resp['file_meta']['mimetype'] == "application/pdf"
+ assert resp['cdx']['url'] == TARGET + "/redirect"
+ assert 'warc_path' not in resp['cdx']
+ assert 'revisit_cdx' not in resp
+ assert resp['grobid']['status'] == "success"
+ assert resp['grobid']['status_code'] == 200
+ assert resp['grobid']['grobid_version']
+ assert 'fatcat_release' in resp['grobid']
+ assert 'grobid_version' not in resp['grobid']['metadata']
+ assert 'fatcat_release' not in resp['grobid']['metadata']
assert not 'tei_xml' in resp['grobid']
- assert resp['terminal']
@responses.activate
def test_ingest_landing(ingest_worker):
@@ -131,5 +143,9 @@ def test_ingest_landing(ingest_worker):
assert resp['hit'] == False
assert resp['status'] == "no-pdf-link"
assert resp['request'] == request
+ assert 'terminal' in resp
+ assert 'file_meta' not in resp
+ assert 'cdx' not in resp
+ assert 'revisit_cdx' not in resp
assert 'grobid' not in resp
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 4f7daef..429c6b0 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -132,6 +132,8 @@ def test_lookup_ftp(wayback_client):
assert resp.terminal_url == url
assert resp.terminal_status_code == 226
assert resp.cdx.url == url
+ assert resp.revisit_cdx
+ assert resp.revisit_cdx.url != url
file_meta = gen_file_metadata(resp.body)
assert file_meta['sha1hex'] == resp.cdx.sha1hex
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 12ed409..0b2b19c 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -1,10 +1,24 @@
+-- rows *may* be revisit records; indicated by mimetype == "warc/revisit"
+-- records are implied to be 200 status (or 226 for ftp); either direct hits or
+-- revisits
+-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many
+-- datetimes. import scripts should take efforts to reduce this sort of
+-- duplication though. one row per *domain*/sha1hex pair is a good guideline.
+-- all ingest result url/dt pairs should be included though.
+-- any mimetype is allowed, but presumption should be that actual body is full
+-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each
+-- only a part of work). URLs that are parts of a fileset are allowed.
CREATE TABLE IF NOT EXISTS cdx (
url TEXT NOT NULL CHECK (octet_length(url) >= 1),
datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
+ -- sha1hex/cdx_sha1hex difference is intended to help with difference between
+ -- CDX hash (which is transport encoded body) vs. actual body. Probably need to
+ -- include both for all records?
sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+ -- TODO: enforce that only paths with '/' (item+file) should be included?
warc_path TEXT CHECK (octet_length(warc_path) >= 1),
warc_csize BIGINT,
warc_offset BIGINT,
@@ -12,8 +26,10 @@ CREATE TABLE IF NOT EXISTS cdx (
PRIMARY KEY(url, datetime)
);
CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
+-- TODO: remove this index? not currently used
CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
+-- TODO: require all fields. if mimetype unknown, should be octet-stream
CREATE TABLE IF NOT EXISTS file_meta (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
sha256hex TEXT CHECK (octet_length(sha256hex) = 64),