about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-01-15 13:54:02 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-15 13:54:02 -0800
commit: d06fd45e3c86cb080ad7724f3fc7575750a9cd69 (patch)
tree: 12862548aa14870af3c710076a6df8441f2ddb4e /python
parent: 4d0224f3e73315ef4db39643e6d4851e4a466658 (diff)
download: sandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.tar.gz
download: sandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.zip
clarify ingest result schema and semantics
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 15
-rw-r--r-- python/tests/test_ingest.py 22
-rw-r--r-- python/tests/test_live_wayback.py 2
3 files changed, 32 insertions, 7 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index de5e957..53c4ccf 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -126,19 +126,21 @@ class IngestFileWorker(SandcrawlerWorker):
assert result_row['hit']
existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
- if not (existing_file_meta and existing_grobid):
+ existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt'])
+ if not (existing_file_meta and existing_grobid and existing_cdx):
raise NotImplementedError("partially-exsiting records not implemented yet")
- # TODO: CDX
result = {
'hit': result_row['hit'],
'status': "existing",
'request': request,
'grobid': existing_grobid,
'file_meta': existing_file_meta,
+ 'cdx': existing_cdx,
'terminal': {
'terminal_url': result_row['terminal_url'],
'terminal_dt': result_row['terminal_dt'],
'terminal_status_code': result_row['terminal_status_code'],
+ 'terminal_sha1hex': result_row['terminal_sha1hex'],
},
}
return result
@@ -174,7 +176,9 @@ class IngestFileWorker(SandcrawlerWorker):
if result['status'] == "success":
metadata = self.grobid_client.metadata(result)
if metadata:
- result.update(metadata)
+ result['metadata'] = self.grobid_client.metadata(result)
+ result['fatcat_release'] = result['metadata'].pop('fatcat_release', None)
+ result['grobid_version'] = result['metadata'].pop('grobid_version', None)
result.pop('tei_xml', None)
result.pop('file_meta', None)
result.pop('key', None)
@@ -282,6 +286,7 @@ class IngestFileWorker(SandcrawlerWorker):
"terminal_url": resource.terminal_url,
"terminal_dt": resource.terminal_dt,
"terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta['sha1hex'],
}
# fetch must be a hit if we got this far (though not necessarily an ingest hit!)
@@ -300,7 +305,9 @@ class IngestFileWorker(SandcrawlerWorker):
if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
# protocols.io PDFs are "application/octet-stream"
- if not (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
+ if (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
+ pass
+ else:
result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
return result
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 8f96a26..050e2ea 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -83,10 +83,22 @@ def test_ingest_success(ingest_worker_pdf):
assert resp['hit'] == True
assert resp['status'] == "success"
assert resp['request'] == request
- assert resp['file_meta']['size_bytes']
- assert resp['grobid']
+ assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
+ assert type(resp['terminal']['terminal_dt']) == str
+ assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
+ assert resp['terminal']['terminal_status_code']
+ assert type(resp['file_meta']['size_bytes']) == int
+ assert resp['file_meta']['mimetype'] == "application/pdf"
+ assert resp['cdx']['url'] == TARGET + "/redirect"
+ assert 'warc_path' not in resp['cdx']
+ assert 'revisit_cdx' not in resp
+ assert resp['grobid']['status'] == "success"
+ assert resp['grobid']['status_code'] == 200
+ assert resp['grobid']['grobid_version']
+ assert 'fatcat_release' in resp['grobid']
+ assert 'grobid_version' not in resp['grobid']['metadata']
+ assert 'fatcat_release' not in resp['grobid']['metadata']
assert not 'tei_xml' in resp['grobid']
- assert resp['terminal']
@responses.activate
def test_ingest_landing(ingest_worker):
@@ -131,5 +143,9 @@ def test_ingest_landing(ingest_worker):
assert resp['hit'] == False
assert resp['status'] == "no-pdf-link"
assert resp['request'] == request
+ assert 'terminal' in resp
+ assert 'file_meta' not in resp
+ assert 'cdx' not in resp
+ assert 'revisit_cdx' not in resp
assert 'grobid' not in resp
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 4f7daef..429c6b0 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -132,6 +132,8 @@ def test_lookup_ftp(wayback_client):
assert resp.terminal_url == url
assert resp.terminal_status_code == 226
assert resp.cdx.url == url
+ assert resp.revisit_cdx
+ assert resp.revisit_cdx.url != url
file_meta = gen_file_metadata(resp.body)
assert file_meta['sha1hex'] == resp.cdx.sha1hex