From d06fd45e3c86cb080ad7724f3fc7575750a9cd69 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 Jan 2020 13:54:02 -0800 Subject: clarify ingest result schema and semantics --- python/sandcrawler/ingest.py | 15 +++++++++++---- python/tests/test_ingest.py | 22 +++++++++++++++++++--- python/tests/test_live_wayback.py | 2 ++ 3 files changed, 32 insertions(+), 7 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index de5e957..53c4ccf 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -126,19 +126,21 @@ class IngestFileWorker(SandcrawlerWorker): assert result_row['hit'] existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) - if not (existing_file_meta and existing_grobid): + existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt']) + if not (existing_file_meta and existing_grobid and existing_cdx): raise NotImplementedError("partially-exsiting records not implemented yet") - # TODO: CDX result = { 'hit': result_row['hit'], 'status': "existing", 'request': request, 'grobid': existing_grobid, 'file_meta': existing_file_meta, + 'cdx': existing_cdx, 'terminal': { 'terminal_url': result_row['terminal_url'], 'terminal_dt': result_row['terminal_dt'], 'terminal_status_code': result_row['terminal_status_code'], + 'terminal_sha1hex': result_row['terminal_sha1hex'], }, } return result @@ -174,7 +176,9 @@ class IngestFileWorker(SandcrawlerWorker): if result['status'] == "success": metadata = self.grobid_client.metadata(result) if metadata: - result.update(metadata) + result['metadata'] = self.grobid_client.metadata(result) + result['fatcat_release'] = result['metadata'].pop('fatcat_release', None) + result['grobid_version'] = result['metadata'].pop('grobid_version', None) result.pop('tei_xml', None) result.pop('file_meta', None) result.pop('key', None) @@ -282,6 +286,7 @@ class IngestFileWorker(SandcrawlerWorker): "terminal_url": resource.terminal_url, "terminal_dt": resource.terminal_dt, "terminal_status_code": resource.terminal_status_code, + "terminal_sha1hex": file_meta['sha1hex'], } # fetch must be a hit if we got this far (though not necessarily an ingest hit!) @@ -300,7 +305,9 @@ class IngestFileWorker(SandcrawlerWorker): if not (resource.hit and file_meta['mimetype'] == "application/pdf"): # protocols.io PDFs are "application/octet-stream" - if not (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url): + if (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url): + pass + else: result['status'] = "wrong-mimetype" # formerly: "other-mimetype" return result diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 8f96a26..050e2ea 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -83,10 +83,22 @@ def test_ingest_success(ingest_worker_pdf): assert resp['hit'] == True assert resp['status'] == "success" assert resp['request'] == request - assert resp['file_meta']['size_bytes'] - assert resp['grobid'] + assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex'] + assert type(resp['terminal']['terminal_dt']) == str + assert resp['terminal']['terminal_url'] == TARGET + "/redirect" + assert resp['terminal']['terminal_status_code'] + assert type(resp['file_meta']['size_bytes']) == int + assert resp['file_meta']['mimetype'] == "application/pdf" + assert resp['cdx']['url'] == TARGET + "/redirect" + assert 'warc_path' not in resp['cdx'] + assert 'revisit_cdx' not in resp + assert resp['grobid']['status'] == "success" + assert resp['grobid']['status_code'] == 200 + assert resp['grobid']['grobid_version'] + assert 'fatcat_release' in resp['grobid'] + assert 'grobid_version' not in resp['grobid']['metadata'] + assert 'fatcat_release' not in resp['grobid']['metadata'] assert not 'tei_xml' in resp['grobid'] - assert resp['terminal'] @responses.activate def test_ingest_landing(ingest_worker): @@ -131,5 +143,9 @@ def test_ingest_landing(ingest_worker): assert resp['hit'] == False assert resp['status'] == "no-pdf-link" assert resp['request'] == request + assert 'terminal' in resp + assert 'file_meta' not in resp + assert 'cdx' not in resp + assert 'revisit_cdx' not in resp assert 'grobid' not in resp diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index 4f7daef..429c6b0 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -132,6 +132,8 @@ def test_lookup_ftp(wayback_client): assert resp.terminal_url == url assert resp.terminal_status_code == 226 assert resp.cdx.url == url + assert resp.revisit_cdx + assert resp.revisit_cdx.url != url file_meta = gen_file_metadata(resp.body) assert file_meta['sha1hex'] == resp.cdx.sha1hex -- cgit v1.2.3