diff options
Diffstat (limited to 'python/tests/test_ingest.py')
-rw-r--r-- | python/tests/test_ingest.py | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py new file mode 100644 index 0000000..b51f721 --- /dev/null +++ b/python/tests/test_ingest.py @@ -0,0 +1,207 @@ + +import json +import pytest +import responses + +from sandcrawler import * +from test_wayback import * +from test_savepagenow import * +from test_grobid import REAL_TEI_XML + + +@pytest.fixture +def ingest_worker(wayback_client, spn_client): + grobid_client = GrobidClient( + host_url="http://dummy-grobid", + ) + worker = IngestFileWorker( + wayback_client=wayback_client, + spn_client=spn_client, + grobid_client=grobid_client, + ) + return worker + +@pytest.fixture +def ingest_worker_pdf(wayback_client_pdf, spn_client): + grobid_client = GrobidClient( + host_url="http://dummy-grobid", + ) + pgrest_client = SandcrawlerPostgrestClient( + api_url="http://dummy-postgrest", + ) + worker = IngestFileWorker( + wayback_client=wayback_client_pdf, + spn_client=spn_client, + grobid_client=grobid_client, + pgrest_client=pgrest_client, + ) + return worker + + +@responses.activate +def test_ingest_success(ingest_worker_pdf): + + with open('tests/files/dummy.pdf', 'rb') as f: + pdf_bytes = f.read() + + request = { + 'ingest_type': 'pdf', + 'base_url': "http://dummy-host/", + } + responses.add(responses.POST, + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SPN_HIT)) + responses.add(responses.GET, + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=pdf_bytes) + responses.add(responses.GET, + 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"), + status=200, + body=json.dumps([])) + responses.add(responses.GET, + 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"), + status=200, + body=json.dumps([])) + responses.add(responses.POST, + 'http://dummy-grobid/api/processFulltextDocument', status=200, + body=REAL_TEI_XML, content_type='text/xml') + + resp = ingest_worker_pdf.process(request) + + print(resp) + assert resp['hit'] == True + assert resp['status'] == "success" + assert resp['request'] == request + assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex'] + assert type(resp['terminal']['terminal_dt']) == str + assert resp['terminal']['terminal_url'] == TARGET + "/redirect" + assert resp['terminal']['terminal_status_code'] + assert type(resp['file_meta']['size_bytes']) == int + assert resp['file_meta']['mimetype'] == "application/pdf" + assert resp['cdx']['url'] == TARGET + "/redirect" + assert 'warc_path' not in resp['cdx'] + assert 'revisit_cdx' not in resp + assert resp['grobid']['status'] == "success" + assert resp['grobid']['status_code'] == 200 + assert resp['grobid']['grobid_version'] + assert 'fatcat_release' in resp['grobid'] + assert 'grobid_version' not in resp['grobid']['metadata'] + assert 'fatcat_release' not in resp['grobid']['metadata'] + assert not 'tei_xml' in resp['grobid'] + assert resp['pdf_meta']['status'] == "success" + assert resp['pdf_meta']['pdf_extra']['page_count'] == 1 + assert resp['pdf_meta'].get('text') is None + +@responses.activate +def test_ingest_landing(ingest_worker): + + request = { + 'ingest_type': 'pdf', + 'base_url': "http://dummy-host/", + } + responses.add(responses.POST, + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SPN_HIT)) + responses.add(responses.GET, + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=WARC_BODY) + + # this is for second time around; don't want to fetch same landing page + # HTML again and result in a loop + responses.add(responses.GET, + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body="<html></html>") + + resp = ingest_worker.process(request) + + print(resp) + assert resp['hit'] == False + assert resp['status'] == "no-pdf-link" + assert resp['request'] == request + assert 'terminal' in resp + assert 'file_meta' not in resp + assert 'cdx' not in resp + assert 'revisit_cdx' not in resp + assert 'grobid' not in resp + +@responses.activate +def test_ingest_blocklist(ingest_worker): + + ingest_worker.base_url_blocklist = [ + '://test.fatcat.wiki/', + ] + request = { + 'ingest_type': 'pdf', + 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf", + } + + resp = ingest_worker.process(request) + + assert resp['hit'] == False + assert resp['status'] == "skip-url-blocklist" + assert resp['request'] == request + + +@responses.activate +def test_ingest_wall_blocklist(ingest_worker): + + ingest_worker.wall_blocklist = [ + '://test.fatcat.wiki/', + ] + request = { + 'ingest_type': 'pdf', + 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf", + } + + resp = ingest_worker.process(request) + + assert resp['hit'] == False + assert resp['status'] == "skip-wall" + assert resp['request'] == request + +@responses.activate +def test_ingest_cookie_blocklist(ingest_worker): + + request = { + 'ingest_type': 'pdf', + 'base_url': "https://test.fatcat.wiki/cookieAbsent", + } + + resp = ingest_worker.process(request) + + assert resp['hit'] == False + assert resp['status'] == "blocked-cookie" + assert resp['request'] == request + |