about summary refs log tree commit diff stats
path: root/python/tests/test_ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests/test_ingest.py')
-rw-r--r-- python/tests/test_ingest.py 207
1 files changed, 207 insertions, 0 deletions
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
new file mode 100644
index 0000000..b51f721
--- /dev/null
+++ b/python/tests/test_ingest.py
@@ -0,0 +1,207 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler import *
+from test_wayback import *
+from test_savepagenow import *
+from test_grobid import REAL_TEI_XML
+
+
+@pytest.fixture
+def ingest_worker(wayback_client, spn_client):
+    """Return an IngestFileWorker wired to the given wayback and SPN clients.
+
+    GROBID is pointed at the placeholder host ``http://dummy-grobid``.
+    """
+    return IngestFileWorker(
+        wayback_client=wayback_client,
+        spn_client=spn_client,
+        grobid_client=GrobidClient(host_url="http://dummy-grobid"),
+    )
+
+@pytest.fixture
+def ingest_worker_pdf(wayback_client_pdf, spn_client):
+    """Return an IngestFileWorker that also carries a postgrest client.
+
+    Uses the ``wayback_client_pdf`` fixture; GROBID and postgrest both point
+    at placeholder hosts (``http://dummy-grobid``, ``http://dummy-postgrest``).
+    """
+    grobid = GrobidClient(host_url="http://dummy-grobid")
+    postgrest = SandcrawlerPostgrestClient(api_url="http://dummy-postgrest")
+    return IngestFileWorker(
+        wayback_client=wayback_client_pdf,
+        spn_client=spn_client,
+        grobid_client=grobid,
+        pgrest_client=postgrest,
+    )
+
+
+@responses.activate
+def test_ingest_success(ingest_worker_pdf):
+    """Check that a fully mocked PDF ingest is reported as a success.
+
+    All HTTP traffic is stubbed with ``responses``: the SPNv2 save/status
+    calls, the CDX lookup, the wayback body fetch (served from the local
+    dummy PDF), the two postgrest lookups and the GROBID fulltext call.
+    """
+    # NOTE: relative path -- pytest must be started from the directory holding tests/
+    with open('tests/files/dummy.pdf', 'rb') as f:
+        pdf_bytes = f.read()
+    # presumably the SHA-1 of dummy.pdf; it keys both postgrest lookups
+    sha1hex = "90ffd2359008d82298821d16b21778c5c39aec36"
+
+    request = {'ingest_type': 'pdf', 'base_url': "http://dummy-host/"}
+    responses.add(responses.POST,
+        'http://dummy-spnv2/save',
+        status=200,
+        body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+    # Same status URL registered twice: pending body first, then success.
+    for status_body in (PENDING_BODY, SUCCESS_BODY):
+        responses.add(responses.GET,
+            'http://dummy-spnv2/save/status/' + JOB_ID,
+            status=200,
+            body=json.dumps(status_body))
+    responses.add(responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_SPN_HIT))
+    responses.add(responses.GET,
+        'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+        status=200,
+        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+        body=pdf_bytes)
+    # Both postgrest tables answer with an empty result set.
+    for table in ('grobid', 'pdf_meta'):
+        responses.add(responses.GET,
+            'http://dummy-postgrest/{}?sha1hex=eq.{}'.format(table, sha1hex),
+            status=200,
+            body=json.dumps([]))
+    responses.add(responses.POST,
+        'http://dummy-grobid/api/processFulltextDocument', status=200,
+        body=REAL_TEI_XML, content_type='text/xml')
+
+    resp = ingest_worker_pdf.process(request)
+    print(resp)
+    assert resp['hit'] is True
+    assert resp['status'] == "success"
+    assert resp['request'] == request
+    assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
+    assert isinstance(resp['terminal']['terminal_dt'], str)
+    assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
+    assert resp['terminal']['terminal_status_code']
+    assert isinstance(resp['file_meta']['size_bytes'], int)
+    assert resp['file_meta']['mimetype'] == "application/pdf"
+    assert resp['cdx']['url'] == TARGET + "/redirect"
+    assert 'warc_path' not in resp['cdx']
+    assert 'revisit_cdx' not in resp
+    assert resp['grobid']['status'] == "success"
+    assert resp['grobid']['status_code'] == 200
+    assert resp['grobid']['grobid_version']
+    assert 'fatcat_release' in resp['grobid']
+    assert 'grobid_version' not in resp['grobid']['metadata']
+    assert 'fatcat_release' not in resp['grobid']['metadata']
+    assert 'tei_xml' not in resp['grobid']
+    assert resp['pdf_meta']['status'] == "success"
+    assert resp['pdf_meta']['pdf_extra']['page_count'] == 1
+    assert resp['pdf_meta'].get('text') is None
+
+@responses.activate
+def test_ingest_landing(ingest_worker):
+    """Check the landing-page path: the worker should miss with ``no-pdf-link``.
+
+    The same wayback URL is stubbed twice: first with ``WARC_BODY``, then
+    with an empty HTML document.
+    """
+    request = {'ingest_type': 'pdf', 'base_url': "http://dummy-host/"}
+    wayback_url = 'https://web.archive.org/web/{}id_/{}'.format(
+        "20180326070330", TARGET + "/redirect")
+
+    responses.add(responses.POST,
+        'http://dummy-spnv2/save',
+        status=200,
+        body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+    for status_body in (PENDING_BODY, SUCCESS_BODY):
+        responses.add(responses.GET,
+            'http://dummy-spnv2/save/status/' + JOB_ID,
+            status=200,
+            body=json.dumps(status_body))
+    responses.add(responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_SPN_HIT))
+    responses.add(responses.GET,
+        wayback_url,
+        status=200,
+        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+        body=WARC_BODY)
+
+    # this is for second time around; don't want to fetch same landing page
+    # HTML again and result in a loop
+    responses.add(responses.GET,
+        wayback_url,
+        status=200,
+        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+        body="<html></html>")
+
+    resp = ingest_worker.process(request)
+    print(resp)
+    assert resp['hit'] is False
+    assert resp['status'] == "no-pdf-link"
+    assert resp['request'] == request
+    assert 'terminal' in resp
+    assert 'file_meta' not in resp
+    assert 'cdx' not in resp
+    assert 'revisit_cdx' not in resp
+    assert 'grobid' not in resp
+
+@responses.activate
+def test_ingest_blocklist(ingest_worker):
+    """Check that a base URL containing a ``base_url_blocklist`` entry is skipped."""
+    # No HTTP mocks are registered, so any request made through ``requests``
+    # would raise instead of reaching the network.
+    ingest_worker.base_url_blocklist = ['://test.fatcat.wiki/']
+    request = {
+        'ingest_type': 'pdf',
+        'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+    }
+
+    resp = ingest_worker.process(request)
+
+    assert resp['hit'] is False
+    assert resp['status'] == "skip-url-blocklist"
+    assert resp['request'] == request
+
+
+@responses.activate
+def test_ingest_wall_blocklist(ingest_worker):
+    """Check that a base URL containing a ``wall_blocklist`` entry yields ``skip-wall``."""
+    # No HTTP mocks are registered, so any request made through ``requests``
+    # would raise instead of reaching the network.
+    ingest_worker.wall_blocklist = ['://test.fatcat.wiki/']
+    request = {
+        'ingest_type': 'pdf',
+        'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+    }
+
+    resp = ingest_worker.process(request)
+
+    assert resp['hit'] is False
+    assert resp['status'] == "skip-wall"
+    assert resp['request'] == request
+
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+    """Check that a ``cookieAbsent`` base URL is reported as ``blocked-cookie``."""
+    # No blocklist is set here; presumably IngestFileWorker ships a default one.
+    request = {
+        'ingest_type': 'pdf',
+        'base_url': "https://test.fatcat.wiki/cookieAbsent",
+    }
+    resp = ingest_worker.process(request)
+
+    assert resp['hit'] is False
+    assert resp['status'] == "blocked-cookie"
+    assert resp['request'] == request
+