about summary refs log tree commit diff stats
path: root/python/tests/test_ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests/test_ingest.py')
-rw-r--r-- python/tests/test_ingest.py 257
1 files changed, 164 insertions, 93 deletions
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 46346b7..e14a452 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -1,12 +1,12 @@
-
import json
+
import pytest
import responses
+from test_grobid import REAL_TEI_XML
+from test_savepagenow import *
+from test_wayback import *
from sandcrawler import *
-from test_wayback import *
-from test_savepagenow import *
-from test_grobid import REAL_TEI_XML
@pytest.fixture
@@ -21,6 +21,7 @@ def ingest_worker(wayback_client, spn_client):
)
return worker
+
@pytest.fixture
def ingest_worker_pdf(wayback_client_pdf, spn_client):
grobid_client = GrobidClient(
@@ -41,153 +42,223 @@ def ingest_worker_pdf(wayback_client_pdf, spn_client):
@responses.activate
def test_ingest_success(ingest_worker_pdf):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=pdf_bytes)
- responses.add(responses.GET,
- 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ body=pdf_bytes,
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/grobid?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/pdf_meta?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
status=200,
- body=json.dumps([]))
- responses.add(responses.GET,
- 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
status=200,
- body=json.dumps([]))
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
resp = ingest_worker_pdf.process(request)
print(resp)
- assert resp['hit'] == True
- assert resp['status'] == "success"
- assert resp['request'] == request
- assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
- assert type(resp['terminal']['terminal_dt']) == str
- assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
- assert resp['terminal']['terminal_status_code']
- assert type(resp['file_meta']['size_bytes']) == int
- assert resp['file_meta']['mimetype'] == "application/pdf"
- assert resp['cdx']['url'] == TARGET + "/redirect"
- assert 'warc_path' not in resp['cdx']
- assert 'revisit_cdx' not in resp
- assert resp['grobid']['status'] == "success"
- assert resp['grobid']['status_code'] == 200
- assert resp['grobid']['grobid_version']
- assert 'fatcat_release' in resp['grobid']
- assert 'grobid_version' not in resp['grobid']['metadata']
- assert 'fatcat_release' not in resp['grobid']['metadata']
- assert not 'tei_xml' in resp['grobid']
- assert resp['pdf_meta']['status'] == "success"
- assert resp['pdf_meta']['pdf_extra']['page_count'] == 1
- assert resp['pdf_meta'].get('text') is None
+ assert resp["hit"] is True
+ assert resp["status"] == "success"
+ assert resp["request"] == request
+ assert resp["terminal"]["terminal_sha1hex"] == resp["file_meta"]["sha1hex"]
+ assert type(resp["terminal"]["terminal_dt"]) == str
+ assert resp["terminal"]["terminal_url"] == TARGET + "/redirect"
+ assert resp["terminal"]["terminal_status_code"]
+ assert type(resp["file_meta"]["size_bytes"]) == int
+ assert resp["file_meta"]["mimetype"] == "application/pdf"
+ assert resp["cdx"]["url"] == TARGET + "/redirect"
+ assert "warc_path" not in resp["cdx"]
+ assert "revisit_cdx" not in resp
+ assert resp["grobid"]["status"] == "success"
+ assert resp["grobid"]["status_code"] == 200
+ assert resp["grobid"]["grobid_version"]
+ assert "fatcat_release" in resp["grobid"]
+ assert "grobid_version" not in resp["grobid"]["metadata"]
+ assert "fatcat_release" not in resp["grobid"]["metadata"]
+ assert "tei_xml" not in resp["grobid"]
+ assert resp["pdf_meta"]["status"] == "success"
+ assert resp["pdf_meta"]["pdf_extra"]["page_count"] == 1
+ assert resp["pdf_meta"].get("text") is None
+
@responses.activate
def test_ingest_landing(ingest_worker):
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ body=WARC_BODY,
+ )
# this is for second time around; don't want to fetch same landing page
# HTML again and result in a loop
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body="<html></html>")
+ body="<html></html>",
+ )
resp = ingest_worker.process(request)
print(resp)
- assert resp['hit'] == False
- assert resp['status'] == "no-pdf-link"
- assert resp['request'] == request
- assert 'terminal' in resp
- assert 'file_meta' not in resp
- assert 'cdx' not in resp
- assert 'revisit_cdx' not in resp
- assert 'grobid' not in resp
+ assert resp["hit"] is False
+ assert resp["status"] == "no-pdf-link"
+ assert resp["request"] == request
+ assert "terminal" in resp
+ assert "file_meta" not in resp
+ assert "cdx" not in resp
+ assert "revisit_cdx" not in resp
+ assert "grobid" not in resp
+
@responses.activate
def test_ingest_blocklist(ingest_worker):
ingest_worker.base_url_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] == False
- assert resp['status'] == "skip-url-blocklist"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-url-blocklist"
+ assert resp["request"] == request
@responses.activate
def test_ingest_wall_blocklist(ingest_worker):
ingest_worker.wall_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] == False
- assert resp['status'] == "skip-wall"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-wall"
+ assert resp["request"] == request
+
+
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/cookieAbsent",
+ }
+
+ resp = ingest_worker.process(request)
+ assert resp["hit"] is False
+ assert resp["status"] == "blocked-cookie"
+ assert resp["request"] == request