diff options
Diffstat (limited to 'python/tests/test_ingest.py')
-rw-r--r-- | python/tests/test_ingest.py | 264 |
1 files changed, 264 insertions, 0 deletions
"""Tests for ``IngestFileWorker.process`` against mocked HTTP services.

Every external endpoint the worker talks to in these tests (SPNv2, CDX,
wayback replay, postgrest, GROBID) is stubbed with the ``responses`` library,
so no real network traffic is expected.
"""

import json

import pytest
import responses
from test_grobid import REAL_TEI_XML
from test_savepagenow import *
from test_wayback import *

from sandcrawler import *

# NOTE(review): the fixtures ``wayback_client``, ``wayback_client_pdf`` and
# ``spn_client`` and the constants TARGET, JOB_ID, PENDING_BODY, SUCCESS_BODY,
# CDX_SPN_HIT and WARC_BODY are not defined in this file; presumably they
# arrive through the star imports above -- keep those imports in place.


def _register_spn_capture_mocks():
    """Register the mocked SPNv2 and CDX responses shared by the capture tests.

    Registration order is significant: the job-status URL is added twice, with
    the pending body first and the success body second, so that a client
    polling the job sees "pending" before "success".
    """
    responses.add(
        responses.GET,
        "http://dummy-spnv2/save/status/user",
        status=200,
        body=json.dumps(
            {
                "available": 23,
                "daily_captures": 60295,
                "daily_captures_limit": 300000,
                "processing": 1,
            }
        ),
    )
    responses.add(
        responses.POST,
        "http://dummy-spnv2/save",
        status=200,
        body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
    )
    for status_body in (PENDING_BODY, SUCCESS_BODY):
        responses.add(
            responses.GET,
            "http://dummy-spnv2/save/status/" + JOB_ID,
            status=200,
            body=json.dumps(status_body),
        )
    responses.add(
        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
    )


def _register_wayback_replay(body):
    """Register one mocked wayback replay response for the redirect target.

    Args:
        body: payload returned for the replay request (``bytes`` or ``str``).
    """
    responses.add(
        responses.GET,
        "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
        status=200,
        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
        body=body,
    )


@pytest.fixture
def ingest_worker(wayback_client, spn_client):
    """Return an ``IngestFileWorker`` wired to the wayback/SPN fixtures and a dummy GROBID host."""
    return IngestFileWorker(
        wayback_client=wayback_client,
        spn_client=spn_client,
        grobid_client=GrobidClient(host_url="http://dummy-grobid"),
    )


@pytest.fixture
def ingest_worker_pdf(wayback_client_pdf, spn_client):
    """Return an ``IngestFileWorker`` like ``ingest_worker``, but using the
    ``wayback_client_pdf`` fixture and an additional dummy postgrest client.
    """
    return IngestFileWorker(
        wayback_client=wayback_client_pdf,
        spn_client=spn_client,
        grobid_client=GrobidClient(host_url="http://dummy-grobid"),
        pgrest_client=SandcrawlerPostgrestClient(api_url="http://dummy-postgrest"),
    )


@responses.activate
def test_ingest_success(ingest_worker_pdf):
    """A PDF ingest request ends in ``success`` when replay returns the dummy PDF."""
    # Path is relative to the working directory; presumably the suite is run
    # from the ``python/`` directory -- TODO confirm.
    with open("tests/files/dummy.pdf", "rb") as f:
        pdf_bytes = f.read()

    request = {
        "ingest_type": "pdf",
        "base_url": "http://dummy-host/",
    }

    _register_spn_capture_mocks()
    _register_wayback_replay(pdf_bytes)

    # Both postgrest lookups return an empty result set. The sha1hex is
    # presumably that of dummy.pdf -- verify against the fixture file.
    for table in ("grobid", "pdf_meta"):
        responses.add(
            responses.GET,
            "http://dummy-postgrest/{}?sha1hex=eq.{}".format(
                table, "90ffd2359008d82298821d16b21778c5c39aec36"
            ),
            status=200,
            body=json.dumps([]),
        )
    responses.add(
        responses.POST,
        "http://dummy-grobid/api/processFulltextDocument",
        status=200,
        body=REAL_TEI_XML,
        content_type="text/xml",
    )

    resp = ingest_worker_pdf.process(request)

    print(resp)
    assert resp["hit"] is True
    assert resp["status"] == "success"
    assert resp["request"] == request

    terminal = resp["terminal"]
    assert terminal["terminal_sha1hex"] == resp["file_meta"]["sha1hex"]
    assert isinstance(terminal["terminal_dt"], str)
    assert terminal["terminal_url"] == TARGET + "/redirect"
    assert terminal["terminal_status_code"]

    assert isinstance(resp["file_meta"]["size_bytes"], int)
    assert resp["file_meta"]["mimetype"] == "application/pdf"

    assert resp["cdx"]["url"] == TARGET + "/redirect"
    assert "warc_path" not in resp["cdx"]
    assert "revisit_cdx" not in resp

    grobid = resp["grobid"]
    assert grobid["status"] == "success"
    assert grobid["status_code"] == 200
    assert grobid["grobid_version"]
    assert "fatcat_release" in grobid
    assert "grobid_version" not in grobid["metadata"]
    assert "fatcat_release" not in grobid["metadata"]
    assert "tei_xml" not in grobid

    pdf_meta = resp["pdf_meta"]
    assert pdf_meta["status"] == "success"
    assert pdf_meta["pdf_extra"]["page_count"] == 1
    assert pdf_meta.get("text") is None


@responses.activate
def test_ingest_landing(ingest_worker):
    """A capture whose replay body is not a PDF ends in ``no-pdf-link``."""
    request = {
        "ingest_type": "pdf",
        "base_url": "http://dummy-host/",
    }

    _register_spn_capture_mocks()
    _register_wayback_replay(WARC_BODY)

    # this is for second time around; don't want to fetch same landing page
    # HTML again and result in a loop
    _register_wayback_replay("<html></html>")

    resp = ingest_worker.process(request)

    print(resp)
    assert resp["hit"] is False
    assert resp["status"] == "no-pdf-link"
    assert resp["request"] == request
    assert "terminal" in resp
    for absent_key in ("file_meta", "cdx", "revisit_cdx", "grobid"):
        assert absent_key not in resp


@responses.activate
def test_ingest_blocklist(ingest_worker):
    """A base URL matching ``base_url_blocklist`` is skipped with ``skip-url-blocklist``."""
    ingest_worker.base_url_blocklist = [
        "://test.fatcat.wiki/",
    ]
    request = {
        "ingest_type": "pdf",
        "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
    }

    # No responses are registered, so any HTTP request made here would fail.
    resp = ingest_worker.process(request)

    assert resp["hit"] is False
    assert resp["status"] == "skip-url-blocklist"
    assert resp["request"] == request


@responses.activate
def test_ingest_wall_blocklist(ingest_worker):
    """A base URL matching ``wall_blocklist`` is skipped with ``skip-wall``."""
    ingest_worker.wall_blocklist = [
        "://test.fatcat.wiki/",
    ]
    request = {
        "ingest_type": "pdf",
        "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
    }

    # No responses are registered, so any HTTP request made here would fail.
    resp = ingest_worker.process(request)

    assert resp["hit"] is False
    assert resp["status"] == "skip-wall"
    assert resp["request"] == request


@responses.activate
def test_ingest_cookie_blocklist(ingest_worker):
    """A ``/cookieAbsent`` URL is rejected with ``blocked-cookie``.

    No blocklist is configured by this test; presumably the worker's default
    cookie blocklist matches the URL -- confirm against ``IngestFileWorker``.
    """
    request = {
        "ingest_type": "pdf",
        "base_url": "https://test.fatcat.wiki/cookieAbsent",
    }

    resp = ingest_worker.process(request)

    assert resp["hit"] is False
    assert resp["status"] == "blocked-cookie"
    assert resp["request"] == request