From 826c7538e091fac14d987a3cd654975da964e240 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 27 Oct 2021 18:50:17 -0700 Subject: make fmt (black 21.9b0) --- python/tests/test_ingest.py | 273 ++++++++++++++++++++++++-------------------- 1 file changed, 147 insertions(+), 126 deletions(-) (limited to 'python/tests/test_ingest.py') diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 617f2b4..ad8c22e 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -11,7 +11,9 @@ from sandcrawler import * @pytest.fixture def ingest_worker(wayback_client, spn_client): - grobid_client = GrobidClient(host_url="http://dummy-grobid", ) + grobid_client = GrobidClient( + host_url="http://dummy-grobid", + ) worker = IngestFileWorker( wayback_client=wayback_client, spn_client=spn_client, @@ -22,8 +24,12 @@ def ingest_worker(wayback_client, spn_client): @pytest.fixture def ingest_worker_pdf(wayback_client_pdf, spn_client): - grobid_client = GrobidClient(host_url="http://dummy-grobid", ) - pgrest_client = SandcrawlerPostgrestClient(api_url="http://dummy-postgrest", ) + grobid_client = GrobidClient( + host_url="http://dummy-grobid", + ) + pgrest_client = SandcrawlerPostgrestClient( + api_url="http://dummy-postgrest", + ) worker = IngestFileWorker( wayback_client=wayback_client_pdf, spn_client=spn_client, @@ -36,182 +42,197 @@ def ingest_worker_pdf(wayback_client_pdf, spn_client): @responses.activate def test_ingest_success(ingest_worker_pdf): - with open('tests/files/dummy.pdf', 'rb') as f: + with open("tests/files/dummy.pdf", "rb") as f: pdf_bytes = f.read() request = { - 'ingest_type': 'pdf', - 'base_url': "http://dummy-host/", + "ingest_type": "pdf", + "base_url": "http://dummy-host/", } - responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({ - "url": TARGET, - "job_id": JOB_ID - })) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) - responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) - responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", - TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=pdf_bytes) - responses.add(responses.GET, - 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format( - "90ffd2359008d82298821d16b21778c5c39aec36"), - status=200, - body=json.dumps([])) - responses.add(responses.GET, - 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format( - "90ffd2359008d82298821d16b21778c5c39aec36"), - status=200, - body=json.dumps([])) - responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', - status=200, - body=REAL_TEI_XML, - content_type='text/xml') + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY), + ) + responses.add( + responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT) + ) + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=pdf_bytes, + ) + responses.add( + responses.GET, + "http://dummy-postgrest/grobid?sha1hex=eq.{}".format( + "90ffd2359008d82298821d16b21778c5c39aec36" + ), + status=200, + body=json.dumps([]), + ) + responses.add( + responses.GET, + "http://dummy-postgrest/pdf_meta?sha1hex=eq.{}".format( + "90ffd2359008d82298821d16b21778c5c39aec36" + ), + status=200, + body=json.dumps([]), + ) + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=200, + body=REAL_TEI_XML, + content_type="text/xml", + ) resp = ingest_worker_pdf.process(request) print(resp) - assert resp['hit'] is True - assert resp['status'] == "success" - assert resp['request'] == request - assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex'] - assert type(resp['terminal']['terminal_dt']) == str - assert resp['terminal']['terminal_url'] == TARGET + "/redirect" - assert resp['terminal']['terminal_status_code'] - assert type(resp['file_meta']['size_bytes']) == int - assert resp['file_meta']['mimetype'] == "application/pdf" - assert resp['cdx']['url'] == TARGET + "/redirect" - assert 'warc_path' not in resp['cdx'] - assert 'revisit_cdx' not in resp - assert resp['grobid']['status'] == "success" - assert resp['grobid']['status_code'] == 200 - assert resp['grobid']['grobid_version'] - assert 'fatcat_release' in resp['grobid'] - assert 'grobid_version' not in resp['grobid']['metadata'] - assert 'fatcat_release' not in resp['grobid']['metadata'] - assert 'tei_xml' not in resp['grobid'] - assert resp['pdf_meta']['status'] == "success" - assert resp['pdf_meta']['pdf_extra']['page_count'] == 1 - assert resp['pdf_meta'].get('text') is None + assert resp["hit"] is True + assert resp["status"] == "success" + assert resp["request"] == request + assert resp["terminal"]["terminal_sha1hex"] == resp["file_meta"]["sha1hex"] + assert type(resp["terminal"]["terminal_dt"]) == str + assert resp["terminal"]["terminal_url"] == TARGET + "/redirect" + assert resp["terminal"]["terminal_status_code"] + assert type(resp["file_meta"]["size_bytes"]) == int + assert resp["file_meta"]["mimetype"] == "application/pdf" + assert resp["cdx"]["url"] == TARGET + "/redirect" + assert "warc_path" not in resp["cdx"] + assert "revisit_cdx" not in resp + assert resp["grobid"]["status"] == "success" + assert resp["grobid"]["status_code"] == 200 + assert resp["grobid"]["grobid_version"] + assert "fatcat_release" in resp["grobid"] + assert "grobid_version" not in resp["grobid"]["metadata"] + assert "fatcat_release" not in resp["grobid"]["metadata"] + assert "tei_xml" not in resp["grobid"] + assert resp["pdf_meta"]["status"] == "success" + assert resp["pdf_meta"]["pdf_extra"]["page_count"] == 1 + assert resp["pdf_meta"].get("text") is None @responses.activate def test_ingest_landing(ingest_worker): request = { - 'ingest_type': 'pdf', - 'base_url': "http://dummy-host/", + "ingest_type": "pdf", + "base_url": "http://dummy-host/", } - responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({ - "url": TARGET, - "job_id": JOB_ID - })) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) - responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) - responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", - TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=WARC_BODY) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY), + ) + responses.add( + responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT) + ) + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=WARC_BODY, + ) # this is for second time around; don't want to fetch same landing page # HTML again and result in a loop - responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", - TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body="") + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body="", + ) resp = ingest_worker.process(request) print(resp) - assert resp['hit'] is False - assert resp['status'] == "no-pdf-link" - assert resp['request'] == request - assert 'terminal' in resp - assert 'file_meta' not in resp - assert 'cdx' not in resp - assert 'revisit_cdx' not in resp - assert 'grobid' not in resp + assert resp["hit"] is False + assert resp["status"] == "no-pdf-link" + assert resp["request"] == request + assert "terminal" in resp + assert "file_meta" not in resp + assert "cdx" not in resp + assert "revisit_cdx" not in resp + assert "grobid" not in resp @responses.activate def test_ingest_blocklist(ingest_worker): ingest_worker.base_url_blocklist = [ - '://test.fatcat.wiki/', + "://test.fatcat.wiki/", ] request = { - 'ingest_type': 'pdf', - 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf", + "ingest_type": "pdf", + "base_url": "https://test.fatcat.wiki/asdfasdf.pdf", } resp = ingest_worker.process(request) - assert resp['hit'] is False - assert resp['status'] == "skip-url-blocklist" - assert resp['request'] == request + assert resp["hit"] is False + assert resp["status"] == "skip-url-blocklist" + assert resp["request"] == request @responses.activate def test_ingest_wall_blocklist(ingest_worker): ingest_worker.wall_blocklist = [ - '://test.fatcat.wiki/', + "://test.fatcat.wiki/", ] request = { - 'ingest_type': 'pdf', - 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf", + "ingest_type": "pdf", + "base_url": "https://test.fatcat.wiki/asdfasdf.pdf", } resp = ingest_worker.process(request) - assert resp['hit'] is False - assert resp['status'] == "skip-wall" - assert resp['request'] == request + assert resp["hit"] is False + assert resp["status"] == "skip-wall" + assert resp["request"] == request @responses.activate def test_ingest_cookie_blocklist(ingest_worker): request = { - 'ingest_type': 'pdf', - 'base_url': "https://test.fatcat.wiki/cookieAbsent", + "ingest_type": "pdf", + "base_url": "https://test.fatcat.wiki/cookieAbsent", } resp = ingest_worker.process(request) - assert resp['hit'] is False - assert resp['status'] == "blocked-cookie" - assert resp['request'] == request + assert resp["hit"] is False + assert resp["status"] == "blocked-cookie" + assert resp["request"] == request -- cgit v1.2.3