diff options
Diffstat (limited to 'python/tests/test_grobid.py')
-rw-r--r-- | python/tests/test_grobid.py | 199 |
1 files changed, 171 insertions, 28 deletions
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 36d90ef..dce64bc 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -1,17 +1,18 @@ +import json +import struct import pytest -import struct import responses +from test_wayback import cdx_client, wayback_client # noqa:F401 -from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient -from test_wayback import wayback_client, cdx_client - +from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) -with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f: +with open("tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml", "rb") as f: REAL_TEI_XML = f.read() + @pytest.fixture def grobid_client(): client = GrobidClient( @@ -19,61 +20,203 @@ def grobid_client(): ) return client + @responses.activate def test_grobid_503(grobid_client): status = b'{"status": "done broke due to 503"}' - responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=503, - body=status) + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=503, + body=status, + ) resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 - assert resp['status_code'] == 503 - assert resp['status'] == "error" + assert resp["status_code"] == 503 + assert resp["status"] == "error" + + +@responses.activate +def test_grobid_success_iso_8859(grobid_client): + """ + This might have been the old GROBID behavior, with default encoding? Can't really remember. + """ + + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=200, + body=REAL_TEI_XML, + content_type="text/xml", + ) + + resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) + + # grobid gets POST 1x times + assert len(responses.calls) == 1 + + assert resp["status_code"] == 200 + assert resp["status"] == "success" + # print(type(resp['tei_xml'])) + # print(type(REAL_TEI_XML)) + assert resp["tei_xml"] == REAL_TEI_XML.decode("ISO-8859-1") + @responses.activate def test_grobid_success(grobid_client): - responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=200, - body=REAL_TEI_XML, content_type='text/xml') + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=200, + body=REAL_TEI_XML, + content_type="application/xml; charset=UTF-8", + ) resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 - assert resp['status_code'] == 200 - assert resp['status'] == "success" - #print(type(resp['tei_xml'])) - #print(type(REAL_TEI_XML)) - assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1') + assert resp["status_code"] == 200 + assert resp["status"] == "success" + assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8") + @responses.activate -def test_grobid_worker_cdx(grobid_client, wayback_client): +def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811 sink = BlackholeSink() worker = GrobidWorker(grobid_client, wayback_client, sink=sink) - responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=200, - body=REAL_TEI_XML, content_type='text/xml') + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=200, + body=REAL_TEI_XML, + content_type="text/xml", + ) - with open('tests/files/example.cdx', 'r') as cdx_file: + with open("tests/files/example.cdx", "r") as cdx_file: pusher = CdxLinePusher( worker, cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], ) pusher_counts = pusher.run() - assert pusher_counts['total'] - assert pusher_counts['pushed'] == 7 - assert pusher_counts['pushed'] == worker.counts['total'] + assert pusher_counts["total"] + assert pusher_counts["pushed"] == 7 + assert pusher_counts["pushed"] == worker.counts["total"] + + assert len(responses.calls) == worker.counts["total"] + + +@responses.activate +def test_grobid_refs_978(grobid_client): + + with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f: + crossref_work = json.loads(f.read()) + + with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f: + xml_bytes = f.read() + assert "\u2013".encode("utf-8") in xml_bytes + responses.add( + responses.POST, + "http://dummy-grobid/api/processCitationList", + status=200, + body=xml_bytes, + content_type="application/xml; charset=UTF-8", + ) - assert len(responses.calls) == worker.counts['total'] + refs_row = grobid_client.crossref_refs(crossref_work) + # grobid gets POST 1x times + assert len(responses.calls) == 1 + + assert refs_row["source"] == "crossref" + assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4" + assert refs_row["source_ts"] == "2021-05-10T22:08:45Z" + refs = refs_row["refs_json"] + assert len(refs) == 3 + assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"]) + + # test case of no references + crossref_work["message"]["reference"] = [] + refs_row = grobid_client.crossref_refs(crossref_work) + + assert refs_row["source"] == "crossref" + assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4" + assert refs_row["source_ts"] == "2021-05-10T22:08:45Z" + assert len(refs_row["refs_json"]) == 0 + + # test that 'message' works also + refs_row = grobid_client.crossref_refs(crossref_work["message"]) + assert refs_row["source"] == "crossref" + assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4" + assert refs_row["source_ts"] == "2021-05-10T22:08:45Z" + assert len(refs_row["refs_json"]) == 0 + + # grobid gets no additional POST from the above empty queries + assert len(responses.calls) == 1 + + +@responses.activate +def test_grobid_refs_s104(grobid_client): + + # test another file + with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f: + crossref_work = json.loads(f.read()) + + with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f: + responses.add( + responses.POST, + "http://dummy-grobid/api/processCitationList", + status=200, + body=f.read(), + content_type="application/xml; charset=UTF-8", + ) + + refs_row = grobid_client.crossref_refs(crossref_work) + + # GROBID gets one more POST + assert len(responses.calls) == 1 + + assert refs_row["source"] == "crossref" + assert refs_row["source_id"] == "10.1017/s1047951103000064" + assert refs_row["source_ts"] == "2021-06-10T05:35:02Z" + refs = refs_row["refs_json"] + assert len(refs) == 24 + assert set([r["id"] for r in refs]) == set( + [ + "S1047951103000064_ref025", + "S1047951103000064_ref013", + "S1047951103000064_ref012", + "S1047951103000064_ref041", + "S1047951103000064_ref002", + "S1047951103000064_ref043", + "S1047951103000064_ref037", + "S1047951103000064_ref035", + "S1047951103000064_ref003", + "S1047951103000064_ref005", + "S1047951103000064_ref017", + "S1047951103000064_ref016", + "S1047951103000064_ref001", + "S1047951103000064_ref039", + "S1047951103000064_ref032", + "S1047951103000064_ref014", + "S1047951103000064_ref008", + "S1047951103000064_ref038", + "S1047951103000064_ref018", + "S1047951103000064_ref027", + "S1047951103000064_ref034", + "S1047951103000064_ref044", + "S1047951103000064_ref006", + "S1047951103000064_ref030", + ] + ) |