import json import struct import pytest import responses from test_wayback import cdx_client, wayback_client # noqa:F401 from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) with open("tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml", "rb") as f: REAL_TEI_XML = f.read() @pytest.fixture def grobid_client(): client = GrobidClient( host_url="http://dummy-grobid", ) return client @responses.activate def test_grobid_503(grobid_client): status = b'{"status": "done broke due to 503"}' responses.add( responses.POST, "http://dummy-grobid/api/processFulltextDocument", status=503, body=status, ) resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 assert resp["status_code"] == 503 assert resp["status"] == "error" @responses.activate def test_grobid_success_iso_8859(grobid_client): """ This might have been the old GROBID behavior, with default encoding? Can't really remember. """ responses.add( responses.POST, "http://dummy-grobid/api/processFulltextDocument", status=200, body=REAL_TEI_XML, content_type="text/xml", ) resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 assert resp["status_code"] == 200 assert resp["status"] == "success" # print(type(resp['tei_xml'])) # print(type(REAL_TEI_XML)) assert resp["tei_xml"] == REAL_TEI_XML.decode("ISO-8859-1") @responses.activate def test_grobid_success(grobid_client): responses.add( responses.POST, "http://dummy-grobid/api/processFulltextDocument", status=200, body=REAL_TEI_XML, content_type="application/xml; charset=UTF-8", ) resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 assert resp["status_code"] == 200 assert resp["status"] == "success" assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8") @responses.activate def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811 sink = BlackholeSink() worker = GrobidWorker(grobid_client, wayback_client, sink=sink) responses.add( responses.POST, "http://dummy-grobid/api/processFulltextDocument", status=200, body=REAL_TEI_XML, content_type="text/xml", ) with open("tests/files/example.cdx", "r") as cdx_file: pusher = CdxLinePusher( worker, cdx_file, filter_http_statuses=[200, 226], filter_mimetypes=["application/pdf"], ) pusher_counts = pusher.run() assert pusher_counts["total"] assert pusher_counts["pushed"] == 7 assert pusher_counts["pushed"] == worker.counts["total"] assert len(responses.calls) == worker.counts["total"] @responses.activate def test_grobid_refs_978(grobid_client): with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f: crossref_work = json.loads(f.read()) with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f: xml_bytes = f.read() assert "\u2013".encode("utf-8") in xml_bytes responses.add( responses.POST, "http://dummy-grobid/api/processCitationList", status=200, body=xml_bytes, content_type="application/xml; charset=UTF-8", ) refs_row = grobid_client.crossref_refs(crossref_work) # grobid gets POST 1x times assert len(responses.calls) == 1 assert refs_row["source"] == "crossref" assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4" assert refs_row["source_ts"] == "2021-05-10T22:08:45Z" refs = refs_row["refs_json"] assert len(refs) == 3 assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"]) # test case of no references crossref_work["message"]["reference"] = [] refs_row = grobid_client.crossref_refs(crossref_work) assert refs_row["source"] == "crossref" assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4" assert refs_row["source_ts"] == "2021-05-10T22:08:45Z" assert len(refs_row["refs_json"]) == 0 # test that 'message' works also refs_row = grobid_client.crossref_refs(crossref_work["message"]) assert refs_row["source"] == "crossref" assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4" assert refs_row["source_ts"] == "2021-05-10T22:08:45Z" assert len(refs_row["refs_json"]) == 0 # grobid gets no additional POST from the above empty queries assert len(responses.calls) == 1 @responses.activate def test_grobid_refs_s104(grobid_client): # test another file with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f: crossref_work = json.loads(f.read()) with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f: responses.add( responses.POST, "http://dummy-grobid/api/processCitationList", status=200, body=f.read(), content_type="application/xml; charset=UTF-8", ) refs_row = grobid_client.crossref_refs(crossref_work) # GROBID gets one more POST assert len(responses.calls) == 1 assert refs_row["source"] == "crossref" assert refs_row["source_id"] == "10.1017/s1047951103000064" assert refs_row["source_ts"] == "2021-06-10T05:35:02Z" refs = refs_row["refs_json"] assert len(refs) == 24 assert set([r["id"] for r in refs]) == set( [ "S1047951103000064_ref025", "S1047951103000064_ref013", "S1047951103000064_ref012", "S1047951103000064_ref041", "S1047951103000064_ref002", "S1047951103000064_ref043", "S1047951103000064_ref037", "S1047951103000064_ref035", "S1047951103000064_ref003", "S1047951103000064_ref005", "S1047951103000064_ref017", "S1047951103000064_ref016", "S1047951103000064_ref001", "S1047951103000064_ref039", "S1047951103000064_ref032", "S1047951103000064_ref014", "S1047951103000064_ref008", "S1047951103000064_ref038", "S1047951103000064_ref018", "S1047951103000064_ref027", "S1047951103000064_ref034", "S1047951103000064_ref044", "S1047951103000064_ref006", "S1047951103000064_ref030", ] )