diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2021-10-29 12:16:02 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 | 
| commit | d3fa74e941aa11f79cee2d0adcb5cbc70884ef48 (patch) | |
| tree | d9e50a33686ce678016f8f573d3d3995ed966a5f /python/tests/test_grobid.py | |
| parent | 13d2dc55d5be4d9579a4a242ce251cdbc82730aa (diff) | |
| download | sandcrawler-d3fa74e941aa11f79cee2d0adcb5cbc70884ef48.tar.gz sandcrawler-d3fa74e941aa11f79cee2d0adcb5cbc70884ef48.zip | |
initial crossref-refs via GROBID helper routine
Diffstat (limited to 'python/tests/test_grobid.py')
| -rw-r--r-- | python/tests/test_grobid.py | 132 | 
1 files changed, 131 insertions, 1 deletions
```diff
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index c086d73..dce64bc 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,3 +1,4 @@
+import json
 import struct
 
 import pytest
@@ -41,7 +42,10 @@ def test_grobid_503(grobid_client):
 
 
 @responses.activate
-def test_grobid_success(grobid_client):
+def test_grobid_success_iso_8859(grobid_client):
+    """
+    This might have been the old GROBID behavior, with default encoding? Can't really remember.
+    """
 
     responses.add(
         responses.POST,
@@ -64,6 +68,27 @@ def test_grobid_success(grobid_client):
 
 
 @responses.activate
+def test_grobid_success(grobid_client):
+
+    responses.add(
+        responses.POST,
+        "http://dummy-grobid/api/processFulltextDocument",
+        status=200,
+        body=REAL_TEI_XML,
+        content_type="application/xml; charset=UTF-8",
+    )
+
+    resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
+
+    # grobid gets POST 1x times
+    assert len(responses.calls) == 1
+
+    assert resp["status_code"] == 200
+    assert resp["status"] == "success"
+    assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8")
+
+
+@responses.activate
 def test_grobid_worker_cdx(grobid_client, wayback_client):  # noqa: F811
 
     sink = BlackholeSink()
@@ -90,3 +115,108 @@ def test_grobid_worker_cdx(grobid_client, wayback_client):  # noqa: F811
         assert pusher_counts["pushed"] == worker.counts["total"]
 
     assert len(responses.calls) == worker.counts["total"]
+
+
+@responses.activate
+def test_grobid_refs_978(grobid_client):
+
+    with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f:
+        crossref_work = json.loads(f.read())
+
+    with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f:
+        xml_bytes = f.read()
+        assert "\u2013".encode("utf-8") in xml_bytes
+        responses.add(
+            responses.POST,
+            "http://dummy-grobid/api/processCitationList",
+            status=200,
+            body=xml_bytes,
+            content_type="application/xml; charset=UTF-8",
+        )
+
+    refs_row = grobid_client.crossref_refs(crossref_work)
+
+    # grobid gets POST 1x times
+    assert len(responses.calls) == 1
+
+    assert refs_row["source"] == "crossref"
+    assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+    assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+    refs = refs_row["refs_json"]
+    assert len(refs) == 3
+    assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"])
+
+    # test case of no references
+    crossref_work["message"]["reference"] = []
+    refs_row = grobid_client.crossref_refs(crossref_work)
+
+    assert refs_row["source"] == "crossref"
+    assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+    assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+    assert len(refs_row["refs_json"]) == 0
+
+    # test that 'message' works also
+    refs_row = grobid_client.crossref_refs(crossref_work["message"])
+    assert refs_row["source"] == "crossref"
+    assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+    assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+    assert len(refs_row["refs_json"]) == 0
+
+    # grobid gets no additional POST from the above empty queries
+    assert len(responses.calls) == 1
+
+
+@responses.activate
+def test_grobid_refs_s104(grobid_client):
+
+    # test another file
+    with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f:
+        crossref_work = json.loads(f.read())
+
+    with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f:
+        responses.add(
+            responses.POST,
+            "http://dummy-grobid/api/processCitationList",
+            status=200,
+            body=f.read(),
+            content_type="application/xml; charset=UTF-8",
+        )
+
+    refs_row = grobid_client.crossref_refs(crossref_work)
+
+    # GROBID gets one more POST
+    assert len(responses.calls) == 1
+
+    assert refs_row["source"] == "crossref"
+    assert refs_row["source_id"] == "10.1017/s1047951103000064"
+    assert refs_row["source_ts"] == "2021-06-10T05:35:02Z"
+    refs = refs_row["refs_json"]
+    assert len(refs) == 24
+    assert set([r["id"] for r in refs]) == set(
+        [
+            "S1047951103000064_ref025",
+            "S1047951103000064_ref013",
+            "S1047951103000064_ref012",
+            "S1047951103000064_ref041",
+            "S1047951103000064_ref002",
+            "S1047951103000064_ref043",
+            "S1047951103000064_ref037",
+            "S1047951103000064_ref035",
+            "S1047951103000064_ref003",
+            "S1047951103000064_ref005",
+            "S1047951103000064_ref017",
+            "S1047951103000064_ref016",
+            "S1047951103000064_ref001",
+            "S1047951103000064_ref039",
+            "S1047951103000064_ref032",
+            "S1047951103000064_ref014",
+            "S1047951103000064_ref008",
+            "S1047951103000064_ref038",
+            "S1047951103000064_ref018",
+            "S1047951103000064_ref027",
+            "S1047951103000064_ref034",
+            "S1047951103000064_ref044",
+            "S1047951103000064_ref006",
+            "S1047951103000064_ref030",
+        ]
+    )
```
