aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests/test_grobid.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests/test_grobid.py')
-rw-r--r--python/tests/test_grobid.py132
1 files changed, 131 insertions, 1 deletions
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index c086d73..dce64bc 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,3 +1,4 @@
+import json
import struct
import pytest
@@ -41,7 +42,10 @@ def test_grobid_503(grobid_client):
@responses.activate
-def test_grobid_success(grobid_client):
+def test_grobid_success_iso_8859(grobid_client):
+ """
+ This might have been the old GROBID behavior, with default encoding? Can't really remember.
+ """
responses.add(
responses.POST,
@@ -64,6 +68,27 @@ def test_grobid_success(grobid_client):
@responses.activate
+def test_grobid_success(grobid_client):
+
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
+
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8")
+
+
+@responses.activate
def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
@@ -90,3 +115,108 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
assert pusher_counts["pushed"] == worker.counts["total"]
assert len(responses.calls) == worker.counts["total"]
+
+
+@responses.activate
+def test_grobid_refs_978(grobid_client):
+
+ with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f:
+ xml_bytes = f.read()
+ assert "\u2013".encode("utf-8") in xml_bytes
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=xml_bytes,
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 3
+ assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"])
+
+ # test case of no references
+ crossref_work["message"]["reference"] = []
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # test that 'message' works also
+ refs_row = grobid_client.crossref_refs(crossref_work["message"])
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # grobid gets no additional POST from the above empty queries
+ assert len(responses.calls) == 1
+
+
+@responses.activate
+def test_grobid_refs_s104(grobid_client):
+
+ # test another file
+ with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f:
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=f.read(),
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ # GROBID gets one more POST
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1017/s1047951103000064"
+ assert refs_row["source_ts"] == "2021-06-10T05:35:02Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 24
+ assert set([r["id"] for r in refs]) == set(
+ [
+ "S1047951103000064_ref025",
+ "S1047951103000064_ref013",
+ "S1047951103000064_ref012",
+ "S1047951103000064_ref041",
+ "S1047951103000064_ref002",
+ "S1047951103000064_ref043",
+ "S1047951103000064_ref037",
+ "S1047951103000064_ref035",
+ "S1047951103000064_ref003",
+ "S1047951103000064_ref005",
+ "S1047951103000064_ref017",
+ "S1047951103000064_ref016",
+ "S1047951103000064_ref001",
+ "S1047951103000064_ref039",
+ "S1047951103000064_ref032",
+ "S1047951103000064_ref014",
+ "S1047951103000064_ref008",
+ "S1047951103000064_ref038",
+ "S1047951103000064_ref018",
+ "S1047951103000064_ref027",
+ "S1047951103000064_ref034",
+ "S1047951103000064_ref044",
+ "S1047951103000064_ref006",
+ "S1047951103000064_ref030",
+ ]
+ )