aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-29 12:16:02 -0700
committerBryan Newbold <bnewbold@archive.org>2021-11-04 17:19:52 -0700
commitd3fa74e941aa11f79cee2d0adcb5cbc70884ef48 (patch)
treed9e50a33686ce678016f8f573d3d3995ed966a5f /python
parent13d2dc55d5be4d9579a4a242ce251cdbc82730aa (diff)
downloadsandcrawler-d3fa74e941aa11f79cee2d0adcb5cbc70884ef48.tar.gz
sandcrawler-d3fa74e941aa11f79cee2d0adcb5cbc70884ef48.zip
initial crossref-refs via GROBID helper routine
Diffstat (limited to 'python')
-rwxr-xr-xpython/grobid_tool.py21
-rw-r--r--python/sandcrawler/grobid.py125
-rw-r--r--python/tests/files/crossref_api_work_978-3-030-64953-1_4.json1
-rw-r--r--python/tests/files/crossref_api_work_s1047951103000064.json1
-rw-r--r--python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml66
-rw-r--r--python/tests/files/grobid_refs_s1047951103000064.tei.xml499
-rw-r--r--python/tests/test_grobid.py132
7 files changed, 839 insertions, 6 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index f99a78b..782bc13 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -15,6 +15,7 @@ import sys
from grobid_tei_xml import parse_document_xml
from sandcrawler import *
+from sandcrawler.grobid import CrossrefRefsWorker
def run_extract_json(args):
@@ -84,6 +85,13 @@ def run_transform(args):
print(json.dumps(out))
+def run_parse_crossref_refs(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ worker = CrossrefRefsWorker(grobid_client, sink=args.sink)
+ pusher = JsonLinePusher(worker, args.json_file)
+ pusher.run()
+
+
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
@@ -101,7 +109,7 @@ def main():
"-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
)
parser.add_argument(
- "--grobid-host", default="http://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
)
subparsers = parser.add_subparsers()
@@ -133,6 +141,17 @@ def main():
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
+ sub_parse_crossref_refs = subparsers.add_parser(
+ "parse-crossref-refs",
+ help="reads Crossref metadata records, parses any unstructured refs with GROBID",
+ )
+ sub_parse_crossref_refs.set_defaults(func=run_parse_crossref_refs)
+ sub_parse_crossref_refs.add_argument(
+ "json_file",
+ help="JSON-L file to process (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
sub_transform = subparsers.add_parser("transform")
sub_transform.set_defaults(func=run_transform)
sub_transform.add_argument(
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 37c4ea1..cdd2093 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
+import sys
import requests
-from grobid_tei_xml import parse_document_xml
+from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml
from .ia import WaybackClient
from .misc import gen_file_metadata
@@ -9,7 +10,7 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class GrobidClient(object):
- def __init__(self, host_url: str = "http://grobid.qa.fatcat.wiki", **kwargs):
+ def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
@@ -67,6 +68,29 @@ class GrobidClient(object):
info["error_msg"] = grobid_response.text[:10000]
return info
+ def process_citation_list(self, unstructured_list: List[str]) -> List[GrobidBiblio]:
+ if not unstructured_list:
+ return []
+ if len(unstructured_list) > 5000:
+ raise ValueError("more than 5,000 references in a batch is just too much")
+
+ try:
+ grobid_response = requests.post(
+ self.host_url + "/api/processCitationList",
+ data={
+ "citations": unstructured_list,
+ "consolidateCitations": 0,
+ "includeRawCitations": 1,
+ },
+ timeout=30.0,
+ )
+ except requests.Timeout as te:
+ # TODO: handle somehow?
+ raise te
+
+ grobid_response.raise_for_status()
+ return parse_citation_list_xml(grobid_response.text)
+
def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
if result["status"] != "success":
return None
@@ -90,6 +114,87 @@ class GrobidClient(object):
meta[k] = tei_json[k]
return meta
+ def should_parse_crossref_ref(self, ref: Dict[str, Any]) -> bool:
+ """
+ Helper function to decide whether to run GROBID parsing on an crossref
+ reference.
+
+ For example, if there is already a DOI in the ref metadata, could skip.
+ Or, if there is sufficient structured metadata, or only depending on
+ the source of the DOI linkage.
+ """
+ if ref.get("DOI"):
+ return False
+ if len(ref.get("unstructured", "")) <= 6:
+ return False
+
+ # TODO: what other combinations are enough to skip parsing?
+ if (
+ ref.get("year")
+ and ref.get("author")
+ and (ref.get("article-title") or ref.get("series-title") or ref.get("volume-title"))
+ ):
+ return False
+ elif ref.get("year") and ref.get("author") and ref.get("journal-title"):
+ return False
+ elif ref.get("journal-title") and ref.get("volume") and ref.get("first-page"):
+ return False
+
+ return True
+
+ def crossref_refs(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Given a complete Crossref metadata record, inspects the
+
+ The returned dict is in the schema of the `grobid_refs` database table,
+ in dict form:
+
+ source: 'crossref'
+ source_id: doi, as lower-case string
+ source_ts: Crossref indexed timestamp, if available
+ ('updated' is not set)
+ refs_json: list of dicts
+ """
+
+ # remove API wrapper around record, if necessary
+ if "message" in record and "DOI" not in record:
+ record = record["message"]
+
+ ret = dict(
+ source="crossref",
+ source_id=record["DOI"].lower(),
+ source_ts=record["indexed"]["date-time"],
+ refs_json=[],
+ )
+ all_refs = record.get("reference", [])
+ unstructured_refs = []
+ for r in all_refs:
+ if not r.get("unstructured"):
+ continue
+ if not self.should_parse_crossref_ref(r):
+ continue
+ unstructured_refs.append(r)
+ if not unstructured_refs:
+ return ret
+
+ # some reasonable cap on length of refs per work
+ if len(unstructured_refs) > 2000:
+ print(
+ f"truncatin very large reference list for doi:{record['DOI']} len:{len(unstructured_refs)}",
+ file=sys.stderr,
+ )
+ unstructured_refs = unstructured_refs[:2000]
+
+ refs = self.process_citation_list([r["unstructured"] for r in unstructured_refs])
+ assert len(refs) == len(unstructured_refs)
+ refs_json = []
+ for i in range(len(refs)):
+ refs[i].id = unstructured_refs[i].get("key")
+ assert refs[i].unstructured == unstructured_refs[i]["unstructured"]
+ refs_json.append(refs[i].to_dict())
+ ret["refs_json"] = refs_json
+ return ret
+
class GrobidWorker(SandcrawlerFetchWorker):
def __init__(
@@ -97,7 +202,7 @@ class GrobidWorker(SandcrawlerFetchWorker):
grobid_client: GrobidClient,
wayback_client: Optional[WaybackClient] = None,
sink: Optional[SandcrawlerWorker] = None,
- **kwargs
+ **kwargs,
):
super().__init__(wayback_client=wayback_client)
self.grobid_client = grobid_client
@@ -129,6 +234,18 @@ class GrobidWorker(SandcrawlerFetchWorker):
return result
+class CrossrefRefsWorker(SandcrawlerWorker):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.grobid_client = grobid_client
+ self.sink = sink
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ return self.grobid_client.crossref_refs(record)
+
+
class GrobidBlobWorker(SandcrawlerWorker):
"""
This is sort of like GrobidWorker, except it receives blobs directly,
diff --git a/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
new file mode 100644
index 0000000..54d07db
--- /dev/null
+++ b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T22:08:45Z","timestamp":1620684525878},"publisher-location":"Cham","reference-count":28,"publisher":"Springer International Publishing","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-64953-1_4","type":"book-chapter","created":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T02:57:20Z","timestamp":1610593040000},"page":"53-71","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mathematical Knowledge and Mathematical Objects"],"prefix":"10.1007","author":[{"given":"Lars-G\u00f6ran","family":"Johansson","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,1,14]]},"reference":[{"key":"4_CR12","doi-asserted-by":"publisher","volume-title":"Deflating existential consequence: A case for nominalism","author":"J Azzouni","year":"2004","unstructured":"Azzouni, J. (2004). Deflating existential consequence: A case for nominalism. New York: Oxford University Press.","DOI":"10.1093\/0195159888.001.0001"},{"key":"4_CR23","doi-asserted-by":"publisher","volume-title":"Foundations of constructive mathematics","author":"M Beeson","year":"1985","unstructured":"Beeson, M. (1985). Foundations of constructive mathematics. Berlin\/Heidelberg: Springer.","DOI":"10.1007\/978-3-642-68952-9"},{"issue":"2","key":"4_CR27","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1093\/philmat\/11.2.176","volume":"11","author":"H Billinge","year":"2003","unstructured":"Billinge, H. (2003). Did bishop have a philosophy of mathematics? Philosophica Mathematica, 11(2), 176\u2013194.","journal-title":"Philosophica Mathematica"},{"key":"4_CR29","doi-asserted-by":"publisher","volume-title":"Constructive analysis","author":"E Bishop","year":"1985","unstructured":"Bishop, E., & Bridges, D. S. (1985). Constructive analysis. Berlin: Springer.","DOI":"10.1007\/978-3-642-61667-9"},{"key":"4_CR37","series-title":"In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.)","volume-title":"Nominalism in the philosophy of mathematics","author":"O Bueno","year":"2014","unstructured":"Bueno, O. (2014). Nominalism in the philosophy of mathematics. In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.). Metaphysics Research Lab, Stanford University."},{"key":"4_CR38","volume-title":"Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen","author":"G Cantor","year":"1883","unstructured":"Cantor, G. (1883). Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen. Leipzig: Teubner."},{"key":"4_CR60","volume-title":"The seas of language","author":"M Dummett","year":"1993","unstructured":"Dummett, M. (1993). The seas of language. Oxford: Clarendon Press."},{"key":"4_CR73","volume-title":"In the light of logic","author":"S Feferman","year":"1998","unstructured":"Feferman, S. (1998). In the light of logic. New York: Oxford University Press."},{"key":"4_CR74","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1093\/0195148770.003.0019","volume-title":"The Oxford handbook of philosophy of mathematics and logic","author":"S Feferman","year":"2005","unstructured":"Feferman, S. (2005). Predicativity. In S. Shapiro (Ed.), The Oxford handbook of philosophy of mathematics and logic (pp. 590\u2013624). New York\/Oxford: Oxford University Press."},{"key":"4_CR77","volume-title":"Science without numbers: A defence of nominalism","author":"H H Field","year":"1980","unstructured":"Field, H. H. (1980). Science without numbers: A defence of nominalism. Oxford: Blackwell."},{"key":"4_CR88","volume-title":"Werke, volume 8","author":"C F Gauss","year":"2011","unstructured":"Gauss, C. F. (2011). Werke, volume 8. Cambridge: Cambridge University Press."},{"key":"4_CR93","unstructured":"Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155\u2013172). Bobs-Merrill company."},{"key":"4_CR103","volume-title":"Mathematics without numbers: Towards a modal-structural interpretation","author":"G Hellman","year":"1989","unstructured":"Hellman, G. (1989). Mathematics without numbers: Towards a modal-structural interpretation. Oxford: Clarendon Press."},{"key":"4_CR126","first-page":"201","volume-title":"Bertrand Russell. Philosopher of the century","author":"G Kreisel","year":"1967","unstructured":"Kreisel, G. (1967). Mathematical logic: What has it done for the philosophy of mathematics? In R. Shoenman (Ed.), Bertrand Russell. Philosopher of the century (pp. 201\u2013272). London: George Allen & Unwin."},{"key":"4_CR135","doi-asserted-by":"crossref","unstructured":"Lear, J. (1980). Aristotelian infinity. Proceedings of the Aristotelian Society, New Series, 80, 187\u2013210.","DOI":"10.1093\/aristotelian\/80.1.187"},{"key":"4_CR175","doi-asserted-by":"publisher","first-page":"63","DOI":"10.12775\/LLP.1998.004","volume":"6","author":"F Pataut","year":"1998","unstructured":"Pataut, F. (1998). Incompleteness, constructivism and truth. Logic and Logical Philosophy, 6, 63\u201376.","journal-title":"Logic and Logical Philosophy"},{"key":"4_CR180","first-page":"294","volume":"14","author":"H Poincar\u00e9","year":"1906","unstructured":"Poincar\u00e9, H. (1906). Les math\u00e9matiques et la logique. Revue de m\u00e9taphysique et de morale, 14, 294\u2013317.","journal-title":"Revue de m\u00e9taphysique et de morale"},{"key":"4_CR190","volume-title":"Word and object","author":"W V O Quine","year":"1960","unstructured":"Quine, W. V. O. (1960). Word and object. Cambridge, MA: MIT Press."},{"key":"4_CR193","unstructured":"Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133\u2013136). Cambridge, MA: Harvard University Press."},{"key":"4_CR197","first-page":"31","volume-title":"Theories and things","author":"W V O Quine","year":"1981","unstructured":"Quine, W. V. O. (1981c). What price bivalence? In Theories and things (pp. 31\u201337). Cambridge, MA: The Belknap Press of Harvard University Press."},{"issue":"1","key":"4_CR198","doi-asserted-by":"publisher","first-page":"5","DOI":"10.2307\/2026889","volume":"89","author":"WV O Quine","year":"1992","unstructured":"Quine, W.V. O. (1992). Structure and nature. The Journal of Philosophy, 89(1), 5\u20139.","journal-title":"The Journal of Philosophy"},{"key":"4_CR199","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1080\/014453401625669","volume":"25","author":"P Raatikainen","year":"2004","unstructured":"Raatikainen, P. (2004). Conceptions of truth in intuitionism. History and Philosophy of Logic, 25, 131\u2013145.","journal-title":"History and Philosophy of Logic"},{"key":"4_CR210","unstructured":"Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29\u201353."},{"key":"4_CR212","volume-title":"Introduction to mathematical philosophy","author":"B Russell","year":"1919","unstructured":"Russell, B. (1919). Introduction to mathematical philosophy. London: Routledge."},{"key":"4_CR222","doi-asserted-by":"crossref","unstructured":"Schwarz, J. T. (2006(1966)). The pernicious influence of mathematics on science. In R. Hersch (Ed.), 18 unconventional essays on the nature of mathematics (Chap. 13, pp. 231\u2013235). New York: Springer.","DOI":"10.1007\/0-387-29831-2_13"},{"key":"4_CR233","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1007\/BF00247187","volume":"12","author":"G Sundholm","year":"1983","unstructured":"Sundholm, G. (1983). Constructions, proofs and the meaning of logical constants. Journal of Philosophical Logic, 12, 151\u2013172.","journal-title":"Journal of Philosophical Logic"},{"issue":"2","key":"4_CR235","doi-asserted-by":"publisher","first-page":"101","DOI":"10.1007\/s10701-007-9186-9","volume":"38","author":"M Tegmark","year":"2008","unstructured":"Tegmark, M. (2008). The mathematical universe. Foundations of Physics, 38(2), 101\u2013150.","journal-title":"Foundations of Physics"},{"key":"4_CR262","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1016\/0010-0277(90)90003-3","volume":"36","author":"K Wynn","year":"1990","unstructured":"Wynn, K. (1990). Children\u2019s understanding of counting. Cognition, 36, 155\u2013193.","journal-title":"Cognition"}],"container-title":["Synthese Library","Empiricism and Philosophy of Physics"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-64953-1_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T03:00:39Z","timestamp":1610593239000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":28,"URL":"http:\/\/dx.doi.org\/10.1007\/978-3-030-64953-1_4","relation":{},"ISSN":["0166-6991","2542-8292"],"issn-type":[{"value":"0166-6991","type":"print"},{"value":"2542-8292","type":"electronic"}],"published":{"date-parts":[[2021]]},"assertion":[{"value":"14 January 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}} \ No newline at end of file
diff --git a/python/tests/files/crossref_api_work_s1047951103000064.json b/python/tests/files/crossref_api_work_s1047951103000064.json
new file mode 100644
index 0000000..dfb795d
--- /dev/null
+++ b/python/tests/files/crossref_api_work_s1047951103000064.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,6,10]],"date-time":"2021-06-10T05:35:02Z","timestamp":1623303302043},"reference-count":46,"publisher":"Cambridge University Press (CUP)","issue":"1","license":[{"start":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T00:00:00Z","timestamp":1113782400000},"content-version":"unspecified","delay-in-days":807,"URL":"https:\/\/www.cambridge.org\/core\/terms"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Cardiol Young"],"published-print":{"date-parts":[[2003,2]]},"abstract":"<jats:p>We designed a multi-hospital prospective study of children less than 12 years to determine the comparative clinical profile, severity of carditis, and outcome on follow up of patients suffering an initial and recurrent episodes of acute rheumatic fever. The study extended over a period of 3 years, with diagnosis based on the Jones criteria. We included 161 children in the study, 57 having only one episode and 104 with recurrent episodes. Those seen in the first episode were differentiated from those with recurrent episodes on the basis of the history. The severity of carditis was graded by clinical and echocardiographic means. In those suffering their first episode, carditis was significantly less frequent (61.4%) compared to those having recurrent episodes (96.2%). Arthritis was more marked in the first episode (61.4%) compared to recurrent episodes (36.5%). Chorea was also significantly higher in the first episode (15.8%) compared to recurrent episodes (3.8%). Sub-cutaneous nodules were more-or-less the same in those suffering the first (7%) as opposed to recurrent episodes (5.8%), but Erythema marginatum was more marked during the first episode (3.5%), being rare in recurrent episodes at 0.9%. Fever was recorded in approximately the same numbers in first (45.6%) and recurrent episodes (48.1%). Arthralgia, in contrast, was less frequent in first (21.1%) compared to recurrent episodes (32.7%). A history of sore throat was significantly increased amongst those suffering the first episode (54.4%) compared to recurrent episodes (21.2%). When we compared the severity of carditis in the first versus recurrent episodes, at the start of study mild carditis was found in 29.8% versus 10.6%, moderate carditis in 26.3% versus 53.8%, and severe carditis in 5.3% versus 31.8% of cases, respectively. At the end of study, 30.3% of patients suffering their first episode were completely cured of carditis, and all others showed significant improvement compared to those with recurrent episodes, where only 6.8% were cured, little improvement or deterioration being noted in the remainder of the patients. We conclude that the clinical profile of acute rheumatic fever, especially that of carditis, is milder in those suffering their first attack compared to those with recurrent episodes.<\/jats:p>","DOI":"10.1017\/s1047951103000064","type":"journal-article","created":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T11:49:54Z","timestamp":1113824994000},"page":"28-35","source":"Crossref","is-referenced-by-count":11,"title":["Clinical profile of acute rheumatic fever in Pakistan"],"prefix":"10.1017","volume":"13","author":[{"given":"Hasina Suleman","family":"Chagani","sequence":"first","affiliation":[]},{"given":"Kalimuddin","family":"Aziz","sequence":"additional","affiliation":[]}],"member":"56","published-online":{"date-parts":[[2005,4,18]]},"reference":[{"key":"S1047951103000064_ref010","doi-asserted-by":"crossref","unstructured":"Alan L , Bisno . Group A streptococcal infection and acute rheumatic fever. N Engl J Med 1991; 325: 783\u2013793.","DOI":"10.1056\/NEJM199109123251106"},{"key":"S1047951103000064_ref036","doi-asserted-by":"crossref","unstructured":"Abbasi AS , Hashmi JA , Robinson RD , Suraya S , Syed SA . Prevalence of heart disease in school children of Karachi. Am J Cardiol 1966; 18: 544\u2013547.","DOI":"10.1016\/0002-9149(66)90008-7"},{"key":"S1047951103000064_ref025","unstructured":"Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285\u2013294."},{"key":"S1047951103000064_ref013","unstructured":"Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185\u2013192."},{"key":"S1047951103000064_ref007","doi-asserted-by":"crossref","unstructured":"Okoroma EO , Ihenacho HNC , Anyanwu CH . Rheumatic fever in Nigerian children. A prospective study of 66 patients. Am J Dis Child 1981; 35: 236\u2013238.","DOI":"10.1001\/archpedi.1981.02130270028010"},{"key":"S1047951103000064_ref031","doi-asserted-by":"crossref","unstructured":"Gordis L . Effectiveness of comprehensive care program in preventing rheumatic fever. N Engl J Med 1973; 289: 331\u2013335.","DOI":"10.1056\/NEJM197308162890701"},{"key":"S1047951103000064_ref012","unstructured":"Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21\u201324."},{"key":"S1047951103000064_ref026","doi-asserted-by":"crossref","unstructured":"Reale A , Colella C , Bruno AM . Mitral stenosis in childhood: Clinical and therapeutic aspects. Am Heart J 1963; 66: 15.","DOI":"10.1016\/0002-8703(63)90064-4"},{"key":"S1047951103000064_ref046","doi-asserted-by":"crossref","unstructured":"Aziz KU , Cheema L , Memon AD . Long-term observations of rheumatic carditis. Cardiol Young 1992; 2: 254\u2013260.","DOI":"10.1017\/S1047951100001001"},{"key":"S1047951103000064_ref041","unstructured":"Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300\u2013305."},{"key":"S1047951103000064_ref002","unstructured":"Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889."},{"key":"S1047951103000064_ref043","unstructured":"Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336\u2013345."},{"key":"S1047951103000064_ref037","unstructured":"Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. Pakistan Heart Journal 1981; 14: 2\u20136."},{"key":"S1047951103000064_ref029","doi-asserted-by":"crossref","unstructured":"Hassel TA , Stuart KL . Rheumatic fever prophylaxis. A three-year study. Br Med J 1972; 2: 39\u201340.","DOI":"10.1136\/bmj.2.5909.39"},{"key":"S1047951103000064_ref024","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Berry AM , Duggal S , Hooja V , Ghosh S . Sequel of initial attack of acute rheumatic fever. A prospective 5-year follow-up study. Circulation 1982; 65: 375\u2013379.","DOI":"10.1161\/01.CIR.65.2.375"},{"key":"S1047951103000064_ref022","doi-asserted-by":"crossref","unstructured":"Brownell KD , Rese FB . Acute rheumatic fever in children. Incidence in Borough of New York city. JAMA. 1973; 224: 1593\u20131597.","DOI":"10.1001\/jama.1973.03220260015004"},{"key":"S1047951103000064_ref035","unstructured":"Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071\u20131081."},{"key":"S1047951103000064_ref003","unstructured":"El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980's. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183\u2013203."},{"key":"S1047951103000064_ref045","doi-asserted-by":"crossref","unstructured":"Markowitz M . Eradication of rheumatic fever. An unfulfilled hope. Circulation 1970; 41: 1077\u20131084.","DOI":"10.1161\/01.CIR.41.6.1077"},{"key":"S1047951103000064_ref005","unstructured":"Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886."},{"key":"S1047951103000064_ref017","unstructured":"Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483\u2013494."},{"key":"S1047951103000064_ref028","doi-asserted-by":"crossref","unstructured":"Ehmke DA , Stehbens JA , Young L . Two studies of compliance with daily prophylaxis in rheumatic fever patients in Iowa. Am J Public Health 1980; 70: 1189\u20131193.","DOI":"10.2105\/AJPH.70.11.1189"},{"key":"S1047951103000064_ref021","doi-asserted-by":"crossref","unstructured":"Ward C . The reappraisal of the clinical features in acute and chronic rheumatic heart disease. Etiology implications. Am Heart J 1979; 98: 298\u2013306.","DOI":"10.1016\/0002-8703(79)90040-1"},{"key":"S1047951103000064_ref009","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Thaper MK , Ahmed SA , Hooja V , Tewari P . The initial attack of acute rheumatic fever during childhood in North India. A prospective study of the clinical profile. Circulation 1974; 49: 7\u201312.","DOI":"10.1161\/01.CIR.49.1.7"},{"key":"S1047951103000064_ref016","unstructured":"Strasser T . Rheumatic fever and rheumatic heart disease in the 1970's. WHO Chron. 1978; 32: 18\u201325."},{"key":"S1047951103000064_ref019","doi-asserted-by":"crossref","unstructured":"Bland EF , Jones TD . Rheumatic fever and rheumatic heart disease. A twenty-year report on 1000 patients followed since childhood. Circulation 1951; 4: 836\u2013843.","DOI":"10.1161\/01.CIR.4.6.836"},{"key":"S1047951103000064_ref042","doi-asserted-by":"crossref","unstructured":"Wood HF , McCarty M . Laboratory aids in the diagnosis of rheumatic fever and evaluation of disease activity. Am J Med 1954; 17: 768\u2013774.","DOI":"10.1016\/0002-9343(54)90221-1"},{"key":"S1047951103000064_ref020","doi-asserted-by":"crossref","unstructured":"Baldwin JS , Kerr JM , Kuttner AG , Doyle EF . Observation in rheumatic nodules over 30 years period. J Pediatr 1960; 56: 465\u2013470.","DOI":"10.1016\/S0022-3476(60)80358-7"},{"key":"S1047951103000064_ref004","doi-asserted-by":"crossref","unstructured":"Majeed HA , Khan N , Dabbagh M , Naidi K . Acute rheumatic fever during childhood in Kuwait: The mild nature of initial attack. Ann Trop Paediatr 1981; 1: 13\u201320.","DOI":"10.1080\/02724936.1981.11748053"},{"key":"S1047951103000064_ref001","unstructured":"Brittanica: Book of year 1991. Chicago, 1991."},{"key":"S1047951103000064_ref039","unstructured":"Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990."},{"key":"S1047951103000064_ref040","doi-asserted-by":"crossref","unstructured":"Taranta A , Markowitz M . Rheumatic fever. A guide to its recognition, prevention and cure, with special reference to developing countries. M.T.P. Press Ltd., Boston, 1981.","DOI":"10.1007\/978-94-015-7171-5"},{"key":"S1047951103000064_ref032","unstructured":"Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1\u201315."},{"key":"S1047951103000064_ref014","unstructured":"Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2\u20139."},{"key":"S1047951103000064_ref011","doi-asserted-by":"crossref","unstructured":"Gharib R . Acute rheumatic fever in Shiraz, Iran. It's prevalence and characteristics in two socio-economic groups. Am J Dis Child 1969: 118: 694\u2013699.","DOI":"10.1001\/archpedi.1969.02100040696005"},{"key":"S1047951103000064_ref008","unstructured":"Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543\u2013550."},{"key":"S1047951103000064_ref033","doi-asserted-by":"crossref","unstructured":"Spagnuolo M , Pasternack B , Taranta A . Risk of rheumatic fever recurrences after streptococcal infections. Prospective study of clinical and social factors. N Engl J Med 1971; 285: 641\u2013647.","DOI":"10.1056\/NEJM197109162851201"},{"key":"S1047951103000064_ref038","unstructured":"Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539\u2013549."},{"key":"S1047951103000064_ref023","doi-asserted-by":"crossref","unstructured":"Feinstein AR , Spagnuolo M . The clinical patterns of acute rheumatic fever; A reappraisal. Medicine 1962; 41: 279\u2013305.","DOI":"10.1097\/00005792-196212000-00001"},{"key":"S1047951103000064_ref018","unstructured":"Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501\u20131515."},{"key":"S1047951103000064_ref027","unstructured":"Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8\u201314."},{"key":"S1047951103000064_ref034","unstructured":"Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14\u201316."},{"key":"S1047951103000064_ref044","unstructured":"Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389\u2013395."},{"key":"S1047951103000064_ref006","unstructured":"Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. Indian pediatr 1983; 20: 849\u2013853."},{"key":"S1047951103000064_ref030","unstructured":"Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599\u2013603."},{"key":"S1047951103000064_ref015","doi-asserted-by":"crossref","unstructured":"Robinson RD , Sultana S , Abbasi AS et al. Acute rheumatic fever in Karachi, Pakistan. Am J Cardiol 1966; 8: 548\u2013551.","DOI":"10.1016\/0002-9149(66)90009-9"}],"container-title":["Cardiology in the Young"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.cambridge.org\/core\/services\/aop-cambridge-core\/content\/view\/S1047951103000064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,4,6]],"date-time":"2020-04-06T22:32:57Z","timestamp":1586212377000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2003,2]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2003,2]]}},"alternative-id":["S1047951103000064"],"URL":"http:\/\/dx.doi.org\/10.1017\/s1047951103000064","relation":{},"ISSN":["1047-9511","1467-1107"],"issn-type":[{"value":"1047-9511","type":"print"},{"value":"1467-1107","type":"electronic"}],"subject":["Cardiology and Cardiovascular Medicine","General Medicine","Pediatrics, Perinatology, and Child Health"],"published":{"date-parts":[[2003,2]]}}} \ No newline at end of file
diff --git a/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
new file mode 100644
index 0000000..b47f85b
--- /dev/null
+++ b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
@@ -0,0 +1,66 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">A world of individuals</title>
+ <author>
+ <persName><forename type="first">N</forename><surname>Goodman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Problems and projects</title>
+ <imprint>
+ <date type="published" when="1972">1972</date>
+ <biblScope unit="page" from="155" to="172" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Implicit definition sustained</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">V O</forename><surname>Quine</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">The ways of paradox and other essays</title>
+ <meeting><address><addrLine>Cambridge, MA</addrLine></address></meeting>
+ <imprint>
+ <publisher>Harvard University Press</publisher>
+ <date type="published" when="1976">1976b</date>
+ <biblScope unit="page" from="133" to="136" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133–136). Cambridge, MA: Harvard University Press.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <monogr>
+ <title level="m" type="main">On some difficulties in the theory of transfinite numbers and order types</title>
+ <author>
+ <persName><forename type="first">B</forename><surname>Russell</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1906">1906</date>
+ <publisher>Proceedings of London Mathematical Society</publisher>
+ <biblScope unit="volume">4</biblScope>
+ <biblScope unit="page" from="29" to="53" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29–53.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/files/grobid_refs_s1047951103000064.tei.xml b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
new file mode 100644
index 0000000..e0eae8a
--- /dev/null
+++ b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
@@ -0,0 +1,499 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">The community control of rheumatic fever and rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">N</forename><surname>Dondong</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Elkholy</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="285" to="294" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note>Report of a WHO international co-operative project</note>
+ <note type="raw_reference">Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285–294.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Rehman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="page" from="185" to="192" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185–192.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever in Sudanese children</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Ismail</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>El Amin</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Arab J Med</title>
+ <imprint>
+ <biblScope unit="volume">2</biblScope>
+ <biblScope unit="page" from="21" to="24" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21–24.</note>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+ <analytic>
+ <title level="a" type="main">Incidence of heart disease in children at NICVD</title>
+ <author>
+ <persName><forename type="first">K</forename><forename type="middle">U</forename><surname>Aziz</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="300" to="305" />
+ <date type="published" when="1984">1984</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300–305.</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+ <monogr>
+ <title level="m" type="main">The various manifestations of rheumatic fever as exemplified in childhood and early life</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">B</forename><surname>Cheadle</surname></persName>
+ </author>
+ <imprint>
+ <publisher>Smith and Co</publisher>
+ <biblScope unit="page">1889</biblScope>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889.</note>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-I. A major public health problem</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="336" to="345" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336–345.</note>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+ <analytic>
+ <title level="a" type="main">Prevalence of heart disease in school children of Islamabad</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Malik</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Jaffrey</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Ahmed</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">Zubeda</forename><surname>Khanum</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pakistan Heart Journal</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <biblScope unit="page" from="2" to="6" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. Pakistan Heart Journal 1981; 14: 2–6.</note>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease and overcrowding</title>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Watkins</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">P</forename><surname>Quinn</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Am J Public Health</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="page" from="1071" to="1081" />
+ <date type="published" when="1948">1948</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071–1081.</note>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+ <analytic>
+ <title level="a" type="main">The spectrum and specter of rheumatic fever in 1980&apos;s</title>
+ <author>
+ <persName><forename type="first">W</forename><surname>El-Sadr</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Taranta</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Clinical Immunology Up-Date. Edited by Franklin EC</title>
+ <imprint>
+ <biblScope unit="page" from="183" to="203" />
+ <date type="published" when="1979">1979</date>
+ <publisher>Elsevier</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980&apos;s. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183–203.</note>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+ <monogr>
+ <title level="m" type="main">Tonsillitis in adolescent, Bailliere Tendoll and Cox</title>
+ <author>
+ <persName><forename type="first">C</forename><surname>Haig-Brown</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1886">1886</date>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886.</note>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+ <analytic>
+ <title level="a" type="main">Studies on the transmission within the families of group A hemolytic streptococci</title>
+ <author>
+ <persName><forename type="first">L</forename><forename type="middle">I</forename><surname>Levine</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Chapman</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Guerra</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><surname>Cooper</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Krause</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">J Lab Clin Med</title>
+ <imprint>
+ <biblScope unit="volume">67</biblScope>
+ <biblScope unit="page" from="483" to="494" />
+ <date type="published" when="1966">1966</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483–494.</note>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+ <monogr>
+ <title level="m" type="main">Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="volume">32</biblScope>
+ <biblScope unit="page" from="18" to="25" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Strasser T . Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron. 1978; 32: 18–25.</note>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+ <monogr>
+ <title level="m" type="main">Brittanica: Book of year 1991</title>
+ <imprint>
+ <date type="published" when="1991">1991</date>
+ <publisher>Chicago</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Brittanica: Book of year 1991. Chicago, 1991.</note>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+ <monogr>
+ <title level="m" type="main">Pockets of rheumatic fever in developed world. XI World Congress of Cardiology</title>
+ <author>
+ <persName><forename type="first">R</forename><surname>Talbot</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1990">1990</date>
+ <pubPlace>Manila</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990.</note>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+ <analytic>
+ <title level="a" type="main">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease</title>
+ </analytic>
+ <monogr>
+ <title level="j">Circulation</title>
+ <imprint>
+ <biblScope unit="volume">41</biblScope>
+ <biblScope unit="page" from="A1" to="15" />
+ <date type="published" when="1970">1970</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1–15.</note>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever and rheumatic carditis in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Shafqat</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Ramzan</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="page" from="2" to="9" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2–9.</note>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in developing countries</title>
+ <author>
+ <persName><forename type="first">S</forename><surname>Padmavati</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">56</biblScope>
+ <biblScope unit="page" from="543" to="550" />
+ <date type="published" when="1979">1979</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543–550.</note>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+ <analytic>
+ <title level="a" type="main">Streptococcal infections in families. Factors altering individual susceptibility</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Meyer</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Haggerty</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pediatrics</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="539" to="549" />
+ <date type="published" when="1962">1962</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539–549.</note>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+ <analytic>
+ <title level="a" type="main">Collagen and connective tissue diseases</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Shanks</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Textbook of Pediatrics</title>
+ <editor>
+ <persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Forfar</surname></persName>
+ <persName><forename type="first">C</forename><forename type="middle">C</forename><surname>Arneil</surname></persName>
+ </editor>
+ <meeting><address><addrLine>Edinburgh</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="page" from="1501" to="1515" />
+ </imprint>
+ <respStmt>
+ <orgName>Churchill Livingstone</orgName>
+ </respStmt>
+ </monogr>
+ <note type="raw_reference">Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501–1515.</note>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+ <analytic>
+ <title level="a" type="main">Prophylaxis against recurrence of rheumatic fever</title>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Billoo</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">S</forename><surname>Abbasi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Sultana</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">L</forename><surname>Desa</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">1</biblScope>
+ <biblScope unit="page" from="8" to="14" />
+ <date type="published" when="1968">1968</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8–14.</note>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">5</biblScope>
+ <biblScope unit="page" from="14" to="16" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14–16.</note>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="389" to="395" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389–395.</note>
+</biblStruct>
+
+<biblStruct xml:id="b22">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever: Clinical profile of 339 cases with long term follow-up</title>
+ <author>
+ <persName><forename type="first">M</forename><forename type="middle">K</forename><surname>Joshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">P</forename><forename type="middle">W</forename><surname>Kandoth</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Barve</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Kamat</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Indian pediatr</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="849" to="853" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. Indian pediatr 1983; 20: 849–853.</note>
+</biblStruct>
+
+<biblStruct xml:id="b23">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in rural south Indian children</title>
+ <author>
+ <persName><forename type="first">G</forename><surname>Koshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Benjamin</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">G</forename><surname>Cherian</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="599" to="603" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599–603.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index c086d73..dce64bc 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,3 +1,4 @@
+import json
import struct
import pytest
@@ -41,7 +42,10 @@ def test_grobid_503(grobid_client):
@responses.activate
-def test_grobid_success(grobid_client):
+def test_grobid_success_iso_8859(grobid_client):
+ """
+ This might have been the old GROBID behavior, with default encoding? Can't really remember.
+ """
responses.add(
responses.POST,
@@ -64,6 +68,27 @@ def test_grobid_success(grobid_client):
@responses.activate
+def test_grobid_success(grobid_client):
+
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
+
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8")
+
+
+@responses.activate
def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
@@ -90,3 +115,108 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
assert pusher_counts["pushed"] == worker.counts["total"]
assert len(responses.calls) == worker.counts["total"]
+
+
+@responses.activate
+def test_grobid_refs_978(grobid_client):
+
+ with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f:
+ xml_bytes = f.read()
+ assert "\u2013".encode("utf-8") in xml_bytes
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=xml_bytes,
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 3
+ assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"])
+
+ # test case of no references
+ crossref_work["message"]["reference"] = []
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # test that 'message' works also
+ refs_row = grobid_client.crossref_refs(crossref_work["message"])
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # grobid gets no additional POST from the above empty queries
+ assert len(responses.calls) == 1
+
+
+@responses.activate
+def test_grobid_refs_s104(grobid_client):
+
+ # test another file
+ with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f:
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=f.read(),
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ # GROBID gets one more POST
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1017/s1047951103000064"
+ assert refs_row["source_ts"] == "2021-06-10T05:35:02Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 24
+ assert set([r["id"] for r in refs]) == set(
+ [
+ "S1047951103000064_ref025",
+ "S1047951103000064_ref013",
+ "S1047951103000064_ref012",
+ "S1047951103000064_ref041",
+ "S1047951103000064_ref002",
+ "S1047951103000064_ref043",
+ "S1047951103000064_ref037",
+ "S1047951103000064_ref035",
+ "S1047951103000064_ref003",
+ "S1047951103000064_ref005",
+ "S1047951103000064_ref017",
+ "S1047951103000064_ref016",
+ "S1047951103000064_ref001",
+ "S1047951103000064_ref039",
+ "S1047951103000064_ref032",
+ "S1047951103000064_ref014",
+ "S1047951103000064_ref008",
+ "S1047951103000064_ref038",
+ "S1047951103000064_ref018",
+ "S1047951103000064_ref027",
+ "S1047951103000064_ref034",
+ "S1047951103000064_ref044",
+ "S1047951103000064_ref006",
+ "S1047951103000064_ref030",
+ ]
+ )