From 594678f6c2705b8b88c6e23d68981a851df0aa5e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 13 May 2019 16:41:45 -0700
Subject: update grobid2json to include given_name/surname

---
 python/grobid2json.py         | 10 +++++++---
 python/tests/files/small.json |  6 +++---
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'python')

diff --git a/python/grobid2json.py b/python/grobid2json.py
index ca460f8..d438d48 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -31,9 +31,13 @@ import xml.etree.ElementTree as ET
 ns = "http://www.tei-c.org/ns/1.0"
 
 def all_authors(elem):
-    names = [' '.join([e.findtext('./{%s}forename' % ns) or '', e.findtext('./{%s}surname' % ns) or '']).strip()
-             for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns))]
-    return [dict(name=n) for n in names]
+    names = []
+    for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns)):
+        given_name = e.findtext('./{%s}forename' % ns) or None
+        surname = e.findtext('./{%s}surname' % ns) or None
+        full_name = '{} {}'.format(given_name or '', surname or '').strip()
+        names.append(dict(name=full_name, given_name=given_name, surname=surname))
+    return names
 
 
 def journal_info(elem):
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index 208fb49..49a5671 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -1,8 +1,8 @@
 {
   "title": "Dummy Example File",
   "authors": [
-    {"name": "Brewster Kahle"},
-    {"name": "J Doe"}
+    {"name": "Brewster Kahle", "given_name": "Brewster", "surname": "Kahle"},
+    {"name": "J Doe", "given_name": "J", "surname": "Doe"}
   ],
   "journal": {
     "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
@@ -15,7 +15,7 @@
   "date": "2000",
   "doi": null,
   "citations": [
-    { "authors": [{"name": "A Seaperson"}],
+    { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}],
       "date": "2001",
       "id": "b0",
       "index": 0,
-- 
cgit v1.2.3