about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-05-13 16:41:45 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-05-13 16:41:45 -0700
commit 594678f6c2705b8b88c6e23d68981a851df0aa5e (patch)
tree 955934c9dc84b1fcb804c4216bc47390865bfafe /python
parent 5b0aad1d8d100832c4a73dd006d8196aa995b4f0 (diff)
download sandcrawler-594678f6c2705b8b88c6e23d68981a851df0aa5e.tar.gz
sandcrawler-594678f6c2705b8b88c6e23d68981a851df0aa5e.zip
update grobid2json to include given_name/surname
Diffstat (limited to 'python')
-rwxr-xr-x python/grobid2json.py 10
-rw-r--r-- python/tests/files/small.json 6
2 files changed, 10 insertions, 6 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index ca460f8..d438d48 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -31,9 +31,13 @@ import xml.etree.ElementTree as ET
ns = "http://www.tei-c.org/ns/1.0"
def all_authors(elem):
- names = [' '.join([e.findtext('./{%s}forename' % ns) or '', e.findtext('./{%s}surname' % ns) or '']).strip()
- for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns))]
- return [dict(name=n) for n in names]
+ names = []
+ for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns)):
+ given_name = e.findtext('./{%s}forename' % ns) or None
+ surname = e.findtext('./{%s}surname' % ns) or None
+ full_name = '{} {}'.format(given_name or '', surname or '').strip()
+ names.append(dict(name=full_name, given_name=given_name, surname=surname))
+ return names
def journal_info(elem):
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index 208fb49..49a5671 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -1,8 +1,8 @@
{
"title": "Dummy Example File",
"authors": [
- {"name": "Brewster Kahle"},
- {"name": "J Doe"}
+ {"name": "Brewster Kahle", "given_name": "Brewster", "surname": "Kahle"},
+ {"name": "J Doe", "given_name": "J", "surname": "Doe"}
],
"journal": {
"name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
@@ -15,7 +15,7 @@
"date": "2000",
"doi": null,
"citations": [
- { "authors": [{"name": "A Seaperson"}],
+ { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}],
"date": "2001",
"id": "b0",
"index": 0,