diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-05-13 16:41:45 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-05-13 16:41:45 -0700 |
commit | 594678f6c2705b8b88c6e23d68981a851df0aa5e (patch) | |
tree | 955934c9dc84b1fcb804c4216bc47390865bfafe /python/grobid2json.py | |
parent | 5b0aad1d8d100832c4a73dd006d8196aa995b4f0 (diff) | |
download | sandcrawler-594678f6c2705b8b88c6e23d68981a851df0aa5e.tar.gz sandcrawler-594678f6c2705b8b88c6e23d68981a851df0aa5e.zip |
update grobid2json to include given_name/surname
Diffstat (limited to 'python/grobid2json.py')
-rwxr-xr-x | python/grobid2json.py | 10 |
1 files changed, 7 insertions, 3 deletions
```diff
diff --git a/python/grobid2json.py b/python/grobid2json.py
index ca460f8..d438d48 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -31,9 +31,13 @@ import xml.etree.ElementTree as ET
 ns = "http://www.tei-c.org/ns/1.0"
 
 def all_authors(elem):
-    names = [' '.join([e.findtext('./{%s}forename' % ns) or '', e.findtext('./{%s}surname' % ns) or '']).strip()
-            for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns))]
-    return [dict(name=n) for n in names]
+    names = []
+    for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns)):
+        given_name = e.findtext('./{%s}forename' % ns) or None
+        surname = e.findtext('./{%s}surname' % ns) or None
+        full_name = '{} {}'.format(given_name or '', surname or '').strip()
+        names.append(dict(name=full_name, given_name=given_name, surname=surname))
+    return names
 
 
 def journal_info(elem):
```