diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-05-13 16:41:45 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2019-05-13 16:41:45 -0700 | 
| commit | 594678f6c2705b8b88c6e23d68981a851df0aa5e (patch) | |
| tree | 955934c9dc84b1fcb804c4216bc47390865bfafe /python | |
| parent | 5b0aad1d8d100832c4a73dd006d8196aa995b4f0 (diff) | |
| download | sandcrawler-594678f6c2705b8b88c6e23d68981a851df0aa5e.tar.gz sandcrawler-594678f6c2705b8b88c6e23d68981a851df0aa5e.zip  | |
update grobid2json to include given_name/surname
Diffstat (limited to 'python')
| Mode | File | Lines changed |
|---|---|---|
| -rwxr-xr-x | python/grobid2json.py | 10 |
| -rw-r--r-- | python/tests/files/small.json | 6 |
2 files changed, 10 insertions, 6 deletions
```diff
diff --git a/python/grobid2json.py b/python/grobid2json.py
index ca460f8..d438d48 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -31,9 +31,13 @@ import xml.etree.ElementTree as ET
 ns = "http://www.tei-c.org/ns/1.0"
 
 def all_authors(elem):
-    names = [' '.join([e.findtext('./{%s}forename' % ns) or '', e.findtext('./{%s}surname' % ns) or '']).strip()
-            for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns))]
-    return [dict(name=n) for n in names]
+    names = []
+    for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns)):
+        given_name = e.findtext('./{%s}forename' % ns) or None
+        surname = e.findtext('./{%s}surname' % ns) or None
+        full_name = '{} {}'.format(given_name or '', surname or '').strip()
+        names.append(dict(name=full_name, given_name=given_name, surname=surname))
+    return names
 
 
 def journal_info(elem):
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index 208fb49..49a5671 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -1,8 +1,8 @@
 {
   "title": "Dummy Example File",
   "authors": [
-    {"name": "Brewster Kahle"},
-    {"name": "J Doe"}
+    {"name": "Brewster Kahle", "given_name": "Brewster", "surname": "Kahle"},
+    {"name": "J Doe", "given_name": "J", "surname": "Doe"}
   ],
   "journal": {
     "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
@@ -15,7 +15,7 @@
   "date": "2000",
   "doi": null,
   "citations": [
-    { "authors": [{"name": "A Seaperson"}],
+    { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}],
       "date": "2001",
       "id": "b0",
       "index": 0,
```
