diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-02 17:56:44 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-02 17:56:44 -0700 |
commit | 3f16467eb83ae95c8ecf7a3ce261ea8ee0ccc509 (patch) | |
tree | 0eb5fdece71c942b5c52286fe517ac8239ba1dfb | |
parent | 0da57660b034e051a45e84b18bd142f8dd6be927 (diff) | |
download | sandcrawler-3f16467eb83ae95c8ecf7a3ce261ea8ee0ccc509.tar.gz sandcrawler-3f16467eb83ae95c8ecf7a3ce261ea8ee0ccc509.zip |
have grobid2json extract full names and affiliations
-rwxr-xr-x | python/grobid2json.py | 32 |
1 files changed, 27 insertions, 5 deletions
```diff
diff --git a/python/grobid2json.py b/python/grobid2json.py
index f3577b0..e5af9d3 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -32,11 +32,33 @@ ns = "http://www.tei-c.org/ns/1.0"
 
 def all_authors(elem):
     names = []
-    for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns)):
-        given_name = e.findtext('./{%s}forename' % ns) or None
-        surname = e.findtext('./{%s}surname' % ns) or None
-        full_name = '{} {}'.format(given_name or '', surname or '').strip()
-        names.append(dict(name=full_name, given_name=given_name, surname=surname))
+    for author in elem.findall('.//{%s}author' % ns):
+        pn = author.find('./{%s}persName' % ns)
+        if not pn:
+            continue
+        given_name = pn.findtext('./{%s}forename' % ns) or None
+        surname = pn.findtext('./{%s}surname' % ns) or None
+        full_name = ' '.join(pn.itertext())
+        obj = dict(name=full_name, given_name=given_name, surname=surname)
+        ae = author.find('./{%s}affiliation' % ns)
+        if ae:
+            affiliation = dict()
+            for on in ae.findall('./{%s}orgName' % ns):
+                affiliation[on.get('type')] = on.text
+            addr_e = ae.find('./{%s}address' % ns)
+            if addr_e:
+                address = dict()
+                for t in addr_e.getchildren():
+                    address[t.tag.split('}')[-1]] = t.text
+                if address:
+                    affiliation['address'] = address
+                #affiliation['address'] = {
+                #    'post_code': addr.findtext('./{%s}postCode' % ns) or None,
+                #    'settlement': addr.findtext('./{%s}settlement' % ns) or None,
+                #    'country': addr.findtext('./{%s}country' % ns) or None,
+                #}
+            obj['affiliation'] = affiliation
+        names.append(obj)
     return names
 
 
```