diff options
Diffstat (limited to 'python/scripts/grobid_affiliations.py')
-rwxr-xr-x | python/scripts/grobid_affiliations.py | 37 |
1 files changed, 21 insertions, 16 deletions
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 79feac1..90a0f77 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-
 """
 Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
 output, converts the XML to JSON, filters out raw affiliation strings, and
@@ -10,43 +9,49 @@ Run in bulk like:
     ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
 """
 
-import sys
 import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
 
-from grobid2json import teixml2json
 
 def parse_hbase(line):
-    line = line.split('\t')
+    line = line.split("\t")
     assert len(line) == 2
     sha1hex = line[0]
     obj = json.loads(line[1])
-    tei_xml = obj['tei_xml']
+    tei_xml = obj["tei_xml"]
     return sha1hex, tei_xml
 
+
 def parse_pg(line):
     obj = json.loads(line)
-    return obj['sha1hex'], obj['tei_xml']
+    return obj["sha1hex"], obj["tei_xml"]
+
 
-def run(mode='hbase'):
+def run(mode="hbase"):
     for line in sys.stdin:
-        if mode == 'hbase':
+        if mode == "hbase":
             sha1hex, tei_xml = parse_hbase(line)
-        elif mode == 'pg':
+        elif mode == "pg":
             sha1hex, tei_xml = parse_pg(line)
         else:
-            raise NotImplementedError('parse mode: {}'.format(mode))
+            raise NotImplementedError("parse mode: {}".format(mode))
 
-        obj = teixml2json(tei_xml, encumbered=False)
+        tei_doc = parse_document_xml(tei_xml)
+        tei_doc.remove_encumbered()
+        obj = tei_doc.to_legacy_dict()
 
         affiliations = []
-        for author in obj['authors']:
-            if author.get('affiliation'):
-                affiliations.append(author['affiliation'])
+        for author in obj["authors"]:
+            if author.get("affiliation"):
+                affiliations.append(author["affiliation"])
         if affiliations:
             # don't duplicate affiliations; only the unique ones
             affiliations = list(set([json.dumps(a) for a in affiliations]))
             affiliations = [json.loads(a) for a in affiliations]
-            print('\t'.join([sha1hex, json.dumps(affiliations)]))
+            print("\t".join([sha1hex, json.dumps(affiliations)]))
+
 
-if __name__=='__main__':
+if __name__ == "__main__":
     run()