diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:50:17 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:50:17 -0700 |
commit | 826c7538e091fac14d987a3cd654975da964e240 (patch) | |
tree | 90345b4cabb461c624ca5a218c2fc01dce3055cd /python/scripts/grobid_affiliations.py | |
parent | 020037d4714e7ba2ab172c7278494aed0b2148ad (diff) | |
download | sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip |
make fmt (black 21.9b0)
Diffstat (limited to 'python/scripts/grobid_affiliations.py')
-rwxr-xr-x | python/scripts/grobid_affiliations.py | 24 |
1 file changed, 12 insertions, 12 deletions
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index b42153c..b01e46a 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -16,40 +16,40 @@ from grobid2json import teixml2json def parse_hbase(line): - line = line.split('\t') + line = line.split("\t") assert len(line) == 2 sha1hex = line[0] obj = json.loads(line[1]) - tei_xml = obj['tei_xml'] + tei_xml = obj["tei_xml"] return sha1hex, tei_xml def parse_pg(line): obj = json.loads(line) - return obj['sha1hex'], obj['tei_xml'] + return obj["sha1hex"], obj["tei_xml"] -def run(mode='hbase'): +def run(mode="hbase"): for line in sys.stdin: - if mode == 'hbase': + if mode == "hbase": sha1hex, tei_xml = parse_hbase(line) - elif mode == 'pg': + elif mode == "pg": sha1hex, tei_xml = parse_pg(line) else: - raise NotImplementedError('parse mode: {}'.format(mode)) + raise NotImplementedError("parse mode: {}".format(mode)) obj = teixml2json(tei_xml, encumbered=False) affiliations = [] - for author in obj['authors']: - if author.get('affiliation'): - affiliations.append(author['affiliation']) + for author in obj["authors"]: + if author.get("affiliation"): + affiliations.append(author["affiliation"]) if affiliations: # don't duplicate affiliations; only the unique ones affiliations = list(set([json.dumps(a) for a in affiliations])) affiliations = [json.loads(a) for a in affiliations] - print('\t'.join([sha1hex, json.dumps(affiliations)])) + print("\t".join([sha1hex, json.dumps(affiliations)])) -if __name__ == '__main__': +if __name__ == "__main__": run() |