diff options
-rwxr-xr-x | python/scripts/grobid_affiliations.py | 47 |
1 files changed, 47 insertions, 0 deletions
#!/usr/bin/env python3

"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, extracts the authors' affiliation entries,
and dumps these as a JSON subset.

Input is read from stdin, one record per line. For every record with at least
one affiliation, a line is written to stdout containing the SHA-1 hex
identifier and a JSON list of the unique affiliation entries, separated by a
tab.

Usage: grobid_affiliations.py [hbase|pg]    (defaults to hbase)
"""

import sys
import json


def parse_hbase(line):
    """Parse one old-style (HBase) line: ``<sha1hex>\\t<json>``.

    The JSON part must be an object with a ``tei_xml`` key.

    Returns a ``(sha1hex, tei_xml)`` tuple.

    Raises ValueError if the line does not have exactly two tab-separated
    fields (json.JSONDecodeError, raised for a malformed JSON part, is a
    ValueError subclass too).
    """
    fields = line.split('\t')
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(fields) != 2:
        raise ValueError(
            'expected 2 tab-separated fields, got {}'.format(len(fields)))
    sha1hex, raw_json = fields
    return sha1hex, json.loads(raw_json)['tei_xml']


def parse_pg(line):
    """Parse one new-style (pg) line: a JSON object with ``sha1hex`` and
    ``tei_xml`` keys.

    Returns a ``(sha1hex, tei_xml)`` tuple.
    """
    record = json.loads(line)
    return record['sha1hex'], record['tei_xml']


def _unique_affiliations(authors):
    """Return the distinct truthy ``affiliation`` values of `authors`.

    Entries are compared by their canonical JSON serialization (they may be
    unhashable), and the original objects are returned in first-seen order so
    the output is stable from run to run.
    """
    by_key = {}
    for author in authors:
        affiliation = author.get('affiliation')
        if affiliation:
            by_key.setdefault(json.dumps(affiliation, sort_keys=True),
                              affiliation)
    # Relies on dicts preserving insertion order (Python 3.7+).
    return list(by_key.values())


def run(mode='hbase'):
    """Read records from stdin and print their unique affiliations to stdout.

    `mode` selects the input format: 'hbase' (see parse_hbase) or 'pg' (see
    parse_pg). Records without any affiliation produce no output line.

    Raises NotImplementedError for any other `mode`.
    """
    parsers = {'hbase': parse_hbase, 'pg': parse_pg}
    # Validate once, up front, instead of re-checking the mode for every line.
    if mode not in parsers:
        raise NotImplementedError('parse mode: {}'.format(mode))
    parse = parsers[mode]

    # Imported here rather than at module level so the parsing helpers remain
    # importable when the project-local grobid2json module is not on the path.
    from grobid2json import teixml2json

    for line in sys.stdin:
        sha1hex, tei_xml = parse(line)
        obj = teixml2json(tei_xml, encumbered=False)
        affiliations = _unique_affiliations(obj['authors'])
        if affiliations:
            # Dump the affiliation entries themselves; serializing them to
            # strings first would double-encode the JSON in the output.
            print('\t'.join([sha1hex, json.dumps(affiliations)]))


if __name__ == '__main__':
    # An optional first argument picks the parse mode; with no argument the
    # default of run() ('hbase') applies.
    run(*sys.argv[1:2])