diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-02 18:02:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-02 18:02:55 -0700 |
commit | f2736d6af81618fede2de76954cc6914c30706fb (patch) | |
tree | ea5385f2256d0e3eb92102be56bf2dd64756d975 /python | |
parent | 910e59e9011935fecaae62520eb3fc30cbd65800 (diff) | |
download | sandcrawler-f2736d6af81618fede2de76954cc6914c30706fb.tar.gz sandcrawler-f2736d6af81618fede2de76954cc6914c30706fb.zip |
grobid affiliation extractor (script)
Diffstat (limited to 'python')
-rwxr-xr-x | python/scripts/grobid_affiliations.py | 47 |
1 files changed, 47 insertions, 0 deletions
#!/usr/bin/env python3

"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
dumps these as JSON subset.

Input is read line-by-line from stdin. For every document with at least one
author affiliation, one line is written to stdout:

    <sha1hex> TAB <JSON list of unique affiliations>

Usage:

    grobid_affiliations.py [hbase|pg]     (defaults to "hbase")
"""

import sys
import json

from grobid2json import teixml2json


def parse_hbase(line):
    """Parse one HBase-style line: ``<sha1hex> TAB <JSON object>``.

    Returns a ``(sha1hex, tei_xml)`` tuple, where ``tei_xml`` is the value of
    the ``tei_xml`` key of the JSON object.

    Raises ValueError if the line does not have exactly two tab-separated
    fields, and KeyError if the JSON object has no ``tei_xml`` key.
    """
    fields = line.split('\t')
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(fields) != 2:
        raise ValueError(
            'expected 2 tab-separated fields, got {}'.format(len(fields)))
    sha1hex, payload = fields
    obj = json.loads(payload)
    return sha1hex, obj['tei_xml']


def parse_pg(line):
    """Parse one pg-style line: a JSON object with ``sha1hex`` and ``tei_xml``.

    Returns a ``(sha1hex, tei_xml)`` tuple. Raises KeyError if either key is
    missing.
    """
    obj = json.loads(line)
    return obj['sha1hex'], obj['tei_xml']


def _unique_affiliations(authors):
    """Return the non-empty ``affiliation`` values of *authors*, de-duplicated.

    Order of first appearance is kept so output is deterministic between
    runs. Affiliations are compared by their canonical (sorted-key) JSON
    form, so dicts that differ only in key order count as duplicates.
    """
    seen = set()
    unique = []
    for author in authors:
        affiliation = author.get('affiliation')
        if not affiliation:
            continue
        # Affiliations may be unhashable (e.g. dicts), so use a JSON string
        # as the set key but keep the original object for output.
        key = json.dumps(affiliation, sort_keys=True)
        if key not in seen:
            seen.add(key)
            unique.append(affiliation)
    return unique


def run(mode='hbase'):
    """Read wrapped GROBID output from stdin and print affiliations per document.

    mode selects the input wrapper format: 'hbase' (see parse_hbase) or 'pg'
    (see parse_pg). Any other value raises NotImplementedError before any
    input is consumed.

    Documents without affiliations produce no output line.
    """
    parsers = {'hbase': parse_hbase, 'pg': parse_pg}
    if mode not in parsers:
        raise NotImplementedError('parse mode: {}'.format(mode))
    parse = parsers[mode]

    for line in sys.stdin:
        # Tolerate blank lines (e.g. a trailing newline at end of input).
        if not line.strip():
            continue
        sha1hex, tei_xml = parse(line)

        obj = teixml2json(tei_xml, encumbered=False)

        affiliations = _unique_affiliations(obj.get('authors') or [])
        if affiliations:
            # Serialize exactly once: the affiliations are emitted as JSON
            # values, not as a list of already-encoded JSON strings.
            print('\t'.join([sha1hex, json.dumps(affiliations)]))


if __name__ == '__main__':
    # Optional first argument picks the input format; default is unchanged.
    run(sys.argv[1] if len(sys.argv) > 1 else 'hbase')