aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-10-02 18:02:55 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-10-02 18:02:55 -0700
commit f2736d6af81618fede2de76954cc6914c30706fb (patch)
tree ea5385f2256d0e3eb92102be56bf2dd64756d975
parent 910e59e9011935fecaae62520eb3fc30cbd65800 (diff)
downloadsandcrawler-f2736d6af81618fede2de76954cc6914c30706fb.tar.gz
sandcrawler-f2736d6af81618fede2de76954cc6914c30706fb.zip
grobid affiliation extractor (script)
-rwxr-xr-x python/scripts/grobid_affiliations.py 47
1 files changed, 47 insertions, 0 deletions
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
new file mode 100755
index 0000000..2b1d29e
--- /dev/null
+++ b/python/scripts/grobid_affiliations.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+"""
+Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
+output, converts the XML to JSON, extracts the raw affiliations of the
+authors, and dumps the unique ones for each document as a JSON list.
+"""
+
+import sys
+import json
+
+from grobid2json import teixml2json
+
+def parse_hbase(line):
+    """
+    Parse one line of an old-style (HBase) dump.
+
+    The line must consist of exactly two tab-separated columns: a key, which
+    is returned unchanged as the sha1hex, and a JSON object with a 'tei_xml'
+    field.
+
+    Returns a (sha1hex, tei_xml) tuple. Raises ValueError if the line does
+    not have exactly two columns or the second column is not valid JSON
+    (json.JSONDecodeError is a ValueError subclass), and KeyError if the
+    JSON object has no 'tei_xml' key.
+    """
+    columns = line.split('\t')
+    # Explicit check rather than `assert`: asserts are stripped under
+    # `python -O`, which would silently accept rows with extra columns.
+    if len(columns) != 2:
+        raise ValueError(
+            'expected 2 tab-separated columns, got {}'.format(len(columns)))
+    sha1hex, raw_json = columns
+    return sha1hex, json.loads(raw_json)['tei_xml']
+
+def parse_pg(line):
+    """
+    Parse one line of a new-style (pg) dump: a single JSON object.
+
+    Returns a (sha1hex, tei_xml) tuple taken from the object's 'sha1hex' and
+    'tei_xml' fields.
+    """
+    record = json.loads(line)
+    sha1hex = record['sha1hex']
+    tei_xml = record['tei_xml']
+    return sha1hex, tei_xml
+
+def run(mode='hbase'):
+    """
+    Read wrapper lines from stdin and print the unique affiliations of each.
+
+    mode selects the line parser: 'hbase' (parse_hbase) or 'pg' (parse_pg).
+    Any other value raises NotImplementedError before any input is read.
+
+    For every input line whose authors carry at least one non-empty
+    'affiliation', one line is printed to stdout: the sha1hex, a tab, and a
+    JSON list of the distinct affiliation values in first-seen order. Lines
+    without any affiliation produce no output.
+    """
+    # The parser does not depend on the line, so pick it once up front.
+    if mode == 'hbase':
+        parse_line = parse_hbase
+    elif mode == 'pg':
+        parse_line = parse_pg
+    else:
+        raise NotImplementedError('parse mode: {}'.format(mode))
+
+    for line in sys.stdin:
+        sha1hex, tei_xml = parse_line(line)
+        obj = teixml2json(tei_xml, encumbered=False)
+
+        # Deduplicate on a canonical JSON encoding (sorted keys) so that
+        # values which are not hashable can still be compared, while keeping
+        # the original value for output. A dict preserves insertion order,
+        # which makes the output deterministic (a set would not).
+        unique = {}
+        for author in obj['authors']:
+            affiliation = author.get('affiliation')
+            if affiliation:
+                key = json.dumps(affiliation, sort_keys=True)
+                unique.setdefault(key, affiliation)
+        if unique:
+            # Emit the affiliation values themselves, not their JSON-encoded
+            # strings, so the column is not double-encoded.
+            print('\t'.join([sha1hex, json.dumps(list(unique.values()))]))
+
+if __name__ == '__main__':
+    # An optional first command-line argument selects the input format
+    # ('hbase' or 'pg'); with no argument run() uses its default ('hbase').
+    run(*sys.argv[1:2])