about summary refs log tree commit diff stats
path: root/python/scripts/grobid_affiliations.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/grobid_affiliations.py')
-rwxr-xr-x python/scripts/grobid_affiliations.py 5
1 file changed, 5 insertions, 0 deletions
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 2b1d29e..79feac1 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -4,6 +4,10 @@
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
dumps these as JSON subset.
+
+Run in bulk like:
+
+ ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""
import sys
@@ -41,6 +45,7 @@ def run(mode='hbase'):
if affiliations:
# don't duplicate affiliations; only the unique ones
affiliations = list(set([json.dumps(a) for a in affiliations]))
+ affiliations = [json.loads(a) for a in affiliations]
print('\t'.join([sha1hex, json.dumps(affiliations)]))
if __name__=='__main__':