diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-02 19:10:21 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-02 19:10:21 -0700 |
commit | 1c3e95612d56681fede45f8c8ec3991f9cb9ac1d (patch) | |
tree | 6ac1bdf5b4579f0eab786e8d74e8a33f0eddd899 /python | |
parent | 79ebb924dbd3c724ecfbe2b00e18f0ae7e51d4a6 (diff) | |
download | sandcrawler-1c3e95612d56681fede45f8c8ec3991f9cb9ac1d.tar.gz sandcrawler-1c3e95612d56681fede45f8c8ec3991f9cb9ac1d.zip |
grobid_affiliations fix from prod, and usage example
Diffstat (limited to 'python')
-rwxr-xr-x | python/scripts/grobid_affiliations.py | 5 |
1 file changed, 5 insertions, 0 deletions
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index 2b1d29e..79feac1 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -4,6 +4,10 @@ Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction output, converts the XML to JSON, filters out raw affiliation strings, and dumps these as JSON subset. + +Run in bulk like: + + ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations' """ import sys @@ -41,6 +45,7 @@ def run(mode='hbase'): if affiliations: # don't duplicate affiliations; only the unique ones affiliations = list(set([json.dumps(a) for a in affiliations])) + affiliations = [json.loads(a) for a in affiliations] print('\t'.join([sha1hex, json.dumps(affiliations)])) if __name__=='__main__': |