about | summary | refs | log | tree | commit | diff | stats
path: root/python/scripts/grobid_affiliations.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:54:37 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:54:37 -0700
commit 05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
tree abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/scripts/grobid_affiliations.py
parent f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
download sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz
sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip
make fmt
Diffstat (limited to 'python/scripts/grobid_affiliations.py')
-rwxr-xr-x python/scripts/grobid_affiliations.py 6
1 files changed, 4 insertions, 2 deletions
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index d391f60..b42153c 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
@@ -24,10 +23,12 @@ def parse_hbase(line):
tei_xml = obj['tei_xml']
return sha1hex, tei_xml
+
def parse_pg(line):
obj = json.loads(line)
return obj['sha1hex'], obj['tei_xml']
+
def run(mode='hbase'):
for line in sys.stdin:
if mode == 'hbase':
@@ -49,5 +50,6 @@ def run(mode='hbase'):
affiliations = [json.loads(a) for a in affiliations]
print('\t'.join([sha1hex, json.dumps(affiliations)]))
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()