about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-04-08 15:59:59 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-04-08 15:59:59 -0700
commit 858147b071103c505bff643e35b503c623f20284 (patch)
tree acfc5cdba1315cf99ae31aac0cf969623e81f1e5
parent 1552f5dcd8b0abe89f53182c7a495a7b8dc008fc (diff)
download fatcat-covid19-858147b071103c505bff643e35b503c623f20284.tar.gz
fatcat-covid19-858147b071103c505bff643e35b503c623f20284.zip
refactor parse_cord19_csv.py into tool
-rwxr-xr-x bin/parse_cord19_csv.py 21
-rwxr-xr-x covid19_tool.py 52
-rw-r--r-- fatcat_covid19/parse.py 21
3 files changed, 55 insertions, 39 deletions
diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py
deleted file mode 100755
index dbc6cc5..0000000
--- a/bin/parse_cord19_csv.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Trivial helper to transform the CORD-19 CSV file to JSON, and rename a couple
-of the column keys.
-"""
-
-import sys
-import csv
-import json
-
-CSVFILE = sys.argv[1]
-
-with open(CSVFILE, newline='') as csvfile:
- reader = csv.DictReader(csvfile)
- for row in reader:
- row = dict(row)
- row['mag_id'] = row.pop('Microsoft Academic Paper ID')
- row['who_covidence_id'] = row.pop('WHO #Covidence').replace('#', '')
- obj = dict(cord19_paper=row)
- print(json.dumps(obj, sort_keys=True))
diff --git a/covid19_tool.py b/covid19_tool.py
index b9bea44..345aa6e 100755
--- a/covid19_tool.py
+++ b/covid19_tool.py
@@ -9,6 +9,7 @@ Licensed the same as code under fatcat_covid19/
import sys
import argparse
+from fatcat_covid19.parse import parse_cord19_file
from fatcat_covid19.enrich import enrich_fatcat_file
from fatcat_covid19.derivatives import enrich_derivatives_file
from fatcat_covid19.transform import transform_es_file
@@ -22,21 +23,18 @@ def main():
)
subparsers = parser.add_subparsers()
- sub_webface = subparsers.add_parser('webface',
- help="run flask web interface")
- sub_webface.set_defaults(
- action='webface',
+ sub_parse_cord = subparsers.add_parser('parse-cord19',
+ help="parse a CORD-19 CSV file into JSON")
+ sub_parse_cord.set_defaults(
+ action='parse-cord19',
)
- sub_webface.add_argument('--debug',
- action='store_true',
- help="enable debugging interface (note: not for everything)")
- sub_webface.add_argument('--host',
- default="127.0.0.1",
- help="listen on this host/IP")
- sub_webface.add_argument('--port',
- type=int,
- default=9119,
- help="listen on this port")
+ sub_parse_cord.add_argument('csv_path',
+ help="input CSV file path",
+ type=str)
+ sub_parse_cord.add_argument('--json-output',
+ help="file to write to",
+ type=argparse.FileType('w'),
+ default=sys.stdout)
sub_enrich_fatcat = subparsers.add_parser('enrich-fatcat',
help="lookup fatcat releases from JSON metadata")
@@ -80,12 +78,26 @@ def main():
type=argparse.FileType('w'),
default=sys.stdout)
+ sub_webface = subparsers.add_parser('webface',
+ help="run flask web interface")
+ sub_webface.set_defaults(
+ action='webface',
+ )
+ sub_webface.add_argument('--debug',
+ action='store_true',
+ help="enable debugging interface (note: not for everything)")
+ sub_webface.add_argument('--host',
+ default="127.0.0.1",
+ help="listen on this host/IP")
+ sub_webface.add_argument('--port',
+ type=int,
+ default=9119,
+ help="listen on this port")
+
args = parser.parse_args()
- if args.action == 'webface':
- # don't import until we use app; otherwise sentry exception reporting happens
- from fatcat_covid19.webface import app
- app.run(debug=args.debug, host=args.host, port=args.port)
+ if args.action == 'parse-cord19':
+ parse_cord19_file(args.csv_path, args.json_output)
elif args.action == 'enrich-fatcat':
enrich_fatcat_file(args.json_file, args.json_output)
elif args.action == 'enrich-derivatives':
@@ -93,6 +105,10 @@ def main():
args.base_dir)
elif args.action == 'transform-es':
transform_es_file(args.json_file, args.json_output)
+ elif args.action == 'webface':
+ # don't import until we use app; otherwise sentry exception reporting happens
+ from fatcat_covid19.webface import app
+ app.run(debug=args.debug, host=args.host, port=args.port)
else:
print("tell me what to do!")
sys.exit(-1)
diff --git a/fatcat_covid19/parse.py b/fatcat_covid19/parse.py
new file mode 100644
index 0000000..ce2bf26
--- /dev/null
+++ b/fatcat_covid19/parse.py
@@ -0,0 +1,21 @@
+
+import sys
+import csv
+import json
+
+
+def parse_cord19_file(csv_path, json_output):
+    """
+    Transform the CORD-19 CSV at `csv_path` into JSON lines written to
+    `json_output` (a writable text file), renaming two of the column keys.
+    """
+    # newline='' per the csv module docs; explicit UTF-8 avoids locale defaults
+    with open(csv_path, newline='', encoding='utf-8') as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            row = dict(row)
+            row['mag_id'] = row.pop('Microsoft Academic Paper ID')
+            row['who_covidence_id'] = row.pop('WHO #Covidence').replace('#', '')
+            obj = dict(cord19_paper=row)
+            print(json.dumps(obj, sort_keys=True), file=json_output)
+