diff options
-rwxr-xr-x | covid19_tool.py | 18 | ||||
-rw-r--r-- | fatcat_covid19/derivatives.py | 2 | ||||
-rw-r--r-- | fatcat_covid19/enrich.py | 3 | ||||
-rw-r--r-- | fatcat_covid19/transform.py | 5 |
4 files changed, 19 insertions, 9 deletions
diff --git a/covid19_tool.py b/covid19_tool.py index 1cd0e48..b9bea44 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -9,7 +9,7 @@ Licensed the same as code under fatcat_covid19/ import sys import argparse -from fatcat_covid19.webface import app +from fatcat_covid19.enrich import enrich_fatcat_file from fatcat_covid19.derivatives import enrich_derivatives_file from fatcat_covid19.transform import transform_es_file @@ -40,12 +40,15 @@ def main(): sub_enrich_fatcat = subparsers.add_parser('enrich-fatcat', help="lookup fatcat releases from JSON metadata") + sub_enrich_fatcat.set_defaults( + action='enrich-fatcat', + ) sub_enrich_fatcat.add_argument('json_file', help="input JSON rows file (eg, CORD-19 parsed JSON)", type=argparse.FileType('r')) sub_enrich_fatcat.add_argument('--json-output', help="file to write to", - type=argparse.FileType('r'), + type=argparse.FileType('w'), default=sys.stdout) sub_enrich_derivatives = subparsers.add_parser('enrich-derivatives', @@ -58,7 +61,7 @@ def main(): type=argparse.FileType('r')) sub_enrich_derivatives.add_argument('--json-output', help="file to write ", - type=argparse.FileType('r'), + type=argparse.FileType('w'), default=sys.stdout) sub_enrich_derivatives.add_argument('--base-dir', help="directory to look for files (in 'pdf' subdirectory)", @@ -66,20 +69,25 @@ def main(): sub_transform_es = subparsers.add_parser('transform-es', help="transform fulltext JSON to elasticsearch schema JSON") + sub_transform_es.set_defaults( + action='transform-es', + ) sub_transform_es.add_argument('json_file', help="input JSON rows file (fulltext)", type=argparse.FileType('r')) sub_transform_es.add_argument('--json-output', help="file to write to", - type=argparse.FileType('r'), + type=argparse.FileType('w'), default=sys.stdout) args = parser.parse_args() if args.action == 'webface': + # don't import until we use app; otherwise sentry exception reporting happens + from fatcat_covid19.webface import app app.run(debug=args.debug, host=args.host, port=args.port) elif args.action == 'enrich-fatcat': - transform_es_file(args.json_file, args.json_output) + enrich_fatcat_file(args.json_file, args.json_output) elif args.action == 'enrich-derivatives': enrich_derivatives_file(args.json_file, args.json_output, args.base_dir) diff --git a/fatcat_covid19/derivatives.py b/fatcat_covid19/derivatives.py index 5ade0ef..c9339e8 100644 --- a/fatcat_covid19/derivatives.py +++ b/fatcat_covid19/derivatives.py @@ -126,7 +126,7 @@ def enrich_derivatives_file(json_input, json_output, base_dir): """ for l in json_input: l = json.loads(l) - result = do_line(l, base_dir) + result = enrich_derivatives_row(l, base_dir) if result: print(json.dumps(result, sort_keys=True), file=json_output) diff --git a/fatcat_covid19/enrich.py b/fatcat_covid19/enrich.py index 458c83d..e7d6da2 100644 --- a/fatcat_covid19/enrich.py +++ b/fatcat_covid19/enrich.py @@ -51,7 +51,7 @@ def enrich_fatcat_row(row, api_session): if fatcat_release: row['fatcat_release'] = fatcat_release row['release_id'] = fatcat_release['ident'] - print(json.dumps(row, sort_keys=True)) + return row def enrich_fatcat_file(json_input, json_output): @@ -65,3 +65,4 @@ def enrich_fatcat_file(json_input, json_output): result = enrich_fatcat_row(l, api_session) if result: print(json.dumps(result, sort_keys=True), file=json_output) + diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py index c31c9f4..16774ab 100644 --- a/fatcat_covid19/transform.py +++ b/fatcat_covid19/transform.py @@ -68,7 +68,8 @@ def fulltext_to_elasticsearch(row, force_bool=True): if release.get('abstracts'): for a in release['abstracts']: abstracts.append(a['content']) - abstract_langs.append(a['lang']) + if a.get('lang'): + abstract_langs.append(a['lang']) contrib_names = [] contrib_affiliations = [] @@ -199,6 +200,6 @@ def transform_es_file(json_input, json_output): """ for l in json_input: l = json.loads(l) - result = fulltext_to_elasticsearch(l, args) + result = fulltext_to_elasticsearch(l) if result: print(json.dumps(result, sort_keys=True), file=json_output) |