diff options
-rwxr-xr-x | covid19_tool.py | 14 | ||||
-rw-r--r--[-rwxr-xr-x] | fatcat_covid19/transform.py (renamed from elastic_transform.py) | 33 |
2 files changed, 21 insertions, 26 deletions
diff --git a/covid19_tool.py b/covid19_tool.py index 23a2c6c..1cf8dce 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -10,6 +10,8 @@ import sys import argparse from fatcat_covid19.webface import app +from fatcat_covid19.derivatives import enrich_derivatives_file +from fatcat_covid19.transform import transform_es_file def main(): @@ -58,6 +60,16 @@ def main(): help="directory to look for files (in 'pdf' subdirectory)", default="fulltext_web") + sub_transform_es = subparsers.add_parser('transform-es', + help="transform fulltext JSON to elasticsearch schema JSON") + sub_transform_es.add_argument('json_file', + help="input JSON rows file (fulltext)", + type=argparse.FileType('r')) + sub_transform_es.add_argument('--json-output', + help="file to write to", + type=argparse.FileType('r'), + default=sys.stdout) + args = parser.parse_args() if args.action == 'webface': @@ -65,6 +77,8 @@ def main(): if args.action == 'derivatives': enrich_derivatives_file(args.json_file, args.json_output, args.base_dir) + if args.action == 'transform-es': + transform_es_file(args.json_file, args.json_output) else: print("tell me what to do!") sys.exit(-1) diff --git a/elastic_transform.py b/fatcat_covid19/transform.py index 04fba33..c31c9f4 100755..100644 --- a/elastic_transform.py +++ b/fatcat_covid19/transform.py @@ -1,9 +1,3 @@ -#!/usr/bin/env python3 - -""" -Takes *enriched* JSON objects which include fatcat metadata and fulltext -content, and outputs JSON lines in fatcat_fulltext schema. -""" import sys import json @@ -198,26 +192,13 @@ def fulltext_to_elasticsearch(row, force_bool=True): return t -def run(args): - for l in args.json_file: +def transform_es_file(json_input, json_output): + """ + Takes *enriched* JSON objects which include fatcat metadata and fulltext + content, and outputs JSON lines in fatcat_fulltext schema. + """ + for l in json_input: l = json.loads(l) result = fulltext_to_elasticsearch(l, args) if result: - print(json.dumps(result, sort_keys=True)) - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="fulltext content input", - type=argparse.FileType('r')) - subparsers = parser.add_subparsers() - - args = parser.parse_args() - args.session = requests_retry_session() - - run(args) - -if __name__ == '__main__': - main() - + print(json.dumps(result, sort_keys=True), file=json_output) |