diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 15:01:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 15:01:19 -0700 |
commit | 913a3d7b2fe585602124e2092dd44afa760086e8 (patch) | |
tree | d754c7a70a42a400c21d2ea4f7fcd8703e032cad | |
parent | 9598a4c14800f8ec2543b26872565b1c3b9d2677 (diff) | |
download | fatcat-covid19-913a3d7b2fe585602124e2092dd44afa760086e8.tar.gz fatcat-covid19-913a3d7b2fe585602124e2092dd44afa760086e8.zip |
refactor derivatives into CLI tool
-rwxr-xr-x | covid19_tool.py | 21 | ||||
-rw-r--r--[-rwxr-xr-x] | fatcat_covid19/derivatives.py (renamed from cord19_fatcat_derivatives.py) | 106 |
2 files changed, 61 insertions, 66 deletions
diff --git a/covid19_tool.py b/covid19_tool.py index 7a565b8..23a2c6c 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -37,7 +37,7 @@ def main(): help="listen on this port") sub_enrich = subparsers.add_parser('enrich', - help="enrich CORD-19 dataset (JSON) with fatcat metadata (prints to stdout)") + help="enrich CORD-19 dataset (JSON) with fatcat metadata") sub_enrich.set_defaults( action='enrich', ) @@ -45,13 +45,26 @@ def main(): help="CORD-19 parsed JSON file", type=argparse.FileType('r')) + sub_derivatives = subparsers.add_parser('derivatives', + help="enrich JSON rows with existing derivative files") + sub_derivatives.add_argument('json_file', + help="enriched (with fatcat_release) metadata file", + type=argparse.FileType('r')) + sub_derivatives.add_argument('--json-output', + help="file to write to (default: stdout)", + type=argparse.FileType('w'), + default=sys.stdout) + sub_derivatives.add_argument('--base-dir', + help="directory to look for files (in 'pdf' subdirectory)", + default="fulltext_web") + args = parser.parse_args() if args.action == 'webface': app.run(debug=args.debug, host=args.host, port=args.port) - if args.action == 'enrich': - # TODO - pass + elif args.action == 'derivatives': + enrich_derivatives_file(args.json_file, args.json_output, + args.base_dir) else: print("tell me what to do!") sys.exit(-1) diff --git a/cord19_fatcat_derivatives.py b/fatcat_covid19/derivatives.py index 8b5b679..5ade0ef 100755..100644 --- a/cord19_fatcat_derivatives.py +++ b/fatcat_covid19/derivatives.py @@ -1,35 +1,3 @@ -#!/usr/bin/env python3 - -""" -Takes *enriched* JSON objects which include a fatcat_release key/entity, and -populate fulltext content and metadata. - -This script *only* looks for existing local files. 
- -Keys added: - -- fulltext_status: whether we could fetch or not (always added) -- fulltext_file: fatcat file entity, plus - - pdf_path - - pdftotext_path (if exists) - - thumbnail_path (if exists) - - grobid_xml_path (if exists) - - grobid_json_path (if exists) -- fulltext_grobid: grobid2json format, including: - - title - - authors - - journal - - abstract - - body - - acknowledgement - - annex - - language_code - - glutton_fatcat_release (renamed from fatcat_release) -- fulltext_pdftotext: only if fulltext_grobid not set - - body - -TODO: refactor into fatcat_covid19 module and CLI wrapper -""" import sys import json @@ -39,7 +7,35 @@ import datetime from fatcat_covid19.common import * -def do_line(row, args): +def enrich_derivatives_row(row, base_dir): + """ + Takes *enriched* JSON objects which include a fatcat_release key/entity, and + populate fulltext content and metadata. + + This script *only* looks for existing local files. + + Keys added: + + - fulltext_status: whether we could fetch or not (always added) + - fulltext_file: fatcat file entity, plus + - pdf_path + - pdftotext_path (if exists) + - thumbnail_path (if exists) + - grobid_xml_path (if exists) + - grobid_json_path (if exists) + - fulltext_grobid: grobid2json format, including: + - title + - authors + - journal + - abstract + - body + - acknowledgement + - annex + - language_code + - glutton_fatcat_release (renamed from fatcat_release) + - fulltext_pdftotext: only if fulltext_grobid not set + - body + """ if 'fulltext_file' in row: return row @@ -49,7 +45,7 @@ def do_line(row, args): if not row['fatcat_release'].get('files'): row['fulltext_status'] = 'no-file' return row - fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=args.base_dir) + fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir) if not fulltext_file: row['fulltext_status'] = 'no-local-file' return row @@ -61,31 +57,31 @@ def do_line(row, args): fulltext_file['sha1'], 
directory="pdf/", file_suffix=".pdf", - base_dir=args.base_dir, + base_dir=base_dir, ) fulltext_file['pdftotext_path'] = blob_path( fulltext_file['sha1'], directory="pdftotext/", file_suffix=".txt", - base_dir=args.base_dir, + base_dir=base_dir, ) fulltext_file['thumbnail_path'] = blob_path( fulltext_file['sha1'], directory="thumbnail/", file_suffix=".png", - base_dir=args.base_dir, + base_dir=base_dir, ) fulltext_file['grobid_xml_path'] = blob_path( fulltext_file['sha1'], directory="grobid/", file_suffix=".xml", - base_dir=args.base_dir, + base_dir=base_dir, ) fulltext_file['grobid_json_path'] = blob_path( fulltext_file['sha1'], directory="grobid/", file_suffix=".json", - base_dir=args.base_dir, + base_dir=base_dir, ) # check if derivatives actually exist @@ -122,29 +118,15 @@ def do_line(row, args): row['fulltext_status'] = 'success-grobid' return row -def run(args): - for l in args.json_file: +def enrich_derivatives_file(json_input, json_output, base_dir): + """ + Reads lines from json_input (an open, readable file or similar), looks for + existing derivative files in base_dir (a path str), and writes string JSON + lines to json_output (an open, writable file or similar). + """ + for l in json_input: l = json.loads(l) - result = do_line(l, args) + result = enrich_derivatives_row(l, base_dir) if result: - print(json.dumps(result, sort_keys=True)) - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="enriched (with fatcat_release) metadata file", - type=argparse.FileType('r')) - parser.add_argument('--base-dir', - help="directory to look for files (in 'pdf' subdirectory)", - default="fulltext_web") - subparsers = parser.add_subparsers() - - args = parser.parse_args() - args.session = requests_retry_session() - - run(args) - -if __name__ == '__main__': - main() + print(json.dumps(result, sort_keys=True), file=json_output) |