From 913a3d7b2fe585602124e2092dd44afa760086e8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Apr 2020 15:01:19 -0700 Subject: refactor derivatives into CLI tool --- cord19_fatcat_derivatives.py | 150 ------------------------------------------ covid19_tool.py | 21 ++++-- fatcat_covid19/derivatives.py | 132 +++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 154 deletions(-) delete mode 100755 cord19_fatcat_derivatives.py create mode 100644 fatcat_covid19/derivatives.py diff --git a/cord19_fatcat_derivatives.py b/cord19_fatcat_derivatives.py deleted file mode 100755 index 8b5b679..0000000 --- a/cord19_fatcat_derivatives.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 - -""" -Takes *enriched* JSON objects which include a fatcat_release key/entity, and -populate fulltext content and metadata. - -This script *only* looks for existing local files. - -Keys added: - -- fulltext_status: whether we could fetch or not (always added) -- fulltext_file: fatcat file entity, plus - - pdf_path - - pdftotext_path (if exists) - - thumbnail_path (if exists) - - grobid_xml_path (if exists) - - grobid_json_path (if exists) -- fulltext_grobid: grobid2json format, including: - - title - - authors - - journal - - abstract - - body - - acknowledgement - - annex - - language_code - - glutton_fatcat_release (renamed from fatcat_release) -- fulltext_pdftotext: only if fulltext_grobid not set - - body - -TODO: refactor into fatcat_covid19 module and CLI wrapper -""" - -import sys -import json -import argparse -import datetime - -from fatcat_covid19.common import * - - -def do_line(row, args): - - if 'fulltext_file' in row: - return row - if not 'fatcat_release' in row: - row['fulltext_status'] = 'no-release' - return row - if not row['fatcat_release'].get('files'): - row['fulltext_status'] = 'no-file' - return row - fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=args.base_dir) - if not fulltext_file: - row['fulltext_status'] 
= 'no-local-file' - return row - else: - row['fulltext_status'] = 'found' - - # ok, we have file, now populate derivatives etc - fulltext_file['pdf_path'] = blob_path( - fulltext_file['sha1'], - directory="pdf/", - file_suffix=".pdf", - base_dir=args.base_dir, - ) - fulltext_file['pdftotext_path'] = blob_path( - fulltext_file['sha1'], - directory="pdftotext/", - file_suffix=".txt", - base_dir=args.base_dir, - ) - fulltext_file['thumbnail_path'] = blob_path( - fulltext_file['sha1'], - directory="thumbnail/", - file_suffix=".png", - base_dir=args.base_dir, - ) - fulltext_file['grobid_xml_path'] = blob_path( - fulltext_file['sha1'], - directory="grobid/", - file_suffix=".xml", - base_dir=args.base_dir, - ) - fulltext_file['grobid_json_path'] = blob_path( - fulltext_file['sha1'], - directory="grobid/", - file_suffix=".json", - base_dir=args.base_dir, - ) - - # check if derivatives actually exist - for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path', - 'grobid_json_path'): - if not os.path.isfile(fulltext_file[key]): - fulltext_file[key] = None - - row['fulltext_file'] = fulltext_file - - # if there is no GROBID, try pdftotext - if not fulltext_file['grobid_json_path']: - - if fulltext_file['pdftotext_path']: - try: - with open(fulltext_file['pdftotext_path'], 'r') as f: - row['fulltext_pdftotext'] = dict(body=f.read()) - except UnicodeDecodeError: - row['fulltext_status'] = 'bad-unicode-pdftotext' - return row - row['fulltext_status'] = 'success-pdftotext' - return row - else: - row['fulltext_status'] = 'no-extraction' - return row - - with open(fulltext_file['grobid_json_path'], 'r') as f: - grobid = json.loads(f.read()) - - gfr = grobid.pop('fatcat_release', None) - if gfr: - grobid['glutton_fatcat_release'] = gfr - row['fulltext_grobid'] = grobid - row['fulltext_status'] = 'success-grobid' - return row - -def run(args): - for l in args.json_file: - l = json.loads(l) - result = do_line(l, args) - if result: - print(json.dumps(result, sort_keys=True)) - 
-def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="enriched (with fatcat_release) metadata file", - type=argparse.FileType('r')) - parser.add_argument('--base-dir', - help="directory to look for files (in 'pdf' subdirectory)", - default="fulltext_web") - subparsers = parser.add_subparsers() - - args = parser.parse_args() - args.session = requests_retry_session() - - run(args) - -if __name__ == '__main__': - main() - diff --git a/covid19_tool.py b/covid19_tool.py index 7a565b8..23a2c6c 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -37,7 +37,7 @@ def main(): help="listen on this port") sub_enrich = subparsers.add_parser('enrich', - help="enrich CORD-19 dataset (JSON) with fatcat metadata (prints to stdout)") + help="enrich CORD-19 dataset (JSON) with fatcat metadata") sub_enrich.set_defaults( action='enrich', ) @@ -45,13 +45,26 @@ def main(): help="CORD-19 parsed JSON file", type=argparse.FileType('r')) + sub_derivatives = subparsers.add_parser('derivatives', + help="enrich JSON rows with existing derivative files") + sub_derivatives.add_argument('json_file', + help="enriched (with fatcat_release) metadata file", + type=argparse.FileType('r')) + sub_derivatives.add_argument('--json-output', + help="file to write to", + type=argparse.FileType('w'), + default=sys.stdout) + sub_derivatives.add_argument('--base-dir', + help="directory to look for files (in 'pdf' subdirectory)", + default="fulltext_web") + args = parser.parse_args() if args.action == 'webface': app.run(debug=args.debug, host=args.host, port=args.port) - if args.action == 'enrich': - # TODO - pass + if args.action == 'derivatives': + enrich_derivatives_file(args.json_file, args.json_output, + args.base_dir) else: print("tell me what to do!") sys.exit(-1) diff --git a/fatcat_covid19/derivatives.py b/fatcat_covid19/derivatives.py new file mode 100644 index 0000000..5ade0ef --- /dev/null +++ 
b/fatcat_covid19/derivatives.py @@ -0,0 +1,132 @@ + +import sys +import json +import argparse +import datetime + +from fatcat_covid19.common import * + + +def enrich_derivatives_row(row, base_dir): + """ + Takes *enriched* JSON objects which include a fatcat_release key/entity, and + populate fulltext content and metadata. + + This script *only* looks for existing local files. + + Keys added: + + - fulltext_status: whether we could fetch or not (always added) + - fulltext_file: fatcat file entity, plus + - pdf_path + - pdftotext_path (if exists) + - thumbnail_path (if exists) + - grobid_xml_path (if exists) + - grobid_json_path (if exists) + - fulltext_grobid: grobid2json format, including: + - title + - authors + - journal + - abstract + - body + - acknowledgement + - annex + - language_code + - glutton_fatcat_release (renamed from fatcat_release) + - fulltext_pdftotext: only if fulltext_grobid not set + - body + """ + + if 'fulltext_file' in row: + return row + if not 'fatcat_release' in row: + row['fulltext_status'] = 'no-release' + return row + if not row['fatcat_release'].get('files'): + row['fulltext_status'] = 'no-file' + return row + fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir) + if not fulltext_file: + row['fulltext_status'] = 'no-local-file' + return row + else: + row['fulltext_status'] = 'found' + + # ok, we have file, now populate derivatives etc + fulltext_file['pdf_path'] = blob_path( + fulltext_file['sha1'], + directory="pdf/", + file_suffix=".pdf", + base_dir=base_dir, + ) + fulltext_file['pdftotext_path'] = blob_path( + fulltext_file['sha1'], + directory="pdftotext/", + file_suffix=".txt", + base_dir=base_dir, + ) + fulltext_file['thumbnail_path'] = blob_path( + fulltext_file['sha1'], + directory="thumbnail/", + file_suffix=".png", + base_dir=base_dir, + ) + fulltext_file['grobid_xml_path'] = blob_path( + fulltext_file['sha1'], + directory="grobid/", + file_suffix=".xml", + base_dir=base_dir, + ) + 
fulltext_file['grobid_json_path'] = blob_path( + fulltext_file['sha1'], + directory="grobid/", + file_suffix=".json", + base_dir=base_dir, + ) + + # check if derivatives actually exist + for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path', + 'grobid_json_path'): + if not os.path.isfile(fulltext_file[key]): + fulltext_file[key] = None + + row['fulltext_file'] = fulltext_file + + # if there is no GROBID, try pdftotext + if not fulltext_file['grobid_json_path']: + + if fulltext_file['pdftotext_path']: + try: + with open(fulltext_file['pdftotext_path'], 'r') as f: + row['fulltext_pdftotext'] = dict(body=f.read()) + except UnicodeDecodeError: + row['fulltext_status'] = 'bad-unicode-pdftotext' + return row + row['fulltext_status'] = 'success-pdftotext' + return row + else: + row['fulltext_status'] = 'no-extraction' + return row + + with open(fulltext_file['grobid_json_path'], 'r') as f: + grobid = json.loads(f.read()) + + gfr = grobid.pop('fatcat_release', None) + if gfr: + grobid['glutton_fatcat_release'] = gfr + row['fulltext_grobid'] = grobid + row['fulltext_status'] = 'success-grobid' + return row + +def enrich_derivatives_file(json_input, json_output, base_dir): + """ + Reads lines from json_input (an open, readable file or similar), looks for + existing derivative files in base_dir (a path str), and writes string JSON + lines to json_output (an open, writable file or similar). + """ + for l in json_input: + l = json.loads(l) + result = enrich_derivatives_row(l, base_dir) + if result: + print(json.dumps(result, sort_keys=True), file=json_output) + -- cgit v1.2.3