From 913a3d7b2fe585602124e2092dd44afa760086e8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Apr 2020 15:01:19 -0700 Subject: refactor derivatives into CLI tool --- fatcat_covid19/derivatives.py | 132 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 fatcat_covid19/derivatives.py (limited to 'fatcat_covid19') diff --git a/fatcat_covid19/derivatives.py b/fatcat_covid19/derivatives.py new file mode 100644 index 0000000..5ade0ef --- /dev/null +++ b/fatcat_covid19/derivatives.py @@ -0,0 +1,132 @@ + +import sys +import json +import argparse +import datetime + +from fatcat_covid19.common import * + + +def enrich_derivatives_row(row, base_dir): + """ + Takes *enriched* JSON objects which include a fatcat_release key/entity, and + populate fulltext content and metadata. + + This script *only* looks for existing local files. + + Keys added: + + - fulltext_status: whether we could fetch or not (always added) + - fulltext_file: fatcat file entity, plus + - pdf_path + - pdftotext_path (if exists) + - thumbnail_path (if exists) + - grobid_xml_path (if exists) + - grobid_json_path (if exists) + - fulltext_grobid: grobid2json format, including: + - title + - authors + - journal + - abstract + - body + - acknowledgement + - annex + - language_code + - glutton_fatcat_release (renamed from fatcat_release) + - fulltext_pdftotext: only if fulltext_grobid not set + - body + """ + + if 'fulltext_file' in row: + return row + if not 'fatcat_release' in row: + row['fulltext_status'] = 'no-release' + return row + if not row['fatcat_release'].get('files'): + row['fulltext_status'] = 'no-file' + return row + fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir) + if not fulltext_file: + row['fulltext_status'] = 'no-local-file' + return row + else: + row['fulltext_status'] = 'found' + + # ok, we have file, now populate derivatives etc + fulltext_file['pdf_path'] = blob_path( + fulltext_file['sha1'], + directory="pdf/", + file_suffix=".pdf", + base_dir=base_dir, + ) + fulltext_file['pdftotext_path'] = blob_path( + fulltext_file['sha1'], + directory="pdftotext/", + file_suffix=".txt", + base_dir=base_dir, + ) + fulltext_file['thumbnail_path'] = blob_path( + fulltext_file['sha1'], + directory="thumbnail/", + file_suffix=".png", + base_dir=base_dir, + ) + fulltext_file['grobid_xml_path'] = blob_path( + fulltext_file['sha1'], + directory="grobid/", + file_suffix=".xml", + base_dir=base_dir, + ) + fulltext_file['grobid_json_path'] = blob_path( + fulltext_file['sha1'], + directory="grobid/", + file_suffix=".json", + base_dir=base_dir, + ) + + # check if derivatives actually exist + for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path', + 'grobid_json_path'): + if not os.path.isfile(fulltext_file[key]): + fulltext_file[key] = None + + row['fulltext_file'] = fulltext_file + + # if there is no GROBID, try pdftotext + if not fulltext_file['grobid_json_path']: + + if fulltext_file['pdftotext_path']: + try: + with open(fulltext_file['pdftotext_path'], 'r') as f: + row['fulltext_pdftotext'] = dict(body=f.read()) + except UnicodeDecodeError: + row['fulltext_status'] = 'bad-unicode-pdftotext' + return row + row['fulltext_status'] = 'success-pdftotext' + return row + else: + row['fulltext_status'] = 'no-extraction' + return row + + with open(fulltext_file['grobid_json_path'], 'r') as f: + grobid = json.loads(f.read()) + + gfr = grobid.pop('fatcat_release', None) + if gfr: + grobid['glutton_fatcat_release'] = gfr + row['fulltext_grobid'] = grobid + row['fulltext_status'] = 'success-grobid' + return row + +def enrich_derivatives_file(json_input, json_output, base_dir): + """ + Reads lines from json_input (an open, readable file or similar), looks for + existing derivative files in base_dir (a path str), and writes string JSON + lines to json_output (an open, writable file or similar). + """ + for l in json_input: + l = json.loads(l) + result = do_line(l, base_dir) + if result: + print(json.dumps(result, sort_keys=True), file=json_output) + -- cgit v1.2.3