import os
import sys
import json
import argparse
import datetime

from fatcat_covid19.common import *


def enrich_derivatives_row(row, base_dir):
    """
    Takes *enriched* JSON objects which include a fatcat_release key/entity,
    and populates fulltext content and metadata. This script *only* looks for
    existing local files.

    Keys added:

    - fulltext_status: whether we could fetch or not (always added)
    - fulltext_file: fatcat file entity, plus
        - pdf_path
        - pdftotext_path (if exists)
        - thumbnail_path (if exists)
        - grobid_xml_path (if exists)
        - grobid_json_path (if exists)
    - fulltext_grobid: grobid2json format, including:
        - title
        - authors
        - journal
        - abstract
        - body
        - acknowledgement
        - annex
        - language_code
        - glutton_fatcat_release (renamed from fatcat_release)
    - fulltext_pdftotext: only if fulltext_grobid not set
        - body
    """
    if 'fulltext_file' in row:
        return row
    if 'fatcat_release' not in row:
        row['fulltext_status'] = 'no-release'
        return row
    if not row['fatcat_release'].get('files'):
        row['fulltext_status'] = 'no-file'
        return row
    fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir)
    if not fulltext_file:
        row['fulltext_status'] = 'no-local-file'
        return row
    else:
        row['fulltext_status'] = 'found'

    # ok, we have a file; now populate derivative paths
    fulltext_file['pdf_path'] = blob_path(
        fulltext_file['sha1'],
        directory="pdf/",
        file_suffix=".pdf",
        base_dir=base_dir,
    )
    fulltext_file['pdftotext_path'] = blob_path(
        fulltext_file['sha1'],
        directory="pdftotext/",
        file_suffix=".txt",
        base_dir=base_dir,
    )
    fulltext_file['thumbnail_path'] = blob_path(
        fulltext_file['sha1'],
        directory="thumbnail/",
        file_suffix=".png",
        base_dir=base_dir,
    )
    fulltext_file['grobid_xml_path'] = blob_path(
        fulltext_file['sha1'],
        directory="grobid/",
        file_suffix=".xml",
        base_dir=base_dir,
    )
    fulltext_file['grobid_json_path'] = blob_path(
        fulltext_file['sha1'],
        directory="grobid/",
        file_suffix=".json",
        base_dir=base_dir,
    )

    # check if derivatives actually exist
    for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path',
                'grobid_json_path'):
        if not os.path.isfile(fulltext_file[key]):
            fulltext_file[key] = None

    row['fulltext_file'] = fulltext_file

    # if there is no GROBID, try pdftotext
    if not fulltext_file['grobid_json_path']:
        if fulltext_file['pdftotext_path']:
            try:
                with open(fulltext_file['pdftotext_path'], 'r') as f:
                    row['fulltext_pdftotext'] = dict(body=f.read())
            except UnicodeDecodeError:
                row['fulltext_status'] = 'bad-unicode-pdftotext'
                return row
            row['fulltext_status'] = 'success-pdftotext'
            return row
        else:
            row['fulltext_status'] = 'no-extraction'
            return row

    with open(fulltext_file['grobid_json_path'], 'r') as f:
        grobid = json.loads(f.read())

    gfr = grobid.pop('fatcat_release', None)
    if gfr:
        grobid['glutton_fatcat_release'] = gfr
    row['fulltext_grobid'] = grobid
    row['fulltext_status'] = 'success-grobid'
    return row


def enrich_derivatives_file(json_input, json_output, base_dir):
    """
    Reads lines from json_input (an open, readable file or similar), looks for
    existing derivative files in base_dir (a path str), and writes string JSON
    lines to json_output (an open, writable file or similar).
    """
    for l in json_input:
        l = json.loads(l)
        result = enrich_derivatives_row(l, base_dir)
        if result:
            print(json.dumps(result, sort_keys=True), file=json_output)
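

# Example entrypoint sketch. This is *not* the project's actual CLI (argument
# parsing for this repo is not shown in this module); the flag names, the
# default base_dir, and the __main__ wiring below are assumptions for
# illustration of how enrich_derivatives_file() can be driven.
def _example_main():
    parser = argparse.ArgumentParser(
        description="Enrich JSON lines with locally available fulltext derivatives")
    parser.add_argument('--base-dir', default='fulltext_web/',
        help="directory holding pdf/, pdftotext/, thumbnail/, grobid/ blobs (assumed layout)")
    parser.add_argument('json_file', nargs='?', default='-',
        help="input JSON lines file, or '-' to read from stdin")
    args = parser.parse_args()
    # read from stdin or the named file, write enriched JSON lines to stdout
    json_input = sys.stdin if args.json_file == '-' else open(args.json_file, 'r')
    enrich_derivatives_file(json_input, sys.stdout, base_dir=args.base_dir)


if __name__ == '__main__':
    _example_main()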