diff options
Diffstat (limited to 'cord19_fatcat_derivatives.py')
-rwxr-xr-x | cord19_fatcat_derivatives.py | 148 |
1 files changed, 148 insertions, 0 deletions
#!/usr/bin/env python3

"""
Takes *enriched* JSON objects which include a fatcat_release key/entity, and
populates fulltext content and metadata.

This script *only* looks for existing local files.

Keys added:

- fulltext_status: whether we could fetch or not (always added, unless the
  row already carries a fulltext_file key, in which case it is left untouched)
- fulltext_file: fatcat file entity, plus
    - pdf_path
    - pdftotext_path (if exists)
    - thumbnail_path (if exists)
    - grobid_xml_path (if exists)
    - grobid_json_path (if exists)
- fulltext_grobid: grobid2json format, including:
    - title
    - authors
    - journal
    - abstract
    - body
    - acknowledgement
    - annex
    - language_code
    - glutton_fatcat_release (renamed from fatcat_release)
- fulltext_pdftotext: only if fulltext_grobid not set
    - body
"""

import argparse
import datetime
import json
import os
import sys

# NOTE(review): blob_path, find_local_file and requests_retry_session are not
# defined in this file; they are presumably provided by this star import.
from fatcat_covid19.common import *


# (key stored on the file entity, blob sub-directory, file suffix) for every
# derivative that may or may not exist next to the PDF.
_DERIVATIVES = (
    ('pdftotext_path', "pdftotext/", ".txt"),
    ('thumbnail_path', "thumbnail/", ".png"),
    ('grobid_xml_path', "grobid/", ".xml"),
    ('grobid_json_path', "grobid/", ".json"),
)


def _populate_paths(fulltext_file, base_dir):
    """Set pdf_path and the derivative paths on fulltext_file, in place.

    Derivative paths that do not point at an existing regular file are stored
    as None. pdf_path is stored without an existence check.
    """
    sha1 = fulltext_file['sha1']
    fulltext_file['pdf_path'] = blob_path(
        sha1,
        directory="pdf/",
        file_suffix=".pdf",
        base_dir=base_dir,
    )
    for key, directory, file_suffix in _DERIVATIVES:
        path = blob_path(
            sha1,
            directory=directory,
            file_suffix=file_suffix,
            base_dir=base_dir,
        )
        fulltext_file[key] = path if os.path.isfile(path) else None


def do_line(row, args):
    """Attach local fulltext file info and extracted text to one row.

    Args:
        row: dict decoded from one input JSON line; mutated in place.
        args: namespace with a base_dir attribute (directory to search).

    Returns:
        The same row dict. Unless the row already had a 'fulltext_file' key,
        'fulltext_status' is set to one of: 'no-release', 'no-file',
        'no-local-file', 'no-extraction', 'bad-unicode-pdftotext',
        'success-pdftotext' or 'success-grobid'.
    """
    # already processed: leave untouched
    if 'fulltext_file' in row:
        return row

    # a null fatcat_release is treated like a missing one instead of crashing
    release = row.get('fatcat_release')
    if release is None:
        row['fulltext_status'] = 'no-release'
        return row
    if not release.get('files'):
        row['fulltext_status'] = 'no-file'
        return row

    fulltext_file = find_local_file(release['files'], base_dir=args.base_dir)
    if not fulltext_file:
        row['fulltext_status'] = 'no-local-file'
        return row
    row['fulltext_status'] = 'found'

    # ok, we have file, now populate derivatives etc
    _populate_paths(fulltext_file, args.base_dir)
    row['fulltext_file'] = fulltext_file

    # prefer GROBID output when it exists
    if fulltext_file['grobid_json_path']:
        # JSON is read as UTF-8 explicitly rather than with the locale default
        with open(fulltext_file['grobid_json_path'], 'r', encoding='utf-8') as f:
            grobid = json.load(f)
        # avoid confusion with the row-level fatcat_release key
        gfr = grobid.pop('fatcat_release', None)
        if gfr:
            grobid['glutton_fatcat_release'] = gfr
        row['fulltext_grobid'] = grobid
        row['fulltext_status'] = 'success-grobid'
        return row

    # if there is no GROBID, try pdftotext
    if not fulltext_file['pdftotext_path']:
        row['fulltext_status'] = 'no-extraction'
        return row
    try:
        # NOTE(review): assumes the pdftotext output is UTF-8 -- confirm.
        # Decoding no longer depends on the process locale.
        with open(fulltext_file['pdftotext_path'], 'r', encoding='utf-8') as f:
            row['fulltext_pdftotext'] = dict(body=f.read())
    except UnicodeDecodeError:
        row['fulltext_status'] = 'bad-unicode-pdftotext'
        return row
    row['fulltext_status'] = 'success-pdftotext'
    return row


def run(args):
    """Process every JSON line of args.json_file and print each result row."""
    for line in args.json_file:
        # tolerate blank lines (e.g. a trailing newline at end of file)
        if not line.strip():
            continue
        result = do_line(json.loads(line), args)
        if result:
            print(json.dumps(result, sort_keys=True))


def main():
    """Parse command-line arguments and run the enrichment over the input."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="enriched (with fatcat_release) metadata file",
        type=argparse.FileType('r'))
    parser.add_argument('--base-dir',
        help="directory to look for files (in 'pdf' subdirectory)",
        default="fulltext_web")

    args = parser.parse_args()
    # NOTE(review): nothing in this script reads args.session; kept in case a
    # helper elsewhere expects it -- confirm before removing.
    args.session = requests_retry_session()

    run(args)


if __name__ == '__main__':
    main()