aboutsummaryrefslogtreecommitdiffstats
path: root/cord19_fatcat_derivatives.py
diff options
context:
space:
mode:
Diffstat (limited to 'cord19_fatcat_derivatives.py')
-rwxr-xr-xcord19_fatcat_derivatives.py148
1 files changed, 148 insertions, 0 deletions
diff --git a/cord19_fatcat_derivatives.py b/cord19_fatcat_derivatives.py
new file mode 100755
index 0000000..aa0382b
--- /dev/null
+++ b/cord19_fatcat_derivatives.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+
+"""
+Takes *enriched* JSON objects which include a fatcat_release key/entity, and
+populates fulltext content and metadata.
+
+This script *only* looks for existing local files.
+
+Keys added:
+
+- fulltext_status: whether we could fetch or not (always added)
+- fulltext_file: fatcat file entity, plus
+ - pdf_path
+ - pdftotext_path (if exists)
+ - thumbnail_path (if exists)
+ - grobid_xml_path (if exists)
+ - grobid_json_path (if exists)
+- fulltext_grobid: grobid2json format, including:
+ - title
+ - authors
+ - journal
+ - abstract
+ - body
+ - acknowledgement
+ - annex
+ - language_code
+ - glutton_fatcat_release (renamed from fatcat_release)
+- fulltext_pdftotext: only if fulltext_grobid not set
+ - body
+"""
+
import argparse
import datetime
import json
import os
import sys

from fatcat_covid19.common import *
+
+
def do_line(row, args):
    """Populate fulltext derivative metadata for one enriched JSON row.

    Expects `row` to (optionally) contain a 'fatcat_release' entity with a
    'files' list. Looks for a locally mirrored PDF plus its derivatives
    (pdftotext, thumbnail, GROBID XML/JSON) under args.base_dir, and sets:

    - row['fulltext_status']: always set ('no-release', 'no-file',
      'no-local-file', 'bad-unicode-pdftotext', 'no-extraction',
      'success-pdftotext', or 'success-grobid')
    - row['fulltext_file']: file entity with *_path keys (missing
      derivatives are set to None)
    - row['fulltext_grobid'] or row['fulltext_pdftotext']: parsed fulltext

    Returns the (mutated) row; rows already carrying 'fulltext_file' pass
    through untouched so the script is safe to re-run over its own output.
    """
    if 'fulltext_file' in row:
        # already enriched on a previous run; idempotent pass-through
        return row
    if 'fatcat_release' not in row:
        row['fulltext_status'] = 'no-release'
        return row
    if not row['fatcat_release'].get('files'):
        row['fulltext_status'] = 'no-file'
        return row
    fulltext_file = find_local_file(row['fatcat_release']['files'],
                                    base_dir=args.base_dir)
    if not fulltext_file:
        row['fulltext_status'] = 'no-local-file'
        return row
    row['fulltext_status'] = 'found'

    # Compute candidate derivative paths, all keyed off the PDF's sha1.
    # (key, sub-directory, file suffix)
    derivative_specs = (
        ('pdf_path', 'pdf/', '.pdf'),
        ('pdftotext_path', 'pdftotext/', '.txt'),
        ('thumbnail_path', 'thumbnail/', '.png'),
        ('grobid_xml_path', 'grobid/', '.xml'),
        ('grobid_json_path', 'grobid/', '.json'),
    )
    for key, directory, suffix in derivative_specs:
        fulltext_file[key] = blob_path(
            fulltext_file['sha1'],
            directory=directory,
            file_suffix=suffix,
            base_dir=args.base_dir,
        )

    # Null out derivative paths that don't actually exist on disk; the PDF
    # itself was already located by find_local_file above.
    for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path',
                'grobid_json_path'):
        if not os.path.isfile(fulltext_file[key]):
            fulltext_file[key] = None

    row['fulltext_file'] = fulltext_file

    # Prefer GROBID structured extraction; fall back to raw pdftotext body.
    if not fulltext_file['grobid_json_path']:
        if not fulltext_file['pdftotext_path']:
            row['fulltext_status'] = 'no-extraction'
            return row
        try:
            with open(fulltext_file['pdftotext_path'], 'r') as f:
                row['fulltext_pdftotext'] = dict(body=f.read())
        except UnicodeDecodeError:
            row['fulltext_status'] = 'bad-unicode-pdftotext'
            return row
        row['fulltext_status'] = 'success-pdftotext'
        return row

    with open(fulltext_file['grobid_json_path'], 'r') as f:
        grobid = json.load(f)

    # Rename to disambiguate GROBID/glutton's matched release from the
    # top-level fatcat_release on this row.
    gfr = grobid.pop('fatcat_release', None)
    if gfr:
        grobid['glutton_fatcat_release'] = gfr
    row['fulltext_grobid'] = grobid
    row['fulltext_status'] = 'success-grobid'
    return row
+
def run(args):
    """Enrich every JSON line from args.json_file and print the result.

    Input is line-delimited JSON; blank lines (e.g. a trailing newline)
    are skipped instead of crashing json.loads. Output is one sorted-key
    JSON object per line on stdout.
    """
    for line in args.json_file:
        if not line.strip():
            # tolerate empty lines in line-delimited JSON input
            continue
        result = do_line(json.loads(line), args)
        if result:
            print(json.dumps(result, sort_keys=True))
+
def main():
    """Parse command-line arguments and run the enrichment loop.

    Note: this script only inspects local files (see module docstring), so
    no HTTP session is created; the previously unused add_subparsers()
    call and requests_retry_session() assignment were dead code.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="enriched (with fatcat_release) metadata file",
        type=argparse.FileType('r'))
    parser.add_argument('--base-dir',
        help="directory to look for files (in 'pdf' subdirectory)",
        default="fulltext_web")

    args = parser.parse_args()

    run(args)
+
# Standard script entry guard: only run when executed directly, not on import.
if __name__ == '__main__':
    main()
+