aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_covid19
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-04-03 15:01:19 -0700
committerBryan Newbold <bnewbold@archive.org>2020-04-03 15:01:19 -0700
commit913a3d7b2fe585602124e2092dd44afa760086e8 (patch)
treed754c7a70a42a400c21d2ea4f7fcd8703e032cad /fatcat_covid19
parent9598a4c14800f8ec2543b26872565b1c3b9d2677 (diff)
downloadfatcat-covid19-913a3d7b2fe585602124e2092dd44afa760086e8.tar.gz
fatcat-covid19-913a3d7b2fe585602124e2092dd44afa760086e8.zip
refactor derivatives into CLI tool
Diffstat (limited to 'fatcat_covid19')
-rw-r--r--fatcat_covid19/derivatives.py132
1 files changed, 132 insertions, 0 deletions
diff --git a/fatcat_covid19/derivatives.py b/fatcat_covid19/derivatives.py
new file mode 100644
index 0000000..5ade0ef
--- /dev/null
+++ b/fatcat_covid19/derivatives.py
@@ -0,0 +1,132 @@
+
+import sys
+import json
+import argparse
+import datetime
+
+from fatcat_covid19.common import *
+
+
+def enrich_derivatives_row(row, base_dir):
+ """
+ Takes *enriched* JSON objects which include a fatcat_release key/entity, and
+ populate fulltext content and metadata.
+
+ This script *only* looks for existing local files.
+
+ Keys added:
+
+ - fulltext_status: whether we could fetch or not (always added)
+ - fulltext_file: fatcat file entity, plus
+ - pdf_path
+ - pdftotext_path (if exists)
+ - thumbnail_path (if exists)
+ - grobid_xml_path (if exists)
+ - grobid_json_path (if exists)
+ - fulltext_grobid: grobid2json format, including:
+ - title
+ - authors
+ - journal
+ - abstract
+ - body
+ - acknowledgement
+ - annex
+ - language_code
+ - glutton_fatcat_release (renamed from fatcat_release)
+ - fulltext_pdftotext: only if fulltext_grobid not set
+ - body
+ """
+
+ if 'fulltext_file' in row:
+ return row
+ if not 'fatcat_release' in row:
+ row['fulltext_status'] = 'no-release'
+ return row
+ if not row['fatcat_release'].get('files'):
+ row['fulltext_status'] = 'no-file'
+ return row
+ fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir)
+ if not fulltext_file:
+ row['fulltext_status'] = 'no-local-file'
+ return row
+ else:
+ row['fulltext_status'] = 'found'
+
+ # ok, we have file, now populate derivatives etc
+ fulltext_file['pdf_path'] = blob_path(
+ fulltext_file['sha1'],
+ directory="pdf/",
+ file_suffix=".pdf",
+ base_dir=base_dir,
+ )
+ fulltext_file['pdftotext_path'] = blob_path(
+ fulltext_file['sha1'],
+ directory="pdftotext/",
+ file_suffix=".txt",
+ base_dir=base_dir,
+ )
+ fulltext_file['thumbnail_path'] = blob_path(
+ fulltext_file['sha1'],
+ directory="thumbnail/",
+ file_suffix=".png",
+ base_dir=base_dir,
+ )
+ fulltext_file['grobid_xml_path'] = blob_path(
+ fulltext_file['sha1'],
+ directory="grobid/",
+ file_suffix=".xml",
+ base_dir=base_dir,
+ )
+ fulltext_file['grobid_json_path'] = blob_path(
+ fulltext_file['sha1'],
+ directory="grobid/",
+ file_suffix=".json",
+ base_dir=base_dir,
+ )
+
+ # check if derivatives actually exist
+ for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path',
+ 'grobid_json_path'):
+ if not os.path.isfile(fulltext_file[key]):
+ fulltext_file[key] = None
+
+ row['fulltext_file'] = fulltext_file
+
+ # if there is no GROBID, try pdftotext
+ if not fulltext_file['grobid_json_path']:
+
+ if fulltext_file['pdftotext_path']:
+ try:
+ with open(fulltext_file['pdftotext_path'], 'r') as f:
+ row['fulltext_pdftotext'] = dict(body=f.read())
+ except UnicodeDecodeError:
+ row['fulltext_status'] = 'bad-unicode-pdftotext'
+ return row
+ row['fulltext_status'] = 'success-pdftotext'
+ return row
+ else:
+ row['fulltext_status'] = 'no-extraction'
+ return row
+
+ with open(fulltext_file['grobid_json_path'], 'r') as f:
+ grobid = json.loads(f.read())
+
+ gfr = grobid.pop('fatcat_release', None)
+ if gfr:
+ grobid['glutton_fatcat_release'] = gfr
+ row['fulltext_grobid'] = grobid
+ row['fulltext_status'] = 'success-grobid'
+ return row
+
+def enrich_derivatives_file(json_input, json_output, base_dir):
+ """
+ Reads lines from json_input (an open, readable file or similar), looks for
+ existing derivative files in base_dir (a path str), and writes string JSON
+ lines to json_output (an open, writable file or similar).
+ """
+ for l in json_input:
+ l = json.loads(l)
+ result = do_line(l, base_dir)
+ if result:
+ print(json.dumps(result, sort_keys=True), file=json_output)
+