aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-04-03 15:01:19 -0700
committerBryan Newbold <bnewbold@archive.org>2020-04-03 15:01:19 -0700
commit913a3d7b2fe585602124e2092dd44afa760086e8 (patch)
treed754c7a70a42a400c21d2ea4f7fcd8703e032cad
parent9598a4c14800f8ec2543b26872565b1c3b9d2677 (diff)
downloadfatcat-covid19-913a3d7b2fe585602124e2092dd44afa760086e8.tar.gz
fatcat-covid19-913a3d7b2fe585602124e2092dd44afa760086e8.zip
refactor derivatives into CLI tool
-rwxr-xr-xcovid19_tool.py21
-rw-r--r--[-rwxr-xr-x]fatcat_covid19/derivatives.py (renamed from cord19_fatcat_derivatives.py)106
2 files changed, 61 insertions, 66 deletions
diff --git a/covid19_tool.py b/covid19_tool.py
index 7a565b8..23a2c6c 100755
--- a/covid19_tool.py
+++ b/covid19_tool.py
@@ -37,7 +37,7 @@ def main():
help="listen on this port")
sub_enrich = subparsers.add_parser('enrich',
- help="enrich CORD-19 dataset (JSON) with fatcat metadata (prints to stdout)")
+ help="enrich CORD-19 dataset (JSON) with fatcat metadata")
sub_enrich.set_defaults(
action='enrich',
)
@@ -45,13 +45,26 @@ def main():
help="CORD-19 parsed JSON file",
type=argparse.FileType('r'))
+ sub_derivatives = subparsers.add_parser('derivatives',
+ help="enrich JSON rows with existing derivative files")
+ sub_derivatives.add_argument('json_file',
+ help="enriched (with fatcat_release) metadata file",
+ type=argparse.FileType('r'))
+ sub_derivatives.add_argument('--json-output',
+ help="file to write JSON output to (default: stdout)",
+ type=argparse.FileType('w'),
+ default=sys.stdout)
+ sub_derivatives.add_argument('--base-dir',
+ help="directory to look for files (in 'pdf' subdirectory)",
+ default="fulltext_web")
+
args = parser.parse_args()
if args.action == 'webface':
app.run(debug=args.debug, host=args.host, port=args.port)
- if args.action == 'enrich':
- # TODO
- pass
+ if args.action == 'derivatives':
+ enrich_derivatives_file(args.json_file, args.json_output,
+ args.base_dir)
else:
print("tell me what to do!")
sys.exit(-1)
diff --git a/cord19_fatcat_derivatives.py b/fatcat_covid19/derivatives.py
index 8b5b679..5ade0ef 100755..100644
--- a/cord19_fatcat_derivatives.py
+++ b/fatcat_covid19/derivatives.py
@@ -1,35 +1,3 @@
-#!/usr/bin/env python3
-
-"""
-Takes *enriched* JSON objects which include a fatcat_release key/entity, and
-populate fulltext content and metadata.
-
-This script *only* looks for existing local files.
-
-Keys added:
-
-- fulltext_status: whether we could fetch or not (always added)
-- fulltext_file: fatcat file entity, plus
- - pdf_path
- - pdftotext_path (if exists)
- - thumbnail_path (if exists)
- - grobid_xml_path (if exists)
- - grobid_json_path (if exists)
-- fulltext_grobid: grobid2json format, including:
- - title
- - authors
- - journal
- - abstract
- - body
- - acknowledgement
- - annex
- - language_code
- - glutton_fatcat_release (renamed from fatcat_release)
-- fulltext_pdftotext: only if fulltext_grobid not set
- - body
-
-TODO: refactor into fatcat_covid19 module and CLI wrapper
-"""
import sys
import json
@@ -39,7 +7,35 @@ import datetime
from fatcat_covid19.common import *
-def do_line(row, args):
+def enrich_derivatives_row(row, base_dir):
+ """
+ Takes *enriched* JSON objects which include a fatcat_release key/entity, and
+ populate fulltext content and metadata.
+
+ This script *only* looks for existing local files.
+
+ Keys added:
+
+ - fulltext_status: whether we could fetch or not (always added)
+ - fulltext_file: fatcat file entity, plus
+ - pdf_path
+ - pdftotext_path (if exists)
+ - thumbnail_path (if exists)
+ - grobid_xml_path (if exists)
+ - grobid_json_path (if exists)
+ - fulltext_grobid: grobid2json format, including:
+ - title
+ - authors
+ - journal
+ - abstract
+ - body
+ - acknowledgement
+ - annex
+ - language_code
+ - glutton_fatcat_release (renamed from fatcat_release)
+ - fulltext_pdftotext: only if fulltext_grobid not set
+ - body
+ """
if 'fulltext_file' in row:
return row
@@ -49,7 +45,7 @@ def do_line(row, args):
if not row['fatcat_release'].get('files'):
row['fulltext_status'] = 'no-file'
return row
- fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=args.base_dir)
+ fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir)
if not fulltext_file:
row['fulltext_status'] = 'no-local-file'
return row
@@ -61,31 +57,31 @@ def do_line(row, args):
fulltext_file['sha1'],
directory="pdf/",
file_suffix=".pdf",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
fulltext_file['pdftotext_path'] = blob_path(
fulltext_file['sha1'],
directory="pdftotext/",
file_suffix=".txt",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
fulltext_file['thumbnail_path'] = blob_path(
fulltext_file['sha1'],
directory="thumbnail/",
file_suffix=".png",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
fulltext_file['grobid_xml_path'] = blob_path(
fulltext_file['sha1'],
directory="grobid/",
file_suffix=".xml",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
fulltext_file['grobid_json_path'] = blob_path(
fulltext_file['sha1'],
directory="grobid/",
file_suffix=".json",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
# check if derivatives actually exist
@@ -122,29 +118,15 @@ def do_line(row, args):
row['fulltext_status'] = 'success-grobid'
return row
-def run(args):
- for l in args.json_file:
+def enrich_derivatives_file(json_input, json_output, base_dir):
+ """
+ Reads lines from json_input (an open, readable file or similar), looks for
+ existing derivative files in base_dir (a path str), and writes string JSON
+ lines to json_output (an open, writable file or similar).
+ """
+ for l in json_input:
l = json.loads(l)
- result = do_line(l, args)
+ result = enrich_derivatives_row(l, base_dir)
if result:
- print(json.dumps(result, sort_keys=True))
-
-def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="enriched (with fatcat_release) metadata file",
- type=argparse.FileType('r'))
- parser.add_argument('--base-dir',
- help="directory to look for files (in 'pdf' subdirectory)",
- default="fulltext_web")
- subparsers = parser.add_subparsers()
-
- args = parser.parse_args()
- args.session = requests_retry_session()
-
- run(args)
-
-if __name__ == '__main__':
- main()
+ print(json.dumps(result, sort_keys=True), file=json_output)