aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-04-03 15:01:19 -0700
committerBryan Newbold <bnewbold@archive.org>2020-04-03 15:01:19 -0700
commit913a3d7b2fe585602124e2092dd44afa760086e8 (patch)
treed754c7a70a42a400c21d2ea4f7fcd8703e032cad
parent9598a4c14800f8ec2543b26872565b1c3b9d2677 (diff)
downloadfatcat-covid19-913a3d7b2fe585602124e2092dd44afa760086e8.tar.gz
fatcat-covid19-913a3d7b2fe585602124e2092dd44afa760086e8.zip
refactor derivatives into CLI tool
-rwxr-xr-xcovid19_tool.py21
-rw-r--r--[-rwxr-xr-x]fatcat_covid19/derivatives.py (renamed from cord19_fatcat_derivatives.py)106
2 files changed, 61 insertions, 66 deletions
diff --git a/covid19_tool.py b/covid19_tool.py
index 7a565b8..23a2c6c 100755
--- a/covid19_tool.py
+++ b/covid19_tool.py
@@ -37,7 +37,7 @@ def main():
help="listen on this port")
sub_enrich = subparsers.add_parser('enrich',
- help="enrich CORD-19 dataset (JSON) with fatcat metadata (prints to stdout)")
+ help="enrich CORD-19 dataset (JSON) with fatcat metadata")
sub_enrich.set_defaults(
action='enrich',
)
@@ -45,13 +45,26 @@ def main():
help="CORD-19 parsed JSON file",
type=argparse.FileType('r'))
+ sub_derivatives = subparsers.add_parser('derivatives',
+ help="enrich JSON rows with existing derivative files")
+ sub_derivatives.add_argument('json_file',
+ help="enriched (with fatcat_release) metadata file",
+ type=argparse.FileType('r'))
+ sub_derivatives.add_argument('--json-output',
+ help="file to write JSON output to (default: stdout)",
+ type=argparse.FileType('w'),
+ default=sys.stdout)
+ sub_derivatives.add_argument('--base-dir',
+ help="directory to look for files (in 'pdf' subdirectory)",
+ default="fulltext_web")
+
args = parser.parse_args()
if args.action == 'webface':
app.run(debug=args.debug, host=args.host, port=args.port)
- if args.action == 'enrich':
- # TODO
- pass
+ if args.action == 'derivatives':
+ enrich_derivatives_file(args.json_file, args.json_output,
+ args.base_dir)
else:
print("tell me what to do!")
sys.exit(-1)
diff --git a/cord19_fatcat_derivatives.py b/fatcat_covid19/derivatives.py
index 8b5b679..5ade0ef 100755..100644
--- a/cord19_fatcat_derivatives.py
+++ b/fatcat_covid19/derivatives.py
@@ -1,35 +1,3 @@
-#!/usr/bin/env python3
-
-"""
-Takes *enriched* JSON objects which include a fatcat_release key/entity, and
-populate fulltext content and metadata.
-
-This script *only* looks for existing local files.
-
-Keys added:
-
-- fulltext_status: whether we could fetch or not (always added)
-- fulltext_file: fatcat file entity, plus
- - pdf_path
- - pdftotext_path (if exists)
- - thumbnail_path (if exists)
- - grobid_xml_path (if exists)
- - grobid_json_path (if exists)
-- fulltext_grobid: grobid2json format, including:
- - title
- - authors
- - journal
- - abstract
- - body
- - acknowledgement
- - annex
- - language_code
- - glutton_fatcat_release (renamed from fatcat_release)
-- fulltext_pdftotext: only if fulltext_grobid not set
- - body
-
-TODO: refactor into fatcat_covid19 module and CLI wrapper
-"""
import sys
import json
@@ -39,7 +7,35 @@ import datetime
from fatcat_covid19.common import *
-def do_line(row, args):
+def enrich_derivatives_row(row, base_dir):
+ """
+ Takes *enriched* JSON objects which include a fatcat_release key/entity, and
+ populate fulltext content and metadata.
+
+ This script *only* looks for existing local files.
+
+ Keys added:
+
+ - fulltext_status: whether we could fetch or not (always added)
+ - fulltext_file: fatcat file entity, plus
+ - pdf_path
+ - pdftotext_path (if exists)
+ - thumbnail_path (if exists)
+ - grobid_xml_path (if exists)
+ - grobid_json_path (if exists)
+ - fulltext_grobid: grobid2json format, including:
+ - title
+ - authors
+ - journal
+ - abstract
+ - body
+ - acknowledgement
+ - annex
+ - language_code
+ - glutton_fatcat_release (renamed from fatcat_release)
+ - fulltext_pdftotext: only if fulltext_grobid not set
+ - body
+ """
if 'fulltext_file' in row:
return row
@@ -49,7 +45,7 @@ def do_line(row, args):
if not row['fatcat_release'].get('files'):
row['fulltext_status'] = 'no-file'
return row
- fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=args.base_dir)
+ fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir)
if not fulltext_file:
row['fulltext_status'] = 'no-local-file'
return row
@@ -61,31 +57,31 @@ def do_line(row, args):
fulltext_file['sha1'],
directory="pdf/",
file_suffix=".pdf",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
fulltext_file['pdftotext_path'] = blob_path(
fulltext_file['sha1'],
directory="pdftotext/",
file_suffix=".txt",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
fulltext_file['thumbnail_path'] = blob_path(
fulltext_file['sha1'],
directory="thumbnail/",
file_suffix=".png",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
fulltext_file['grobid_xml_path'] = blob_path(
fulltext_file['sha1'],
directory="grobid/",
file_suffix=".xml",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
fulltext_file['grobid_json_path'] = blob_path(
fulltext_file['sha1'],
directory="grobid/",
file_suffix=".json",
- base_dir=args.base_dir,
+ base_dir=base_dir,
)
# check if derivatives actually exist
@@ -122,29 +118,15 @@ def do_line(row, args):
row['fulltext_status'] = 'success-grobid'
return row
-def run(args):
- for l in args.json_file:
+def enrich_derivatives_file(json_input, json_output, base_dir):
+ """
+ Reads lines from json_input (an open, readable file or similar), looks for
+ existing derivative files in base_dir (a path str), and writes string JSON
+ lines to json_output (an open, writable file or similar).
+ """
+ for l in json_input:
l = json.loads(l)
- result = do_line(l, args)
+ result = enrich_derivatives_row(l, base_dir)
if result:
- print(json.dumps(result, sort_keys=True))
-
-def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="enriched (with fatcat_release) metadata file",
- type=argparse.FileType('r'))
- parser.add_argument('--base-dir',
- help="directory to look for files (in 'pdf' subdirectory)",
- default="fulltext_web")
- subparsers = parser.add_subparsers()
-
- args = parser.parse_args()
- args.session = requests_retry_session()
-
- run(args)
-
-if __name__ == '__main__':
- main()
+ print(json.dumps(result, sort_keys=True), file=json_output)