From 913a3d7b2fe585602124e2092dd44afa760086e8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Apr 2020 15:01:19 -0700 Subject: refactor derivatives into CLI tool --- cord19_fatcat_derivatives.py | 150 ------------------------------------------ covid19_tool.py | 21 ++++-- fatcat_covid19/derivatives.py | 132 +++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 154 deletions(-) delete mode 100755 cord19_fatcat_derivatives.py create mode 100644 fatcat_covid19/derivatives.py diff --git a/cord19_fatcat_derivatives.py b/cord19_fatcat_derivatives.py deleted file mode 100755 index 8b5b679..0000000 --- a/cord19_fatcat_derivatives.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 - -""" -Takes *enriched* JSON objects which include a fatcat_release key/entity, and -populate fulltext content and metadata. - -This script *only* looks for existing local files. - -Keys added: - -- fulltext_status: whether we could fetch or not (always added) -- fulltext_file: fatcat file entity, plus - - pdf_path - - pdftotext_path (if exists) - - thumbnail_path (if exists) - - grobid_xml_path (if exists) - - grobid_json_path (if exists) -- fulltext_grobid: grobid2json format, including: - - title - - authors - - journal - - abstract - - body - - acknowledgement - - annex - - language_code - - glutton_fatcat_release (renamed from fatcat_release) -- fulltext_pdftotext: only if fulltext_grobid not set - - body - -TODO: refactor into fatcat_covid19 module and CLI wrapper -""" - -import sys -import json -import argparse -import datetime - -from fatcat_covid19.common import * - - -def do_line(row, args): - - if 'fulltext_file' in row: - return row - if not 'fatcat_release' in row: - row['fulltext_status'] = 'no-release' - return row - if not row['fatcat_release'].get('files'): - row['fulltext_status'] = 'no-file' - return row - fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=args.base_dir) - if not fulltext_file: - row['fulltext_status'] 
= 'no-local-file' - return row - else: - row['fulltext_status'] = 'found' - - # ok, we have file, now populate derivatives etc - fulltext_file['pdf_path'] = blob_path( - fulltext_file['sha1'], - directory="pdf/", - file_suffix=".pdf", - base_dir=args.base_dir, - ) - fulltext_file['pdftotext_path'] = blob_path( - fulltext_file['sha1'], - directory="pdftotext/", - file_suffix=".txt", - base_dir=args.base_dir, - ) - fulltext_file['thumbnail_path'] = blob_path( - fulltext_file['sha1'], - directory="thumbnail/", - file_suffix=".png", - base_dir=args.base_dir, - ) - fulltext_file['grobid_xml_path'] = blob_path( - fulltext_file['sha1'], - directory="grobid/", - file_suffix=".xml", - base_dir=args.base_dir, - ) - fulltext_file['grobid_json_path'] = blob_path( - fulltext_file['sha1'], - directory="grobid/", - file_suffix=".json", - base_dir=args.base_dir, - ) - - # check if derivatives actually exist - for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path', - 'grobid_json_path'): - if not os.path.isfile(fulltext_file[key]): - fulltext_file[key] = None - - row['fulltext_file'] = fulltext_file - - # if there is no GROBID, try pdftotext - if not fulltext_file['grobid_json_path']: - - if fulltext_file['pdftotext_path']: - try: - with open(fulltext_file['pdftotext_path'], 'r') as f: - row['fulltext_pdftotext'] = dict(body=f.read()) - except UnicodeDecodeError: - row['fulltext_status'] = 'bad-unicode-pdftotext' - return row - row['fulltext_status'] = 'success-pdftotext' - return row - else: - row['fulltext_status'] = 'no-extraction' - return row - - with open(fulltext_file['grobid_json_path'], 'r') as f: - grobid = json.loads(f.read()) - - gfr = grobid.pop('fatcat_release', None) - if gfr: - grobid['glutton_fatcat_release'] = gfr - row['fulltext_grobid'] = grobid - row['fulltext_status'] = 'success-grobid' - return row - -def run(args): - for l in args.json_file: - l = json.loads(l) - result = do_line(l, args) - if result: - print(json.dumps(result, sort_keys=True)) - 
-def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="enriched (with fatcat_release) metadata file", - type=argparse.FileType('r')) - parser.add_argument('--base-dir', - help="directory to look for files (in 'pdf' subdirectory)", - default="fulltext_web") - subparsers = parser.add_subparsers() - - args = parser.parse_args() - args.session = requests_retry_session() - - run(args) - -if __name__ == '__main__': - main() - diff --git a/covid19_tool.py b/covid19_tool.py index 7a565b8..23a2c6c 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -37,7 +37,7 @@ def main(): help="listen on this port") sub_enrich = subparsers.add_parser('enrich', - help="enrich CORD-19 dataset (JSON) with fatcat metadata (prints to stdout)") + help="enrich CORD-19 dataset (JSON) with fatcat metadata") sub_enrich.set_defaults( action='enrich', ) @@ -45,13 +45,26 @@ def main(): help="CORD-19 parsed JSON file", type=argparse.FileType('r')) + sub_derivatives = subparsers.add_parser('derivatives', + help="enrich JSON rows with existing derivative files") + sub_derivatives.add_argument('json_file', + help="enriched (with fatcat_release) metadata file", + type=argparse.FileType('r')) + sub_derivatives.add_argument('--json-output', + help="file to write to", + type=argparse.FileType('w'), + default=sys.stdout) + sub_derivatives.add_argument('--base-dir', + help="directory to look for files (in 'pdf' subdirectory)", + default="fulltext_web") + args = parser.parse_args() if args.action == 'webface': app.run(debug=args.debug, host=args.host, port=args.port) - if args.action == 'enrich': - # TODO - pass + if args.action == 'derivatives': + enrich_derivatives_file(args.json_file, args.json_output, + args.base_dir) else: print("tell me what to do!") sys.exit(-1) diff --git a/fatcat_covid19/derivatives.py b/fatcat_covid19/derivatives.py new file mode 100644 index 0000000..5ade0ef --- /dev/null +++ 
b/fatcat_covid19/derivatives.py @@ -0,0 +1,132 @@ + +import sys +import json +import argparse +import datetime + +from fatcat_covid19.common import * + + +def enrich_derivatives_row(row, base_dir): + """ + Takes *enriched* JSON objects which include a fatcat_release key/entity, and + populate fulltext content and metadata. + + This script *only* looks for existing local files. + + Keys added: + + - fulltext_status: whether we could fetch or not (always added) + - fulltext_file: fatcat file entity, plus + - pdf_path + - pdftotext_path (if exists) + - thumbnail_path (if exists) + - grobid_xml_path (if exists) + - grobid_json_path (if exists) + - fulltext_grobid: grobid2json format, including: + - title + - authors + - journal + - abstract + - body + - acknowledgement + - annex + - language_code + - glutton_fatcat_release (renamed from fatcat_release) + - fulltext_pdftotext: only if fulltext_grobid not set + - body + """ + + if 'fulltext_file' in row: + return row + if not 'fatcat_release' in row: + row['fulltext_status'] = 'no-release' + return row + if not row['fatcat_release'].get('files'): + row['fulltext_status'] = 'no-file' + return row + fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir) + if not fulltext_file: + row['fulltext_status'] = 'no-local-file' + return row + else: + row['fulltext_status'] = 'found' + + # ok, we have file, now populate derivatives etc + fulltext_file['pdf_path'] = blob_path( + fulltext_file['sha1'], + directory="pdf/", + file_suffix=".pdf", + base_dir=base_dir, + ) + fulltext_file['pdftotext_path'] = blob_path( + fulltext_file['sha1'], + directory="pdftotext/", + file_suffix=".txt", + base_dir=base_dir, + ) + fulltext_file['thumbnail_path'] = blob_path( + fulltext_file['sha1'], + directory="thumbnail/", + file_suffix=".png", + base_dir=base_dir, + ) + fulltext_file['grobid_xml_path'] = blob_path( + fulltext_file['sha1'], + directory="grobid/", + file_suffix=".xml", + base_dir=base_dir, + ) + 
fulltext_file['grobid_json_path'] = blob_path( + fulltext_file['sha1'], + directory="grobid/", + file_suffix=".json", + base_dir=base_dir, + ) + + # check if derivatives actually exist + for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path', + 'grobid_json_path'): + if not os.path.isfile(fulltext_file[key]): + fulltext_file[key] = None + + row['fulltext_file'] = fulltext_file + + # if there is no GROBID, try pdftotext + if not fulltext_file['grobid_json_path']: + + if fulltext_file['pdftotext_path']: + try: + with open(fulltext_file['pdftotext_path'], 'r') as f: + row['fulltext_pdftotext'] = dict(body=f.read()) + except UnicodeDecodeError: + row['fulltext_status'] = 'bad-unicode-pdftotext' + return row + row['fulltext_status'] = 'success-pdftotext' + return row + else: + row['fulltext_status'] = 'no-extraction' + return row + + with open(fulltext_file['grobid_json_path'], 'r') as f: + grobid = json.loads(f.read()) + + gfr = grobid.pop('fatcat_release', None) + if gfr: + grobid['glutton_fatcat_release'] = gfr + row['fulltext_grobid'] = grobid + row['fulltext_status'] = 'success-grobid' + return row + +def enrich_derivatives_file(json_input, json_output, base_dir): + """ + Reads lines from json_input (an open, readable file or similar), looks for + existing derivative files in base_dir (a path str), and writes string JSON + lines to json_output (an open, writable file or similar). + """ + for l in json_input: + l = json.loads(l) + result = enrich_derivatives_row(l, base_dir) + if result: + print(json.dumps(result, sort_keys=True), file=json_output) + -- cgit v1.2.3