#!/usr/bin/env python3

"""
Takes *enriched* JSON objects which include a fatcat_release key/entity, and
populates fulltext content and metadata. This script *only* looks for
existing local files.

Keys added:

- fulltext_status: whether we could fetch or not (always added)
- fulltext_file: fatcat file entity, plus
    - pdf_path
    - pdftotext_path (if exists)
    - thumbnail_path (if exists)
    - grobid_xml_path (if exists)
    - grobid_json_path (if exists)
- fulltext_grobid: grobid2json format, including:
    - title
    - authors
    - journal
    - abstract
    - body
    - acknowledgement
    - annex
    - language_code
    - glutton_fatcat_release (renamed from fatcat_release)
- fulltext_pdftotext: only if fulltext_grobid not set
    - body
"""

import os
import json
import argparse

from fatcat_covid19.common import *


def do_line(row, args):

    # already processed; pass through unchanged
    if 'fulltext_file' in row:
        return row
    if 'fatcat_release' not in row:
        row['fulltext_status'] = 'no-release'
        return row
    if not row['fatcat_release'].get('files'):
        row['fulltext_status'] = 'no-file'
        return row
    fulltext_file = find_local_file(
        row['fatcat_release']['files'],
        base_dir=args.base_dir,
    )
    if not fulltext_file:
        row['fulltext_status'] = 'no-local-file'
        return row
    else:
        row['fulltext_status'] = 'found'

    # ok, we have a file; now populate derivative paths
    fulltext_file['pdf_path'] = blob_path(
        fulltext_file['sha1'],
        directory="pdf/",
        file_suffix=".pdf",
        base_dir=args.base_dir,
    )
    fulltext_file['pdftotext_path'] = blob_path(
        fulltext_file['sha1'],
        directory="pdftotext/",
        file_suffix=".txt",
        base_dir=args.base_dir,
    )
    fulltext_file['thumbnail_path'] = blob_path(
        fulltext_file['sha1'],
        directory="thumbnail/",
        file_suffix=".png",
        base_dir=args.base_dir,
    )
    fulltext_file['grobid_xml_path'] = blob_path(
        fulltext_file['sha1'],
        directory="grobid/",
        file_suffix=".xml",
        base_dir=args.base_dir,
    )
    fulltext_file['grobid_json_path'] = blob_path(
        fulltext_file['sha1'],
        directory="grobid/",
        file_suffix=".json",
        base_dir=args.base_dir,
    )

    # check whether derivatives actually exist on disk; clear paths that don't
    for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path',
                'grobid_json_path'):
        if not os.path.isfile(fulltext_file[key]):
            fulltext_file[key] = None

    row['fulltext_file'] = fulltext_file

    # if there is no GROBID output, fall back to pdftotext
    if not fulltext_file['grobid_json_path']:
        if fulltext_file['pdftotext_path']:
            try:
                with open(fulltext_file['pdftotext_path'], 'r') as f:
                    row['fulltext_pdftotext'] = dict(body=f.read())
            except UnicodeDecodeError:
                row['fulltext_status'] = 'bad-unicode-pdftotext'
                return row
            row['fulltext_status'] = 'success-pdftotext'
            return row
        else:
            row['fulltext_status'] = 'no-extraction'
            return row

    with open(fulltext_file['grobid_json_path'], 'r') as f:
        grobid = json.loads(f.read())

    # rename the release match embedded in the GROBID output so it doesn't
    # clobber the top-level fatcat_release key
    gfr = grobid.pop('fatcat_release', None)
    if gfr:
        grobid['glutton_fatcat_release'] = gfr
    row['fulltext_grobid'] = grobid
    row['fulltext_status'] = 'success-grobid'
    return row


def run(args):
    # stream JSON lines through do_line(), printing one result per input line
    for l in args.json_file:
        l = json.loads(l)
        result = do_line(l, args)
        if result:
            print(json.dumps(result, sort_keys=True))


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="enriched (with fatcat_release) metadata file",
        type=argparse.FileType('r'))
    parser.add_argument('--base-dir',
        help="directory to look for files (in 'pdf' subdirectory)",
        default="fulltext_web")
    args = parser.parse_args()

    run(args)

if __name__ == '__main__':
    main()
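
# Example usage, as a sketch: the input filename "enriched.json" and the
# output redirect are hypothetical, and the layout under --base-dir is
# assumed to match the pdf/, pdftotext/, thumbnail/, and grobid/ blob paths
# populated above:
#
#   ./populate_fulltext.py enriched.json --base-dir fulltext_web > fulltext.json
#
# Each output line is the input JSON object plus, at minimum, a
# fulltext_status key (e.g. "success-grobid", "no-local-file").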