#!/usr/bin/env python3

"""
Takes *enriched* JSON objects which include a fatcat_release key/entity, and
populates fulltext content and metadata.

This script *only* looks for existing local files.

Keys added:

- fulltext_status: whether fulltext could be found locally or not (always added)
- fulltext_file: fatcat file entity, plus
    - pdf_path
    - pdftotext_path (if exists)
    - thumbnail_path (if exists)
    - grobid_xml_path (if exists)
    - grobid_json_path (if exists)
- fulltext_grobid: grobid2json format, including:
    - title
    - authors
    - journal
    - abstract
    - body
    - acknowledgement
    - annex
    - language_code
    - glutton_fatcat_release (renamed from fatcat_release)
- fulltext_pdftotext: only if fulltext_grobid not set
    - body
"""

import os
import json
import argparse

from fatcat_covid19.common import blob_path, find_local_file


def do_line(row, args):
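    """
    Enrich a single row (a dict parsed from JSON) in-place: resolve local PDF
    and derivative file paths for the release's file entity, then attach
    extracted fulltext (GROBID JSON if available, pdftotext output as a
    fallback). Records the outcome in 'fulltext_status' and returns the row;
    rows which already have 'fulltext_file' are returned unmodified.
    """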

    if 'fulltext_file' in row:
        return row
    if 'fatcat_release' not in row:
        row['fulltext_status'] = 'no-release'
        return row
    if not row['fatcat_release'].get('files'):
        row['fulltext_status'] = 'no-file'
        return row
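    # find_local_file() (from fatcat_covid19.common) presumably returns the
    # first of the release's file entities with a PDF on disk under base_dir,
    # or a falsy value if no file is available locally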
    fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=args.base_dir)
    if not fulltext_file:
        row['fulltext_status'] = 'no-local-file'
        return row
    row['fulltext_status'] = 'found'

    # ok, we have a file entity; now populate derivative file paths
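    # blob_path() (from fatcat_covid19.common) is assumed to map a SHA-1 hex
    # digest to a sharded path under base_dir, along the lines of:
    #   <base_dir>/pdf/<sha1[0:2]>/<sha1[2:4]>/<sha1>.pdf
    # (the exact layout is defined in fatcat_covid19.common, not assumed here)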
    fulltext_file['pdf_path'] = blob_path(
        fulltext_file['sha1'],
        directory="pdf/",
        file_suffix=".pdf",
        base_dir=args.base_dir,
    )
    fulltext_file['pdftotext_path'] = blob_path(
        fulltext_file['sha1'],
        directory="pdftotext/",
        file_suffix=".txt",
        base_dir=args.base_dir,
    )
    fulltext_file['thumbnail_path'] = blob_path(
        fulltext_file['sha1'],
        directory="thumbnail/",
        file_suffix=".png",
        base_dir=args.base_dir,
    )
    fulltext_file['grobid_xml_path'] = blob_path(
        fulltext_file['sha1'],
        directory="grobid/",
        file_suffix=".xml",
        base_dir=args.base_dir,
    )
    fulltext_file['grobid_json_path'] = blob_path(
        fulltext_file['sha1'],
        directory="grobid/",
        file_suffix=".json",
        base_dir=args.base_dir,
    )

    # check if derivatives actually exist
    for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path',
                'grobid_json_path'):
        if not os.path.isfile(fulltext_file[key]):
            fulltext_file[key] = None

    row['fulltext_file'] = fulltext_file

    # if there is no GROBID output, fall back to pdftotext
    if not fulltext_file['grobid_json_path']:
        if fulltext_file['pdftotext_path']:
            try:
                with open(fulltext_file['pdftotext_path'], 'r') as f:
                    row['fulltext_pdftotext'] = dict(body=f.read())
            except UnicodeDecodeError:
                row['fulltext_status'] = 'bad-unicode-pdftotext'
                return row
            row['fulltext_status'] = 'success-pdftotext'
            return row
        else:
            row['fulltext_status'] = 'no-extraction'
            return row

    with open(fulltext_file['grobid_json_path'], 'r') as f:
        grobid = json.load(f)

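    # grobid2json output may carry its own 'fatcat_release' key (presumably
    # from GROBID's biblio-glutton consolidation); rename it so it is not
    # confused with the row's own fatcat_release entity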
    glutton_release = grobid.pop('fatcat_release', None)
    if glutton_release:
        grobid['glutton_fatcat_release'] = glutton_release
    row['fulltext_grobid'] = grobid
    row['fulltext_status'] = 'success-grobid'
    return row

def run(args):
    for line in args.json_file:
        line = line.strip()
        if not line:
            continue
        row = json.loads(line)
        result = do_line(row, args)
        if result:
            print(json.dumps(result, sort_keys=True))

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="enriched (with fatcat_release) metadata file",
        type=argparse.FileType('r'))
    parser.add_argument('--base-dir',
        help="directory to look for files (in 'pdf' subdirectory)",
        default="fulltext_web")
    args = parser.parse_args()

    run(args)

if __name__ == '__main__':
    main()