1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
|
#!/usr/bin/env python3
"""
Takes *enriched* JSON objects which include a fatcat_release key/entity, and
populate fulltext content and metadata.
This script *only* looks for existing local files.
Keys added:
- fulltext_status: whether we could fetch or not (always added)
- fulltext_file: fatcat file entity, plus
- pdf_path
- pdftotext_path (if exists)
- thumbnail_path (if exists)
- grobid_xml_path (if exists)
- grobid_json_path (if exists)
- fulltext_grobid: grobid2json format, including:
- title
- authors
- journal
- abstract
- body
- acknowledgement
- annex
- language_code
- glutton_fatcat_release (renamed from fatcat_release)
- fulltext_pdftotext: only if fulltext_grobid not set
- body
"""
import argparse
import datetime
import json
import os
import sys

from fatcat_covid19.common import *
def do_line(row, args):
    """Populate fulltext content and metadata on a single enriched row.

    Looks for a locally-mirrored PDF for the row's fatcat release, attaches
    derivative file paths (pdftotext, thumbnail, GROBID XML/JSON), then loads
    GROBID JSON (preferred) or pdftotext output as the fulltext body.

    Always sets row['fulltext_status']; may additionally set
    'fulltext_file', 'fulltext_grobid', or 'fulltext_pdftotext'.
    Returns the (mutated) row.
    """
    # Already processed on a previous pass; leave untouched.
    if 'fulltext_file' in row:
        return row
    if 'fatcat_release' not in row:
        row['fulltext_status'] = 'no-release'
        return row
    if not row['fatcat_release'].get('files'):
        row['fulltext_status'] = 'no-file'
        return row
    fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=args.base_dir)
    if not fulltext_file:
        row['fulltext_status'] = 'no-local-file'
        return row
    row['fulltext_status'] = 'found'
    # We have a local file; compute blob paths for the PDF and derivatives.
    # Each entry: (row key, blob sub-directory, file suffix).
    derivatives = (
        ('pdf_path', 'pdf/', '.pdf'),
        ('pdftotext_path', 'pdftotext/', '.txt'),
        ('thumbnail_path', 'thumbnail/', '.png'),
        ('grobid_xml_path', 'grobid/', '.xml'),
        ('grobid_json_path', 'grobid/', '.json'),
    )
    for key, directory, suffix in derivatives:
        fulltext_file[key] = blob_path(
            fulltext_file['sha1'],
            directory=directory,
            file_suffix=suffix,
            base_dir=args.base_dir,
        )
    # Null out derivative paths that don't actually exist on disk.
    # (pdf_path is intentionally not checked, matching prior behavior.)
    for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path',
            'grobid_json_path'):
        if not os.path.isfile(fulltext_file[key]):
            fulltext_file[key] = None
    row['fulltext_file'] = fulltext_file
    # Prefer GROBID extraction; fall back to raw pdftotext output.
    if not fulltext_file['grobid_json_path']:
        if not fulltext_file['pdftotext_path']:
            row['fulltext_status'] = 'no-extraction'
            return row
        try:
            with open(fulltext_file['pdftotext_path'], 'r') as f:
                row['fulltext_pdftotext'] = dict(body=f.read())
        except UnicodeDecodeError:
            row['fulltext_status'] = 'bad-unicode-pdftotext'
            return row
        row['fulltext_status'] = 'success-pdftotext'
        return row
    with open(fulltext_file['grobid_json_path'], 'r') as f:
        grobid = json.load(f)
    # Rename GROBID's biblio-glutton release match so it cannot shadow this
    # pipeline's own 'fatcat_release' key.
    gfr = grobid.pop('fatcat_release', None)
    if gfr:
        grobid['glutton_fatcat_release'] = gfr
    row['fulltext_grobid'] = grobid
    row['fulltext_status'] = 'success-grobid'
    return row
def run(args):
    """Stream rows from args.json_file through do_line, printing results.

    Each input line is parsed as JSON, enriched, and re-serialized with
    sorted keys, one JSON object per output line.
    """
    for raw_line in args.json_file:
        row = json.loads(raw_line)
        processed = do_line(row, args)
        if not processed:
            continue
        print(json.dumps(processed, sort_keys=True))
def main():
    """Command-line entry point: parse arguments and process the input file.

    Removed dead code from the original: an unused ``add_subparsers()`` call
    and an ``args.session`` HTTP session that nothing in this script reads
    (this tool only inspects existing local files; see module docstring).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="enriched (with fatcat_release) metadata file",
        type=argparse.FileType('r'))
    parser.add_argument('--base-dir',
        help="directory to look for files (in 'pdf' subdirectory)",
        default="fulltext_web")
    args = parser.parse_args()
    run(args)
# Standard script entry guard: run main() only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|