1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
import sys
import json
import argparse
import datetime
from fatcat_covid19.common import *
def enrich_derivatives_row(row, base_dir):
"""
Takes *enriched* JSON objects which include a fatcat_release key/entity, and
populate fulltext content and metadata.
This script *only* looks for existing local files.
Keys added:
- fulltext_status: whether we could fetch or not (always added)
- fulltext_file: fatcat file entity, plus
- pdf_path
- pdftotext_path (if exists)
- thumbnail_path (if exists)
- grobid_xml_path (if exists)
- grobid_json_path (if exists)
- fulltext_grobid: grobid2json format, including:
- title
- authors
- journal
- abstract
- body
- acknowledgement
- annex
- language_code
- glutton_fatcat_release (renamed from fatcat_release)
- fulltext_pdftotext: only if fulltext_grobid not set
- body
"""
if 'fulltext_file' in row:
return row
if not 'fatcat_release' in row:
row['fulltext_status'] = 'no-release'
return row
if not row['fatcat_release'].get('files'):
row['fulltext_status'] = 'no-file'
return row
fulltext_file = find_local_file(row['fatcat_release']['files'], base_dir=base_dir)
if not fulltext_file:
row['fulltext_status'] = 'no-local-file'
return row
else:
row['fulltext_status'] = 'found'
# ok, we have file, now populate derivatives etc
fulltext_file['pdf_path'] = blob_path(
fulltext_file['sha1'],
directory="pdf/",
file_suffix=".pdf",
base_dir=base_dir,
)
fulltext_file['pdftotext_path'] = blob_path(
fulltext_file['sha1'],
directory="pdftotext/",
file_suffix=".txt",
base_dir=base_dir,
)
fulltext_file['thumbnail_path'] = blob_path(
fulltext_file['sha1'],
directory="thumbnail/",
file_suffix=".png",
base_dir=base_dir,
)
fulltext_file['grobid_xml_path'] = blob_path(
fulltext_file['sha1'],
directory="grobid/",
file_suffix=".xml",
base_dir=base_dir,
)
fulltext_file['grobid_json_path'] = blob_path(
fulltext_file['sha1'],
directory="grobid/",
file_suffix=".json",
base_dir=base_dir,
)
# check if derivatives actually exist
for key in ('pdftotext_path', 'thumbnail_path', 'grobid_xml_path',
'grobid_json_path'):
if not os.path.isfile(fulltext_file[key]):
fulltext_file[key] = None
row['fulltext_file'] = fulltext_file
# if there is no GROBID, try pdftotext
if not fulltext_file['grobid_json_path']:
if fulltext_file['pdftotext_path']:
try:
with open(fulltext_file['pdftotext_path'], 'r') as f:
row['fulltext_pdftotext'] = dict(body=f.read())
except UnicodeDecodeError:
row['fulltext_status'] = 'bad-unicode-pdftotext'
return row
row['fulltext_status'] = 'success-pdftotext'
return row
else:
row['fulltext_status'] = 'no-extraction'
return row
with open(fulltext_file['grobid_json_path'], 'r') as f:
grobid = json.loads(f.read())
gfr = grobid.pop('fatcat_release', None)
if gfr:
grobid['glutton_fatcat_release'] = gfr
row['fulltext_grobid'] = grobid
row['fulltext_status'] = 'success-grobid'
return row
def enrich_derivatives_file(json_input, json_output, base_dir):
"""
Reads lines from json_input (an open, readable file or similar), looks for
existing derivative files in base_dir (a path str), and writes string JSON
lines to json_output (an open, writable file or similar).
"""
for l in json_input:
l = json.loads(l)
result = do_line(l, base_dir)
if result:
print(json.dumps(result, sort_keys=True), file=json_output)
|