1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
#!/usr/bin/env python3
"""
Input is IA item metadata JSON.
Output is insertable fatcat "match" JSON
- md5
- sha1
- sha256 (not set here)
- size
- urls
- cdx (list; empty here)
- dois (list)
- pmcid
- jstor_id
- arxiv_id
When invoking import matched, be sure to:
--default-link-rel repository (?)
--default-mimetype application/pdf
"""
import sys
import json
def parse(obj):
    """Convert one IA item metadata object into a fatcat "match" dict.

    The prefix of ``obj['metadata']['identifier']`` selects which external
    identifier is extracted:

    - ``arxiv-``: versioned arXiv id, taken from ``metadata['source']``
    - ``paper-doi-10_``: DOI, taken from ``metadata['identifier-doi']``
    - ``pubmed-PMC``: PMCID, taken from the identifier itself
    - ``jstor-``: numeric JSTOR id, taken from the identifier itself

    The first entry of ``obj['files']`` whose ``source`` is "original" and
    whose ``format`` mentions "PDF" supplies the hashes, size and URL.

    Args:
        obj: decoded item metadata with ``metadata`` and ``files`` keys.

    Returns:
        A dict with ``md5``, ``sha1``, ``size``, ``mimetype``, ``urls``,
        ``cdx`` and ``dois`` keys, plus one of ``arxiv_id`` / ``pmcid`` /
        ``jstor_id`` for non-DOI items. ``None`` when the item is skipped
        (test item, arXiv item without a source or without a version
        suffix, or no original PDF); the reason is written to stderr.

    Raises:
        AssertionError: a sanity check on an identifier or on the PDF file
            name failed.
        ValueError: the PMCID or JSTOR id is not numeric.
        NotImplementedError: the identifier prefix is not recognized.
        KeyError: a required field is missing from ``obj``.
    """
    metadata = obj['metadata']
    identifier = metadata['identifier']

    if identifier.endswith('-test') or metadata.get('test'):
        sys.stderr.write('skip: test item\n')
        return None

    extid_type = None
    extid = None
    if identifier.startswith('arxiv-'):
        extid_type = 'arxiv_id'
        extid = metadata.get('source')
        if not extid:
            sys.stderr.write('skip: no source\n')
            return None
        arxiv_prefix = 'http://arxiv.org/abs/'
        assert extid.startswith(arxiv_prefix)
        # Slice instead of str.replace() so only the leading prefix goes.
        extid = extid[len(arxiv_prefix):]
        assert '/' in extid or '.' in extid
        # Require a trailing "v<digits>" version suffix. Merely checking
        # for a 'v' anywhere would also accept unversioned old-style ids
        # whose archive name contains one (e.g. "solv-int/9901001").
        _, sep, version = extid.rpartition('v')
        if not sep or not version.isdigit():
            sys.stderr.write('skip: non-versioned arxiv_id\n')
            return None
    elif identifier.startswith('paper-doi-10_'):
        extid_type = 'doi'
        extid = metadata['identifier-doi']
        assert extid.startswith("10.")
    elif identifier.startswith('pubmed-PMC'):
        extid_type = 'pmcid'
        extid = identifier[len('pubmed-'):]
        assert extid.startswith("PMC")
        # Validation only: raises ValueError unless the suffix is numeric.
        int(extid[3:])
    elif identifier.startswith('jstor-'):
        extid_type = 'jstor_id'
        extid = identifier[len('jstor-'):]
        # Validation only: raises ValueError unless the id is numeric.
        int(extid)
    else:
        raise NotImplementedError()

    # Pick the first originally-uploaded PDF; derived files are ignored.
    pdf_file = next(
        (f for f in obj['files']
         if f['source'] == "original" and "PDF" in f['format']),
        None,
    )
    if pdf_file is None:
        sys.stderr.write('skip: no PDF found: {}\n'.format(identifier))
        return None
    assert pdf_file['name'].endswith('.pdf')

    match = {
        'md5': pdf_file['md5'],
        'sha1': pdf_file['sha1'],
        'size': int(pdf_file['size']),
        'mimetype': 'application/pdf',
        'urls': [
            "https://archive.org/download/{}/{}".format(
                identifier,
                pdf_file['name']),
        ],
        'cdx': [],
        'dois': [],
    }
    # DOIs go in the list field; every other id type gets its own key.
    if extid_type == 'doi':
        match['dois'] = [extid]
    else:
        match[extid_type] = extid
    return match
def run():
    """Read IA item metadata JSON lines from stdin and print match JSON.

    Each non-blank line of stdin is decoded as JSON and passed to
    ``parse()``; when that returns a match it is printed to stdout as a
    single JSON line. Blank lines are ignored.
    """
    for line in sys.stdin:
        # Lines iterated from a file keep their trailing newline, so a
        # blank line is "\n" (truthy). Strip first so the blank-line guard
        # actually fires instead of handing whitespace to json.loads().
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        match = parse(obj)
        if match:
            print(json.dumps(match))
# Script entry point: process stdin only when executed directly, so that
# importing this module has no side effects.
if __name__ == '__main__':
    run()
|