1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
#!/usr/bin/env python3
"""
Input is IA item metadata JSON.
Output is insertable fatcat "match" JSON
- md5
- sha1
- sha256
- size
- urls
- cdx (list; empty here)
- dois (list)
- pmcid
- jstor
- arxiv
When invoking import matched, be sure to:
--default-link-rel repository (?)
--default-mimetype application/pdf
"""
import json
import sys
from typing import Any, Dict, Optional
def parse(obj: dict) -> Optional[Dict[str, Any]]:
    """Convert one IA item metadata object into a fatcat "match" dict.

    The external identifier type is selected from the prefix of
    ``obj["metadata"]["identifier"]``:

    - ``arxiv-``: versioned arxiv id, taken from ``metadata["source"]``
    - ``paper-doi-10_``: DOI, taken from ``metadata["identifier-doi"]``
    - ``pubmed-PMC``: PMCID, taken from the identifier itself
    - ``jstor-``: JSTOR id, taken from the identifier itself

    Args:
        obj: Parsed item metadata. This function reads ``obj["metadata"]``
            and ``obj["files"]``.

    Returns:
        A dict with ``md5``, ``sha1``, ``size``, ``mimetype``, ``urls``,
        ``cdx`` and ``dois`` keys, plus one key named after the identifier
        type (``arxiv``, ``pmcid`` or ``jstor``) for non-DOI items. Returns
        ``None`` when the item is skipped; the reason is printed to stderr.

    Raises:
        NotImplementedError: The identifier has none of the known prefixes.
        AssertionError: A sanity check on the identifier or the PDF file
            name failed.
        ValueError: The numeric part of a PMCID or JSTOR id is not an integer.
        KeyError: A required metadata or file field is missing.
    """
    metadata = obj["metadata"]
    identifier = metadata["identifier"]

    if identifier.endswith("-test") or metadata.get("test"):
        print("skip: test item", file=sys.stderr)
        return None

    extid_type = None
    extid = None
    if identifier.startswith("arxiv-"):
        extid_type = "arxiv"
        extid = metadata.get("source")
        if not extid:
            print("skip: no source", file=sys.stderr)
            return None
        assert extid.startswith("http://arxiv.org/abs/")
        extid = extid.replace("http://arxiv.org/abs/", "")
        assert "/" in extid or "." in extid
        # A versioned id ends in "v<digits>". Checking only for a "v"
        # anywhere plus a trailing digit would wrongly accept unversioned
        # old-style ids whose archive name contains a "v"
        # (e.g. "solv-int/9901001").
        _, version_sep, version = extid.rpartition("v")
        if not version_sep or not version.isdigit():
            print("skip: non-versioned arxiv_id", file=sys.stderr)
            return None
    elif identifier.startswith("paper-doi-10_"):
        extid_type = "doi"
        extid = metadata["identifier-doi"]
        assert extid.startswith("10.")
    elif identifier.startswith("pubmed-PMC"):
        extid_type = "pmcid"
        extid = identifier.replace("pubmed-", "")
        assert extid.startswith("PMC")
        # Validation only: raises ValueError if the part after "PMC" is not numeric.
        int(extid[3:])
    elif identifier.startswith("jstor-"):
        extid_type = "jstor"
        extid = identifier.replace("jstor-", "")
        # Validation only: raises ValueError if the JSTOR id is not numeric.
        int(extid)
    else:
        raise NotImplementedError()

    # Use the first originally-uploaded file whose format mentions PDF.
    pdf_file = None
    for f in obj["files"]:
        if f["source"] == "original" and "PDF" in f["format"]:
            pdf_file = f
            break
    if not pdf_file:
        print("skip: no PDF found: {}".format(identifier), file=sys.stderr)
        return None
    assert pdf_file["name"].endswith(".pdf")

    match: Dict[str, Any] = {
        "md5": pdf_file["md5"],
        "sha1": pdf_file["sha1"],
        "size": int(pdf_file["size"]),
        "mimetype": "application/pdf",
        "urls": [
            "https://archive.org/download/{}/{}".format(identifier, pdf_file["name"]),
        ],
        "cdx": [],
        "dois": [],
    }
    # DOIs go into the "dois" list; every other identifier type gets its own key.
    if extid_type == "doi":
        match["dois"] = [extid]
    else:
        match[extid_type] = extid
    return match
def run() -> None:
    """Read item metadata JSON from stdin and print match JSON to stdout.

    Each non-blank input line is decoded as one JSON object and passed to
    ``parse``. When ``parse`` returns a match, it is printed as a single
    JSON line with sorted keys; skipped items produce no output line.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON.
    """
    for line in sys.stdin:
        # Lines iterated from a file object keep their trailing newline, so
        # a plain truthiness test never catches blank lines; strip first.
        if not line.strip():
            continue
        obj = json.loads(line)
        match = parse(obj)
        if match is not None:
            print(json.dumps(match, sort_keys=True))
# Script entry point: process stdin only when executed directly, not on import.
if __name__ == "__main__":
    run()
|