1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
#!/usr/bin/env python3
"""
Transform a DOAJ article dump (JSON) into ingest requests.
TODO: should we also attempt PDF ingest for HTML links? They seem to often be
landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url`
in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""
import argparse
import json
import sys
from typing import List, Optional
import urlcanon
# URL fragments that disqualify a link from ingest. transform() lower-cases the
# link URL and skips it when any of these substrings occurs in it.
DOMAIN_BLOCKLIST = [
    # --- large OA publishers (we get via DOI) ---
    # (no entries at present)

    # --- large repos and aggregators (we crawl directly) ---
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    # "semanticscholar.org/",
    "://doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",

    # --- large publishers/platforms; may remove in the future ---
    # "://link.springer.com/",
    # "://dergipark.gov.tr/",
    # "frontiersin.org/",
    # "scielo",
]
# Maps a (lower-cased) DOAJ link content type to the ingest types to request.
# Most values default to PDF; note that we also do pdf ingests for HTML pages.
# An empty list, or a content type that is not listed here at all, means no
# ingest request is generated for the link.
CONTENT_TYPE_MAP = {
    # no ingest
    "abstract": [],
    "doc": [],
    # PDF only
    "": ["pdf"],
    "doi": ["pdf"],
    "url": ["pdf"],
    "fulltext": ["pdf"],
    "anySimpleType": ["pdf"],
    "application/pdf": ["pdf"],
    # HTML, plus a PDF attempt
    "html": ["html", "pdf"],
    "text/html": ["html", "pdf"],
    # XML only
    "xml": ["xml"],
}
def canon(s: str) -> str:
    """
    Canonicalizes a URL: parses `s` with urlcanon, applies urlcanon's `whatwg`
    canonicalizer to the parsed URL, and returns the result as a string.
    """
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))
def transform(obj: dict) -> List[dict]:
    """
    Transforms from a single DOAJ object to zero or more ingest requests.

    A request is generated for every link that is marked "fulltext", has a
    URL, has a content type that CONTENT_TYPE_MAP maps to at least one ingest
    type, and whose URL contains none of the DOMAIN_BLOCKLIST substrings. One
    request is emitted per (link, ingest type) pair.

    Missing or null "link" / "identifier" fields in the bibjson, and
    identifiers without a "type", are tolerated and treated as empty.

    Raises KeyError if `obj` has no "id" or "bibjson" key, and AssertionError
    if the id is empty.

    Returns a list of dicts.
    """
    doaj_id = obj["id"].lower()
    assert doaj_id

    bibjson = obj["bibjson"]
    # An absent key is handled the same way as an explicit null or empty list.
    links = bibjson.get("link") or []
    if not links:
        return []

    doi: Optional[str] = None
    for ident in bibjson.get("identifier") or []:
        ident_type = (ident.get("type") or "").lower()
        ident_value = ident.get("id")
        if ident_type == "doi" and ident_value and ident_value.startswith("10."):
            # if several DOIs are listed, the last one wins
            doi = ident_value.lower()

    requests = []
    for link in links:
        url = link.get("url")
        if link.get("type") != "fulltext" or not url:
            continue

        # unknown content types and types mapped to [] produce no requests
        ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
        if not ingest_types:
            continue

        # lower-case once; the blocklist check is on the raw (un-canonicalized) URL
        url_lower = url.lower()
        if any(domain in url_lower for domain in DOMAIN_BLOCKLIST):
            continue

        try:
            base_url = canon(url.strip())
        except UnicodeEncodeError:
            # URL could not be canonicalized; skip this link
            continue

        if not base_url or len(base_url) > 1000:
            continue

        for ingest_type in ingest_types:
            requests.append(
                {
                    "base_url": base_url,
                    "ingest_type": ingest_type,
                    "link_source": "doaj",
                    "link_source_id": doaj_id,
                    "ingest_request_source": "doaj",
                    "release_stage": "published",
                    "rel": "publisher",
                    "ext_ids": {
                        "doi": doi,
                        "doaj": doaj_id,
                    },
                    "edit_extra": {},
                }
            )

    return requests
def run(args) -> None:
    """
    Reads DOAJ records from `args.json_file`, one JSON document per line, and
    prints every resulting ingest request to stdout as a single line of JSON
    with sorted keys. Whitespace-only lines are skipped.
    """
    for line in args.json_file:
        if line.strip():
            record = json.loads(line)
            for request in transform(record) or []:
                print(json.dumps(request, sort_keys=True))
def main() -> None:
    """
    Command-line entry point: parses the arguments, then runs the transform
    over the given DOAJ dump file, writing ingest requests to stdout.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
    )
    # NOTE: no sub-commands are defined, so no subparsers are registered.

    args = parser.parse_args()
    try:
        run(args)
    finally:
        # argparse.FileType opens the file but never closes it. A "-" argument
        # yields sys.stdin, which is not ours to close.
        if args.json_file is not sys.stdin:
            args.json_file.close()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|