1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
#!/usr/bin/env python3
"""
Check journal homepage status (live web and wayback)
Takes a TSV file path as its only argument, or reads lines from stdin, and writes one JSON object per input line to stdout. Reading from stdin means you can:
# be sure to randomize order if you are going to use high parallelism so no
# individual domain gets swamped. also remember this hits CDX API multiple
# times.
parallel -j10 --bar --pipepart -a urls_to_crawl.shuf.tsv ./check_issn_urls.py > url_status.json
Input columns (no header):
ISSN-L, URL
For each URL, do a request and record, as JSON:
issnl: passthrough
url: passthrough
status_code: initial HTTP crawl status
terminal_url: final URL (or original if no redirects)
terminal_status_code: HTTP status of the final URL (or of the original if no redirects)
terminal_content_type: content type (mimetype)
software_platform: slug of hosting platform, if detected
issnl_in_body: whether raw issnl appears in body text
blocked: whether we think crawler was "blocked"
gwb_url_success_dt: latest wayback datetime at which an HTTP 200 capture of the original URL exists
gwb_terminal_url_success_dt: latest wayback datetime at which an HTTP 200 capture of the terminal URL exists
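HTTP status will be -1 if the request fails outright (eg, the domain does not
even resolve, SSL error, timeout, redirect loop); in that case an "error"
field names the failure.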
"""
import os
import sys
import json
import time
import requests
def sniff_platform(resp):
    """
    Guess which publishing platform served a response.

    `resp` must expose `.url` and `.text`. The URL is checked first against a
    list of substrings for large publishers; the first match wins. If none
    match, the body is searched for the Open Journal Systems generator tag.

    Returns the platform slug as a string, or None when nothing is recognized.
    """
    # Checked in order. These are mostly here to filter out huge platforms
    # and stop sniffing before the body is inspected.
    url_markers = (
        ('jstor.org/', 'jstor'),
        ('springer.com/', 'springer'),
        ('springerlink.com/', 'springer'),
        ('tandfonline.com/', 't_and_f'),
        ('elsevier.com/', 'elsevier'),
        ('wiley.com/', 'wiley'),
        ('sciencedirect.com/', 'elsevier'),
        ('sagepub.com/', 'sage'),
        ('hypotheses.org/', 'hypothesis'),
        ('tandf.co.uk/', 't_and_f'),
        ('scielo', 'scielo'),
    )
    slug = next(
        (name for marker, name in url_markers if marker in resp.url),
        None,
    )
    if slug is not None:
        return slug

    ojs_tag = '<meta name="generator" content="Open Journal Systems'
    return "ojs" if ojs_tag in resp.text else None
def sniff_blocked(resp):
    """
    Heuristically decide whether the crawler was blocked (hard block via
    status code, or soft block via a challenge page).

    `resp` must expose `.status_code` and `.text`.

    Returns True when a block is recognized and None otherwise (never False).
    """
    status = resp.status_code
    if status == 403 or status == 420:
        return True

    body = resp.text
    # JSTOR does this
    captcha_wall = 'Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA' in body
    # only counts as a block when served together with HTTP 416
    bot_notice = status == 416 and 'something about your browser made us think you were a bot' in body
    if captcha_wall or bot_notice:
        return True
    return None
def check_gwb(url, match_type='exact'):
    """
    Ask the Wayback Machine CDX API for an HTTP 200 capture of `url`.

    The query uses `limit=-1` and `filter=statuscode:200`, and only the first
    line of the response is read.

    Arguments:
      url: URL to look up
      match_type: passed through as the CDX `matchType` parameter

    Returns:
      None     if `url` already points at web.archive.org, or if the CDX
               response is empty (no matching capture)
      'error'  if no attempt produced an HTTP 200 from the CDX API
      str      the second whitespace-separated field of the returned line
               (presumably the capture timestamp; see CDX API docs)

    Raises ValueError if that field is not an integer string.
    """
    if '//web.archive.org/' in url:
        return None

    attempts = 5
    resp = None
    # crude/bad retry loop to work around CDX API throttling
    for attempt in range(attempts):
        try:
            resp = requests.get(
                'https://web.archive.org/cdx/search/cdx',
                params={
                    'url': url,
                    'matchType': match_type,
                    'limit': -1,
                    'filter': 'statuscode:200'
                },
                # without a timeout a stalled connection would hang forever
                timeout=30.,
            )
        except requests.exceptions.RequestException:
            # network-level failure: treat like a non-200 and retry
            resp = None
        if resp is not None and resp.status_code == 200:
            break
        # no point in sleeping after the final attempt
        if attempt < attempts - 1:
            time.sleep(5)

    if resp is None or resp.status_code != 200:
        status = 'no response' if resp is None else resp.status_code
        sys.stderr.write("CDX ERR {}: {}\n".format(status, url))
        return 'error'

    line = resp.text.strip().split('\n')[0]
    if not line:
        return None
    dt = line.split()[1]
    # sanity check only: raises ValueError if the field is not an integer
    int(dt)
    return dt
def check_url(issnl, url):
    """
    Fetch `url` once and summarize the outcome as a JSON-serializable dict.

    Arguments:
      issnl: ISSN-L string; passed through and searched for in the body
      url: URL to fetch

    Returns a dict that always contains `issnl`, `url`, `status_code` and
    `terminal_status_code`.

    If the request raises a `requests` exception, both status codes are -1
    and `error` holds a label for the failure; no other keys are set.

    Otherwise the dict also carries `terminal_url`, `terminal_content_type`
    (only when a Content-Type header is present), `issnl_in_body`,
    `gwb_url_success_dt`, `gwb_terminal_url_success_dt`, `blocked` and
    `software_platform`.
    """
    info = dict(issnl=issnl, url=url)

    # Label table for failures that get a fixed name. Order matters: the
    # first class the exception is an instance of supplies the label, so
    # subclasses (eg, SSLError) are listed before their parents
    # (ConnectionError).
    known_failures = (
        ('TooManyRedirects', requests.exceptions.TooManyRedirects),
        ('SSLError', requests.exceptions.SSLError),
        ('ReadTimeout', requests.exceptions.ReadTimeout),
        ('ConnectionError', requests.exceptions.ConnectionError),
        ('ChunkedEncodingError', requests.exceptions.ChunkedEncodingError),
        ('ContentDecodingError', requests.exceptions.ContentDecodingError),
    )
    try:
        resp = requests.get(url, timeout=30., headers={'User-Agent': 'ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org'})
    except requests.exceptions.RequestException as exc:
        # Any other requests failure (eg, InvalidURL, MissingSchema) is
        # recorded under its own class name instead of aborting the run.
        label = type(exc).__name__
        for name, exc_class in known_failures:
            if isinstance(exc, exc_class):
                label = name
                break
        info['error'] = label
        info['terminal_status_code'] = info['status_code'] = -1
        return info

    # status_code is the status of the first hop; terminal_* describe the
    # response after following redirects
    if resp.history:
        info['status_code'] = resp.history[0].status_code
    else:
        info['status_code'] = resp.status_code

    info['terminal_status_code'] = resp.status_code
    info['terminal_url'] = resp.url
    content_type = resp.headers.get('Content-Type')
    if content_type:
        # keep only the mimetype, dropping parameters such as charset
        info['terminal_content_type'] = content_type.split(';')[0]
    info['issnl_in_body'] = issnl in resp.text
    info['gwb_url_success_dt'] = check_gwb(url, match_type='exact')
    info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact')
    info['blocked'] = sniff_blocked(resp)
    info['software_platform'] = sniff_platform(resp)
    #info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
    return info
def run(tsvfile):
    """
    Check every row of `tsvfile` and print one JSON object per row to stdout.

    `tsvfile` is any iterable of text lines with tab-separated columns:
    ISSN-L first, URL second (extra columns are ignored).

    Blank lines are skipped silently. Non-blank lines with fewer than two
    columns are reported on stderr and skipped, so one bad row does not
    abort the rest of the input.
    """
    for line in tsvfile:
        if not line.strip():
            # eg, a trailing empty line at the end of the file
            continue
        records = line.split('\t')
        if len(records) < 2:
            sys.stderr.write("SKIP malformed line (expected ISSN-L<TAB>URL): {}\n".format(line.strip()))
            continue
        issnl = records[0]
        url = records[1].strip()
        print(json.dumps(check_url(issnl, url)))
if __name__=="__main__":
    # With exactly one command-line argument, read the TSV from that path;
    # in every other case read lines from stdin.
    if len(sys.argv) != 2:
        run(sys.stdin)
    else:
        # context manager so the input file is closed even if run() raises
        with open(sys.argv[1], 'r') as f:
            run(f)
|