1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
#!/usr/bin/env python3
"""
Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to
stdout.
The stdin thing means you can:
parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv
For each URL, do a request and record:
ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL
HTTP status will be -1 if domain does not even resolve.
"local HTTP status" is the HTTP status code modulo same-domain redirects. This
is intended to accomodate HTTPS upgrades, changes in web app URL schemes, etc.
Will be the same as HTTP status if redirect is non-local.
TODO: detect domain squatting/spam?
"""
import os
import sys
import requests
def check_url(url):
    """Fetch `url` and report its redirect/status behavior.

    Returns a 4-tuple of strings:
        (url, first HTTP status, final HTTP status, final URL)

    The "first" status is the status of the initial response (the first
    hop of a redirect chain, if any); the "final" status and URL come
    from the response after all redirects were followed. On any request
    failure (DNS failure, connect error, timeout, ...) the statuses are
    "-1" and the final URL is "-".
    """
    try:
        # Timeout so a single unresponsive host can't stall the whole
        # batch run indefinitely.
        resp = requests.get(url, timeout=30.0)
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any request-level failure maps to sentinel
        # values.
        return (url, "-1", "-1", '-')
    if resp.history:
        # Redirect chain: first status comes from the original response.
        first_status = resp.history[0].status_code
    else:
        first_status = resp.status_code
    # Return a concrete tuple (the original returned a lazy `map` object
    # on success but a tuple on failure); callers that `'\t'.join(...)`
    # the result are unaffected.
    return tuple(map(str, (url, first_status, resp.status_code, resp.url)))
def run(tsvfile):
    """Check every URL listed in `tsvfile` and print results to stdout.

    `tsvfile` is an iterable of lines, each "ISSN-L<TAB>URL[...]".
    For each line, prints a TSV record:
        ISSN-L, URL, first HTTP status, final HTTP status, final URL

    Blank or malformed lines (fewer than two tab-separated fields) are
    skipped instead of crashing, since stdin piped through `parallel`
    may contain stray empty lines.
    """
    for line in tsvfile:
        fields = line.split('\t')
        if len(fields) < 2:
            # Guard against blank/short lines (original raised IndexError).
            continue
        issnl = fields[0]
        url = fields[1].strip()
        print(issnl + '\t' + '\t'.join(check_url(url)))
if __name__ == "__main__":
    # With a filename argument, read the TSV from that file (closing it
    # when done — the original leaked the file handle); otherwise read
    # lines from stdin so the script composes with `parallel --pipepart`.
    if len(sys.argv) != 2:
        run(sys.stdin)
    else:
        with open(sys.argv[1], 'r') as f:
            run(f)
|