Diffstat (limited to 'check_issn_urls.py')
-rwxr-xr-x  check_issn_urls.py  133
1 file changed, 77 insertions(+), 56 deletions(-)
diff --git a/check_issn_urls.py b/check_issn_urls.py
index 1135d6c..23169f1 100755
--- a/check_issn_urls.py
+++ b/check_issn_urls.py
@@ -45,17 +45,17 @@ def sniff_platform(resp):
     """
     # these are mostly here to filter out huge platforms and stop sniffing
     domain_map = {
-        'jstor.org/': 'jstor',
-        'springer.com/': 'springer',
-        'springerlink.com/': 'springer',
-        'tandfonline.com/': 't_and_f',
-        'elsevier.com/': 'elsevier',
-        'wiley.com/': 'wiley',
-        'sciencedirect.com/': 'elsevier',
-        'sagepub.com/': 'sage',
-        'hypotheses.org/': 'hypothesis',
-        'tandf.co.uk/': 't_and_f',
-        'scielo': 'scielo',
+        "jstor.org/": "jstor",
+        "springer.com/": "springer",
+        "springerlink.com/": "springer",
+        "tandfonline.com/": "t_and_f",
+        "elsevier.com/": "elsevier",
+        "wiley.com/": "wiley",
+        "sciencedirect.com/": "elsevier",
+        "sagepub.com/": "sage",
+        "hypotheses.org/": "hypothesis",
+        "tandf.co.uk/": "t_and_f",
+        "scielo": "scielo",
     }
     for domain, platform in domain_map.items():
         if domain in resp.url:
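
An aside on behavior this hunk preserves: domain_map is matched by substring against the final response URL, so a bare key like "scielo" covers every regional SciELO domain. A minimal sketch of that lookup, with a hypothetical Resp stand-in and made-up URLs (the real code passes a requests.Response):

    from typing import Optional


    class Resp:
        """Hypothetical stand-in for requests.Response; only .url is used."""

        def __init__(self, url: str):
            self.url = url


    DOMAIN_MAP = {
        "jstor.org/": "jstor",
        "sciencedirect.com/": "elsevier",
        "scielo": "scielo",  # bare substring: matches scielo.br, scielo.org.mx, ...
    }


    def sniff(resp: Resp) -> Optional[str]:
        # same substring scan as sniff_platform() above
        for domain, platform in DOMAIN_MAP.items():
            if domain in resp.url:
                return platform
        return None


    assert sniff(Resp("https://www.jstor.org/stable/40981045")) == "jstor"
    assert sniff(Resp("https://www.scielo.br/j/rbgo/")) == "scielo"
    assert sniff(Resp("https://example.com/")) is None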
@@ -64,6 +64,7 @@ def sniff_platform(resp):
         return "ojs"
     return None
 
+
 def sniff_blocked(resp):
     """
     This function would try to figure out if we got blocked: soft-block, hard
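
The sniff_blocked() logic reformatted in the next hunk checks for hard blocks (HTTP 403 and 420) and for bot-detection text in the response body. A condensed sketch of those checks, using a hypothetical FakeResp stand-in rather than a live requests.Response:

    from typing import Optional


    class FakeResp:
        """Hypothetical stand-in exposing just .status_code and .text."""

        def __init__(self, status_code: int, text: str = ""):
            self.status_code = status_code
            self.text = text


    def is_blocked(resp) -> Optional[bool]:
        # hard block: outright refusal status codes
        if resp.status_code in (403, 420):
            return True
        # soft block: a 416 whose body carries a bot-detection notice
        if resp.status_code == 416 and "think you were a bot" in resp.text:
            return True
        return None


    assert is_blocked(FakeResp(403)) is True
    assert is_blocked(FakeResp(200, "ordinary journal homepage")) is None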
@@ -73,23 +74,33 @@ def sniff_blocked(resp):
     if resp.status_code in (403, 420):
         return True
     # JSTOR does this
-    if 'Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA' in resp.text:
+    if (
+        "Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA"
+        in resp.text
+    ):
         return True
-    if resp.status_code == 416 and 'something about your browser made us think you were a bot' in resp.text:
+    if (
+        resp.status_code == 416
+        and "something about your browser made us think you were a bot" in resp.text
+    ):
         return True
     return None
 
-def check_gwb(url, match_type='exact'):
-    if '//web.archive.org/' in url:
+
+def check_gwb(url, match_type="exact"):
+    if "//web.archive.org/" in url:
         return None
     # crude/bad retry loop to work around CDX API throttling
     for i in range(5):
-        resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
-            'url': url,
-            'matchType': match_type,
-            'limit': -1,
-            'filter': 'statuscode:200'
-        })
+        resp = requests.get(
+            "https://web.archive.org/cdx/search/cdx",
+            params={
+                "url": url,
+                "matchType": match_type,
+                "limit": -1,
+                "filter": "statuscode:200",
+            },
+        )
         if resp.status_code == 200:
             break
         time.sleep(5)
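
check_gwb() continues in the next hunk: the first line of the CDX response is split on whitespace, the second field is the 14-digit capture timestamp, and int(dt) acts as a sanity check that raises on malformed output. A sketch of that parsing against an invented sample line (the record is made up, but follows the default CDX field order of urlkey, timestamp, original, mimetype, statuscode, digest, length):

    # Invented sample line in the default CDX API output format.
    sample = (
        "org,example)/ 20190227221205 http://example.org/ "
        "text/html 200 AAAA1111BBBB2222CCCC3333DDDD4444EEEE5555 1043"
    )

    line = sample.strip().split("\n")[0]
    if line:
        dt = line.split()[1]
        int(dt)  # raises ValueError if the API returned something unexpected
        print(dt)  # 20190227221205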
@@ -98,81 +109,91 @@ def check_gwb(url, match_type="exact"):
         # TODO: this isn't really correct, but not sure what to return/record
         # if we failed through all timeouts
         return None
 
-    line = resp.text.strip().split('\n')[0]
+    line = resp.text.strip().split("\n")[0]
     if line:
         dt = line.split()[1]
         int(dt)
         return dt
     else:
         return None
 
-
+
 def check_url(issnl, url):
-    #print("Fetching: %s" % url)
+    # print("Fetching: %s" % url)
     info = dict(issnl=issnl, url=url)
     try:
-        resp = requests.get(url, timeout=30., headers={'User-Agent': 'ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org'})
+        resp = requests.get(
+            url,
+            timeout=30.0,
+            headers={
+                "User-Agent": "ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org"
+            },
+        )
     except requests.exceptions.TooManyRedirects:
-        info['error'] = 'TooManyRedirects'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "TooManyRedirects"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.SSLError:
-        info['error'] = 'SSLError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "SSLError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.ReadTimeout:
-        info['error'] = 'ReadTimeout'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "ReadTimeout"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.ConnectionError:
-        info['error'] = 'ConnectionError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "ConnectionError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.ChunkedEncodingError:
-        info['error'] = 'ChunkedEncodingError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "ChunkedEncodingError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.ContentDecodingError:
-        info['error'] = 'ContentDecodingError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "ContentDecodingError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.InvalidSchema:
-        info['error'] = 'InvalidSchema'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "InvalidSchema"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except UnicodeDecodeError:
-        info['error'] = 'UnicodeDecodeError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "UnicodeDecodeError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     if resp.history:
-        info['status_code'] = resp.history[0].status_code
+        info["status_code"] = resp.history[0].status_code
     else:
-        info['status_code'] = resp.status_code
+        info["status_code"] = resp.status_code
 
-    info['terminal_status_code'] = resp.status_code
-    info['terminal_url'] = resp.url
-    content_type = resp.headers.get('Content-Type')
+    info["terminal_status_code"] = resp.status_code
+    info["terminal_url"] = resp.url
+    content_type = resp.headers.get("Content-Type")
     if content_type:
-        info['terminal_content_type'] = content_type.split(';')[0]
-    info['issnl_in_body'] = bool(issnl in resp.text)
-    info['gwb_url_success_dt'] = check_gwb(url, match_type='exact')
-    info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact')
-    info['blocked'] = sniff_blocked(resp)
-    info['software_platform'] = sniff_platform(resp)
-    #info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
+        info["terminal_content_type"] = content_type.split(";")[0]
+    info["issnl_in_body"] = bool(issnl in resp.text)
+    info["gwb_url_success_dt"] = check_gwb(url, match_type="exact")
+    info["gwb_terminal_url_success_dt"] = check_gwb(
+        info["terminal_url"], match_type="exact"
+    )
+    info["blocked"] = sniff_blocked(resp)
+    info["software_platform"] = sniff_platform(resp)
+    # info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
     return info
 
+
 def run(tsvfile):
     for line in tsvfile:
-        records = line.split('\t')
+        records = line.split("\t")
         issnl = records[0]
         url = records[1].strip()
         print(json.dumps(check_url(issnl, url)))
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     if len(sys.argv) != 2:
         f = sys.stdin
     else:
-        f = open(sys.argv[1], 'r')
+        f = open(sys.argv[1], "r")
     run(f)
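
The script's interface is unchanged by this reformatting: run() reads tab-separated rows (ISSN-L, then URL) from a file argument or stdin and prints one JSON object per row. A hypothetical smoke test, assuming the script is in the working directory and outbound network access is available:

    import json
    import subprocess

    # One made-up TSV row: ISSN-L, tab, URL.
    row = "0000-0000\thttps://example.com/\n"
    proc = subprocess.run(
        ["./check_issn_urls.py"],
        input=row,
        capture_output=True,
        text=True,
        check=True,
    )
    info = json.loads(proc.stdout.splitlines()[0])
    print(info["status_code"], info.get("terminal_url"))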