diff options
Diffstat (limited to 'check_issn_urls.py')
-rwxr-xr-x | check_issn_urls.py | 133 |
1 files changed, 77 insertions, 56 deletions
diff --git a/check_issn_urls.py b/check_issn_urls.py index 1135d6c..23169f1 100755 --- a/check_issn_urls.py +++ b/check_issn_urls.py @@ -45,17 +45,17 @@ def sniff_platform(resp): """ # these are mostly here to filter out huge platforms and stop sniffing domain_map = { - 'jstor.org/': 'jstor', - 'springer.com/': 'springer', - 'springerlink.com/': 'springer', - 'tandfonline.com/': 't_and_f', - 'elsevier.com/': 'elsevier', - 'wiley.com/': 'wiley', - 'sciencedirect.com/': 'elsevier', - 'sagepub.com/': 'sage', - 'hypotheses.org/': 'hypothesis', - 'tandf.co.uk/': 't_and_f', - 'scielo': 'scielo', + "jstor.org/": "jstor", + "springer.com/": "springer", + "springerlink.com/": "springer", + "tandfonline.com/": "t_and_f", + "elsevier.com/": "elsevier", + "wiley.com/": "wiley", + "sciencedirect.com/": "elsevier", + "sagepub.com/": "sage", + "hypotheses.org/": "hypothesis", + "tandf.co.uk/": "t_and_f", + "scielo": "scielo", } for domain, platform in domain_map.items(): if domain in resp.url: @@ -64,6 +64,7 @@ def sniff_platform(resp): return "ojs" return None + def sniff_blocked(resp): """ This function would try to figure out if we got blocked: soft-block, hard @@ -73,23 +74,33 @@ def sniff_blocked(resp): if resp.status_code in (403, 420): return True # JSTOR does this - if 'Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA' in resp.text: + if ( + "Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA" + in resp.text + ): return True - if resp.status_code == 416 and 'something about your browser made us think you were a bot' in resp.text: + if ( + resp.status_code == 416 + and "something about your browser made us think you were a bot" in resp.text + ): return True return None -def check_gwb(url, match_type='exact'): - if '//web.archive.org/' in url: + +def check_gwb(url, match_type="exact"): + if "//web.archive.org/" in url: return None # crude/bad retry loop to work around CDX API throttling for i in range(5): - resp = requests.get('https://web.archive.org/cdx/search/cdx', params={ - 'url': url, - 'matchType': match_type, - 'limit': -1, - 'filter': 'statuscode:200' - }) + resp = requests.get( + "https://web.archive.org/cdx/search/cdx", + params={ + "url": url, + "matchType": match_type, + "limit": -1, + "filter": "statuscode:200", + }, + ) if resp.status_code == 200: break time.sleep(5) @@ -98,81 +109,91 @@ def check_gwb(url, match_type='exact'): # TODO: this isn't really correct, but not sure what to return/record # if we failed through all timeouts return None - line = resp.text.strip().split('\n')[0] + line = resp.text.strip().split("\n")[0] if line: dt = line.split()[1] int(dt) return dt else: return None - + def check_url(issnl, url): - #print("Fetching: %s" % url) + # print("Fetching: %s" % url) info = dict(issnl=issnl, url=url) try: - resp = requests.get(url, timeout=30., headers={'User-Agent': 'ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org'}) + resp = requests.get( + url, + timeout=30.0, + headers={ + "User-Agent": "ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org" + }, + ) except requests.exceptions.TooManyRedirects: - info['error'] = 'TooManyRedirects' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "TooManyRedirects" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.SSLError: - info['error'] = 'SSLError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "SSLError" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.ReadTimeout: - info['error'] = 'ReadTimeout' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "ReadTimeout" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.ConnectionError: - info['error'] = 'ConnectionError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "ConnectionError" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.ChunkedEncodingError: - info['error'] = 'ChunkedEncodingError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "ChunkedEncodingError" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.ContentDecodingError: - info['error'] = 'ContentDecodingError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "ContentDecodingError" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.InvalidSchema: - info['error'] = 'InvalidSchema' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "InvalidSchema" + info["terminal_status_code"] = info["status_code"] = -1 return info except UnicodeDecodeError: - info['error'] = 'UnicodeDecodeError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "UnicodeDecodeError" + info["terminal_status_code"] = info["status_code"] = -1 return info if resp.history: - info['status_code'] = resp.history[0].status_code + info["status_code"] = resp.history[0].status_code else: - info['status_code'] = resp.status_code + info["status_code"] = resp.status_code - info['terminal_status_code'] = resp.status_code - info['terminal_url'] = resp.url - content_type = resp.headers.get('Content-Type') + info["terminal_status_code"] = resp.status_code + info["terminal_url"] = resp.url + content_type = resp.headers.get("Content-Type") if content_type: - info['terminal_content_type'] = content_type.split(';')[0] - info['issnl_in_body'] = bool(issnl in resp.text) - info['gwb_url_success_dt'] = check_gwb(url, match_type='exact') - info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact') - info['blocked'] = sniff_blocked(resp) - info['software_platform'] = sniff_platform(resp) - #info['gwb_host_success_dt'] = check_gwb(url, match_type='host') + info["terminal_content_type"] = content_type.split(";")[0] + info["issnl_in_body"] = bool(issnl in resp.text) + info["gwb_url_success_dt"] = check_gwb(url, match_type="exact") + info["gwb_terminal_url_success_dt"] = check_gwb( + info["terminal_url"], match_type="exact" + ) + info["blocked"] = sniff_blocked(resp) + info["software_platform"] = sniff_platform(resp) + # info['gwb_host_success_dt'] = check_gwb(url, match_type='host') return info + def run(tsvfile): for line in tsvfile: - records = line.split('\t') + records = line.split("\t") issnl = records[0] url = records[1].strip() print(json.dumps(check_url(issnl, url))) -if __name__=="__main__": + +if __name__ == "__main__": if len(sys.argv) != 2: f = sys.stdin else: - f = open(sys.argv[1], 'r') + f = open(sys.argv[1], "r") run(f) |