| | | |
|---|---|---|
| author | Bryan Newbold <bnewbold@archive.org> | 2021-11-24 18:13:00 -0800 |
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-24 18:13:00 -0800 |
| commit | 1f3382703e86190efd11f74bce00d61f64c8b174 (patch) | |
| tree | b48dd8e82943daf72ee7683a84e3efe7b7f86fb9 | |
| parent | 3e0a14e3d61f65e25f659d7f8b34aac7d0d223e6 (diff) | |
check_issn_urls.py: yet more hacks in exceptions
-rwxr-xr-x | check_issn_urls.py | 56 |
1 file changed, 41 insertions, 15 deletions
```diff
diff --git a/check_issn_urls.py b/check_issn_urls.py
index 4391324..bb54259 100755
--- a/check_issn_urls.py
+++ b/check_issn_urls.py
@@ -90,19 +90,23 @@ def check_gwb(url, match_type="exact"):
     if "//web.archive.org/" in url:
         return None
     # crude/bad retry loop to work around CDX API throttling
-    for i in range(5):
-        resp = requests.get(
-            "https://web.archive.org/cdx/search/cdx",
-            params={
-                "url": url,
-                "matchType": match_type,
-                "limit": -1,
-                "filter": "statuscode:200",
-            },
-        )
-        if resp.status_code == 200:
+    for i in range(2):
+        try:
+            resp = requests.get(
+                "https://web.archive.org/cdx/search/cdx",
+                params={
+                    "url": url,
+                    "matchType": match_type,
+                    "limit": -1,
+                    "filter": "statuscode:200",
+                },
+            )
+        except Exception as e:
+            # nasty blanket catch
+            return None
+        if resp.status_code not in [200, 404]:
             break
-        time.sleep(5)
+        time.sleep(0.1)
     if not resp.status_code == 200:
         sys.stderr.write("CDX ERR {}: {}\n".format(resp.status_code, url))
         # TODO: this isn't really correct, but not sure what to return/record
@@ -120,10 +124,18 @@ def check_gwb(url, match_type="exact"):
 def check_url(issnl, url):
     # print("Fetching: %s" % url)
     info = dict(issnl=issnl, url=url)
+    if "://" not in url:
+        info["error"] = "bad-url"
+        info["terminal_status_code"] = -1
+        return info
+    if not url.startswith('http'):
+        info["error"] = "url-not-http"
+        info["terminal_status_code"] = -1
+        return info
     try:
         resp = requests.get(
             url,
-            timeout=15.0,
+            timeout=(5.0, 5.0),
             headers={
                 "User-Agent": "ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org"
             },
@@ -156,14 +168,28 @@ def check_url(issnl, url):
         info["error"] = "InvalidSchema"
         info["terminal_status_code"] = info["status_code"] = -1
         return info
-    except requests.exceptions.RemoteDisconnected:
-        info["error"] = "RemoteDisconnected"
+    except ConnectionResetError:
+        info["error"] = "ConnectionResetError"
+        info["terminal_status_code"] = info["status_code"] = -1
+        return info
+    except requests.exceptions.ProtocolError:
+        info["error"] = "ProtocolError"
+        info["terminal_status_code"] = info["status_code"] = -1
+        return info
+    except requests.exceptions.InvalidURL:
+        info["error"] = "ProtocolError"
         info["terminal_status_code"] = info["status_code"] = -1
         return info
     except UnicodeDecodeError:
         info["error"] = "UnicodeDecodeError"
         info["terminal_status_code"] = info["status_code"] = -1
         return info
+    except Exception as e:
+        # nasty blanket catch
+        print(e, file=sys.stderr)
+        info["error"] = "other"
+        info["terminal_status_code"] = info["status_code"] = -1
+        return info
 
     if resp.history:
         info["status_code"] = resp.history[0].status_code
```
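The pattern the patch moves toward is easier to see outside the diff. Below is a minimal standalone sketch, not the repository's actual `check_gwb()`: a bounded retry loop around the CDX API call, with any request-level exception swallowed and treated as "no capture found", and a short sleep between attempts. The function name `cdx_lookup`, the break-on-success condition, and the explicit `timeout` argument are illustrative assumptions; the endpoint and query parameters come from the diff above, and the `(5.0, 5.0)` timeout mirrors the `check_url()` change.

```python
import time
import requests


def cdx_lookup(url, match_type="exact"):
    """Illustrative sketch only: bounded retry around the CDX API with a
    blanket except, loosely following the patched check_gwb() above."""
    resp = None
    for _ in range(2):
        try:
            resp = requests.get(
                "https://web.archive.org/cdx/search/cdx",
                params={
                    "url": url,
                    "matchType": match_type,
                    "limit": -1,
                    "filter": "statuscode:200",
                },
                # (connect timeout, read timeout); the patch uses this tuple
                # form in check_url(), shown here for illustration
                timeout=(5.0, 5.0),
            )
        except Exception:
            # blanket catch: treat any network/protocol failure as "no capture"
            return None
        if resp.status_code == 200:
            break
        time.sleep(0.1)  # brief pause before retrying, e.g. on throttling
    if resp is None or resp.status_code != 200:
        return None
    # each line of the CDX response describes one matching capture
    lines = resp.text.splitlines()
    return lines[0] if lines else None
```

On the timeout change itself: `timeout=(5.0, 5.0)` is requests' two-element form, which sets the connect and read timeouts separately, so a connection that stalls mid-read now gives up after 5 seconds instead of the previous single 15-second value (which requests applies to both phases).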