about | summary | refs | log | tree | commit | diff | stats
path: root/check_issn_urls.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-11-24 18:13:00 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-24 18:13:00 -0800
commit: 1f3382703e86190efd11f74bce00d61f64c8b174 (patch)
tree: b48dd8e82943daf72ee7683a84e3efe7b7f86fb9 /check_issn_urls.py
parent: 3e0a14e3d61f65e25f659d7f8b34aac7d0d223e6 (diff)
download: chocula-1f3382703e86190efd11f74bce00d61f64c8b174.tar.gz
chocula-1f3382703e86190efd11f74bce00d61f64c8b174.zip
check_issn_urls.py: yet more hacks in exceptions
Diffstat (limited to 'check_issn_urls.py')
-rwxr-xr-x check_issn_urls.py 56
1 files changed, 41 insertions, 15 deletions
diff --git a/check_issn_urls.py b/check_issn_urls.py
index 4391324..bb54259 100755
--- a/check_issn_urls.py
+++ b/check_issn_urls.py
@@ -90,19 +90,23 @@ def check_gwb(url, match_type="exact"):
if "//web.archive.org/" in url:
return None
# crude/bad retry loop to work around CDX API throttling
- for i in range(5):
- resp = requests.get(
- "https://web.archive.org/cdx/search/cdx",
- params={
- "url": url,
- "matchType": match_type,
- "limit": -1,
- "filter": "statuscode:200",
- },
- )
- if resp.status_code == 200:
+ for i in range(2):
+ try:
+ resp = requests.get(
+ "https://web.archive.org/cdx/search/cdx",
+ params={
+ "url": url,
+ "matchType": match_type,
+ "limit": -1,
+ "filter": "statuscode:200",
+ },
+ )
+ except Exception as e:
+ # nasty blanket catch
+ return None
+ if resp.status_code not in [200, 404]:
break
- time.sleep(5)
+ time.sleep(0.1)
if not resp.status_code == 200:
sys.stderr.write("CDX ERR {}: {}\n".format(resp.status_code, url))
# TODO: this isn't really correct, but not sure what to return/record
@@ -120,10 +124,18 @@ def check_gwb(url, match_type="exact"):
def check_url(issnl, url):
# print("Fetching: %s" % url)
info = dict(issnl=issnl, url=url)
+ if "://" not in url:
+ info["error"] = "bad-url"
+ info["terminal_status_code"] = -1
+ return info
+ if not url.startswith('http'):
+ info["error"] = "url-not-http"
+ info["terminal_status_code"] = -1
+ return info
try:
resp = requests.get(
url,
- timeout=15.0,
+ timeout=(5.0, 5.0),
headers={
"User-Agent": "ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org"
},
@@ -156,14 +168,28 @@ def check_url(issnl, url):
info["error"] = "InvalidSchema"
info["terminal_status_code"] = info["status_code"] = -1
return info
- except requests.exceptions.RemoteDisconnected:
- info["error"] = "RemoteDisconnected"
+ except ConnectionResetError:
+ info["error"] = "ConnectionResetError"
+ info["terminal_status_code"] = info["status_code"] = -1
+ return info
+ except requests.exceptions.ProtocolError:
+ info["error"] = "ProtocolError"
+ info["terminal_status_code"] = info["status_code"] = -1
+ return info
+ except requests.exceptions.InvalidURL:
+ info["error"] = "ProtocolError"
info["terminal_status_code"] = info["status_code"] = -1
return info
except UnicodeDecodeError:
info["error"] = "UnicodeDecodeError"
info["terminal_status_code"] = info["status_code"] = -1
return info
+ except Exception as e:
+ # nasty blanket catch
+ print(e, file=sys.stderr)
+ info["error"] = "other"
+ info["terminal_status_code"] = info["status_code"] = -1
+ return info
if resp.history:
info["status_code"] = resp.history[0].status_code