about | summary | refs | log | tree | commit | diff | stats
path: root/check_issn_urls.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-11-30 14:18:00 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-30 14:18:00 -0800
commit: a13787eaa6738e5f2ffb29d1d4d9a83617a1b943 (patch)
tree: 74a8737c9ca3ca9d12fe9138d0de1e7e148108bc /check_issn_urls.py
parent: 4d32b131d6fb7ae22b7c6533350ea614ba4139e0 (diff)
download: chocula-a13787eaa6738e5f2ffb29d1d4d9a83617a1b943.tar.gz
chocula-a13787eaa6738e5f2ffb29d1d4d9a83617a1b943.zip
move skip logic from Makefile to check_issn_urls
Diffstat (limited to 'check_issn_urls.py')
-rwxr-xr-x check_issn_urls.py 18
1 file changed, 18 insertions, 0 deletions
diff --git a/check_issn_urls.py b/check_issn_urls.py
index 6fbb05d..b00609f 100755
--- a/check_issn_urls.py
+++ b/check_issn_urls.py
@@ -120,6 +120,18 @@ def check_gwb(url, match_type="exact"):
else:
return None
+HOST_SKIP_LIST = [
+ "www.jstor.org",
+ "www.tandfonline.com",
+ "www.sciencedirect.com",
+ "link.springer.com",
+ "onlinelibrary.wiley.com",
+ "dialnet.unirioja.es",
+ "www.springer.com",
+ "www.journals.elsevier.com",
+ "web.archive.org",
+ "catalog.hathitrust.org",
+]
def check_url(issnl, url):
# print("Fetching: %s" % url)
@@ -132,6 +144,12 @@ def check_url(issnl, url):
info["error"] = "url-not-http"
info["terminal_status_code"] = -1
return info
+ for host in HOST_SKIP_LIST:
+ if f"://{host}/" in url:
+ info["error"] = "skip-host"
+ info["terminal_status_code"] = -1
+ return info
+
try:
resp = requests.get(
url,