aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-11-30 14:18:00 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-11-30 14:18:00 -0800
commit a13787eaa6738e5f2ffb29d1d4d9a83617a1b943 (patch)
tree 74a8737c9ca3ca9d12fe9138d0de1e7e148108bc
parent 4d32b131d6fb7ae22b7c6533350ea614ba4139e0 (diff)
download chocula-a13787eaa6738e5f2ffb29d1d4d9a83617a1b943.tar.gz
chocula-a13787eaa6738e5f2ffb29d1d4d9a83617a1b943.zip
move skip logic from Makefile to check_issn_urls
-rw-r--r-- Makefile 2
-rwxr-xr-x check_issn_urls.py 18
2 files changed, 19 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index 5b14370..28108af 100644
--- a/Makefile
+++ b/Makefile
@@ -95,7 +95,7 @@ update-sources: data/$(TODAY)/kbart_JSTOR.txt data/$(TODAY)/kbart_CLOCKSS.txt da
@echo "Successfully updated for date (UTC): $(TODAY)"
data/$(TODAY)/homepage_status.json:
- pipenv run python -m chocula export_urls | rg -v www.jstor.org | rg -v www.tandfonline.com | rg -v www.sciencedirect.com | rg -v link.springer.com | rg -v onlinelibrary.wiley.com | rg -v dialnet.unirioja.es | rg -v www.springer.com | rg -v www.journals.elsevier.com | rg -v web.archive.org | rg -v catalog.hathitrust.org | shuf | pv -l > /tmp/chocula_urls.tsv
+ pipenv run python -m chocula export_urls | shuf | pv -l > /tmp/chocula_urls.tsv
pipenv run parallel -j10 --pipepart --line-buffer -a /tmp/chocula_urls.tsv ./check_issn_urls.py | pv -l > /tmp/homepage_status.json
mv /tmp/homepage_status.json $@
diff --git a/check_issn_urls.py b/check_issn_urls.py
index 6fbb05d..b00609f 100755
--- a/check_issn_urls.py
+++ b/check_issn_urls.py
@@ -120,6 +120,18 @@ def check_gwb(url, match_type="exact"):
else:
return None
+HOST_SKIP_LIST = [
+ "www.jstor.org",
+ "www.tandfonline.com",
+ "www.sciencedirect.com",
+ "link.springer.com",
+ "onlinelibrary.wiley.com",
+ "dialnet.unirioja.es",
+ "www.springer.com",
+ "www.journals.elsevier.com",
+ "web.archive.org",
+ "catalog.hathitrust.org",
+]
def check_url(issnl, url):
# print("Fetching: %s" % url)
@@ -132,6 +144,12 @@ def check_url(issnl, url):
info["error"] = "url-not-http"
info["terminal_status_code"] = -1
return info
+ for host in HOST_SKIP_LIST:
+ if f"://{host}/" in url:
+ info["error"] = "skip-host"
+ info["terminal_status_code"] = -1
+ return info
+
try:
resp = requests.get(
url,