From a13787eaa6738e5f2ffb29d1d4d9a83617a1b943 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 30 Nov 2021 14:18:00 -0800 Subject: move skip logic from Makefile to check_issn_urls --- Makefile | 2 +- check_issn_urls.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5b14370..28108af 100644 --- a/Makefile +++ b/Makefile @@ -95,7 +95,7 @@ update-sources: data/$(TODAY)/kbart_JSTOR.txt data/$(TODAY)/kbart_CLOCKSS.txt da @echo "Successfully updated for date (UTC): $(TODAY)" data/$(TODAY)/homepage_status.json: - pipenv run python -m chocula export_urls | rg -v www.jstor.org | rg -v www.tandfonline.com | rg -v www.sciencedirect.com | rg -v link.springer.com | rg -v onlinelibrary.wiley.com | rg -v dialnet.unirioja.es | rg -v www.springer.com | rg -v www.journals.elsevier.com | rg -v web.archive.org | rg -v catalog.hathitrust.org | shuf | pv -l > /tmp/chocula_urls.tsv + pipenv run python -m chocula export_urls | shuf | pv -l > /tmp/chocula_urls.tsv pipenv run parallel -j10 --pipepart --line-buffer -a /tmp/chocula_urls.tsv ./check_issn_urls.py | pv -l > /tmp/homepage_status.json mv /tmp/homepage_status.json $@ diff --git a/check_issn_urls.py b/check_issn_urls.py index 6fbb05d..b00609f 100755 --- a/check_issn_urls.py +++ b/check_issn_urls.py @@ -120,6 +120,18 @@ def check_gwb(url, match_type="exact"): else: return None +HOST_SKIP_LIST = [ + "www.jstor.org", + "www.tandfonline.com", + "www.sciencedirect.com", + "link.springer.com", + "onlinelibrary.wiley.com", + "dialnet.unirioja.es", + "www.springer.com", + "www.journals.elsevier.com", + "web.archive.org", + "catalog.hathitrust.org", +] def check_url(issnl, url): # print("Fetching: %s" % url) @@ -132,6 +144,12 @@ def check_url(issnl, url): info["error"] = "url-not-http" info["terminal_status_code"] = -1 return info + for host in HOST_SKIP_LIST: + if f"://{host}/" in url: + info["error"] = "skip-host" + info["terminal_status_code"] = -1 + return info + try: resp = requests.get( url, -- cgit v1.2.3