about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-07-21 14:36:38 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-21 14:36:38 -0700
commit: 547d50965dfd84fe03da3b85737e9a67bfab797b (patch)
tree: 65a26c22a8fdeae3b4f3de62376732703b045af2
parent: 19be6a657c695a4a9bcd77dbda25bcb2381a15f8 (diff)
download: chocula-547d50965dfd84fe03da3b85737e9a67bfab797b.tar.gz
chocula-547d50965dfd84fe03da3b85737e9a67bfab797b.zip
more homepage domains to ignore (and re-sort)
-rw-r--r-- chocula/database.py 61
1 files changed, 33 insertions, 28 deletions
diff --git a/chocula/database.py b/chocula/database.py
index db14700..70aa974 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -66,7 +66,8 @@ class HomepageUrl:
or "://search.ebscohost.com" in url
or "://search.proquest.com" in url
or "://gateway.proquest.com" in url
- or "://doaj.org" in url
+ or "://nbn-resolving.org/" in url
+ or "://e-helvetica.nb.admin.ch/" in url
):
return None
@@ -742,42 +743,46 @@ class ChoculaDatabase:
webarchive_urls.append(hrow["url"])
continue
if hrow["host"] in (
- "doaj.org",
- "www.doaj.org",
- "www.ncbi.nlm.nih.gov",
- "www.google.com",
- "books.google.com",
- "translate.google.com",
- "drive.google.com",
- "mail.google.com",
- "play.google.com",
- "news.google.com",
- "docs.google.com",
- "goo.gl",
- "www.loc.gov",
- "search.ebscohost.com",
+ "arxiv.org",
"bibpurl.oclc.org",
+ "books.google.com",
"catalog.hathitrust.org",
- "www.thefreelibrary.com",
- "goo.gl",
+ "crcnetbase.com",
+ "doaj.org",
+ "docs.google.com",
+ "drive.google.com",
"dx.doi.org",
+ "e-helvetica.nb.admin.ch",
"firstsearch.oclc.org",
- "www.umi.com",
- "umi.com",
- "search.informit.com.au",
- "search.ebscohost.com",
- "search.proquest.com",
"gateway.proquest.com",
- "purl.access.gpo.gov",
- "arxiv.org",
- "pubmedcentral.nih.gov",
- "ncbi.nlm.nih.gov",
+ "goo.gl",
"heinonline.org",
- "www.heinonline.org",
- "crcnetbase.com",
+ "mail.google.com",
+ "nbn-resolving.org",
+ "ncbi.nlm.nih.gov",
+ "news.google.com",
"nla.gov.au",
+ "opacplus.bsb-muenchen.de",
+ "play.google.com",
+ "proquest.umi.com",
+ "pubmedcentral.nih.gov",
+ "purl.access.gpo.gov",
"purl.nla.gov.au",
+ "search.ebscohost.com",
+ "search.epnet.com",
+ "search.informit.com.au",
+ "search.proquest.com",
+ "translate.google.com",
+ "umi.com",
"www.bibliothek.uni-regensburg.de",
+ "www.doaj.org",
+ "www.e-helvetica.nb.admin.ch",
+ "www.google.com",
+ "www.heinonline.org",
+ "www.loc.gov",
+ "www.ncbi.nlm.nih.gov",
+ "www.thefreelibrary.com",
+ "www.umi.com",
"zdb.uni-bielefeld.de",
):
# individual books or google searches, not journal/conference homepages