From 547d50965dfd84fe03da3b85737e9a67bfab797b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 Jul 2022 14:36:38 -0700 Subject: more homepage domains to ignore (and resort) --- chocula/database.py | 61 +++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 28 deletions(-) (limited to 'chocula') diff --git a/chocula/database.py b/chocula/database.py index db14700..70aa974 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -66,7 +66,8 @@ class HomepageUrl: or "://search.ebscohost.com" in url or "://search.proquest.com" in url or "://gateway.proquest.com" in url - or "://doaj.org" in url + or "://nbn-resolving.org/" in url + or "://e-helvetica.nb.admin.ch/" in url ): return None @@ -742,42 +743,46 @@ class ChoculaDatabase: webarchive_urls.append(hrow["url"]) continue if hrow["host"] in ( - "doaj.org", - "www.doaj.org", - "www.ncbi.nlm.nih.gov", - "www.google.com", - "books.google.com", - "translate.google.com", - "drive.google.com", - "mail.google.com", - "play.google.com", - "news.google.com", - "docs.google.com", - "goo.gl", - "www.loc.gov", - "search.ebscohost.com", + "arxiv.org", "bibpurl.oclc.org", + "books.google.com", "catalog.hathitrust.org", - "www.thefreelibrary.com", - "goo.gl", + "crcnetbase.com", + "doaj.org", + "docs.google.com", + "drive.google.com", "dx.doi.org", + "e-helvetica.nb.admin.ch", "firstsearch.oclc.org", - "www.umi.com", - "umi.com", - "search.informit.com.au", - "search.ebscohost.com", - "search.proquest.com", "gateway.proquest.com", - "purl.access.gpo.gov", - "arxiv.org", - "pubmedcentral.nih.gov", - "ncbi.nlm.nih.gov", + "goo.gl", "heinonline.org", - "www.heinonline.org", - "crcnetbase.com", + "mail.google.com", + "nbn-resolving.org", + "ncbi.nlm.nih.gov", + "news.google.com", "nla.gov.au", + "opacplus.bsb-muenchen.de", + "play.google.com", + "proquest.umi.com", + "pubmedcentral.nih.gov", + "purl.access.gpo.gov", "purl.nla.gov.au", + "search.ebscohost.com", + "search.epnet.com", + "search.informit.com.au", + "search.proquest.com", + "translate.google.com", + "umi.com", "www.bibliothek.uni-regensburg.de", + "www.doaj.org", + "www.e-helvetica.nb.admin.ch", + "www.google.com", + "www.heinonline.org", + "www.loc.gov", + "www.ncbi.nlm.nih.gov", + "www.thefreelibrary.com", + "www.umi.com", "zdb.uni-bielefeld.de", ): # individual books or google searches, not journal/conference homepages -- cgit v1.2.3