From 3945ef26d5024e4efe81374b8eb562ffd5b09613 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 23 Jun 2020 18:39:45 -0700 Subject: block/skip more homepage patterns --- chocula/database.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/chocula/database.py b/chocula/database.py index 54d3b60..b800f65 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -49,6 +49,11 @@ class HomepageUrl: or "mailto:" in url.lower() or url.lower() in ("http://n/a", "http://na/", "http://na") or "LOCKSS_RESOLVER" in url + or "$result.AccessURL" in url + or "://firstsearch.oclc.org" in url + or "://bibpurl.oclc.org" in url + or "://books.google.com" in url + or "://search.ebscohost.com" in url ): return None if url.startswith("www."): @@ -703,6 +708,10 @@ class ChoculaDatabase: "www.thefreelibrary.com", "goo.gl", "dx.doi.org", + "firstsearch.oclc.org", + "www.umi.com", + "search.informit.com.au", + "search.ebscohost.com", ): # individual books or google searches, not journal/conference homepages # LOC scanned newspapers -- cgit v1.2.3