From 8c23bec37e410defa219650e13bb5b2aa3b3c974 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 23 Jun 2020 15:01:20 -0700 Subject: filter out more meta/index URL hosts --- chocula/database.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/chocula/database.py b/chocula/database.py index 4606212..2588f60 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -48,6 +48,7 @@ class HomepageUrl: not url or "mailto:" in url.lower() or url.lower() in ("http://n/a", "http://na/", "http://na") + or "LOCKSS_RESOLVER" in url ): return None if url.startswith("www."): @@ -681,12 +682,25 @@ class ChoculaDatabase: continue if "://www.ncbi.nlm.nih.gov/" in hrow["url"]: continue + if "LOCKSS_RESOLVER" in hrow["url"]: + continue if "web.archive.org/web" in hrow["url"]: webarchive_urls.append(hrow["url"]) urls.append(hrow["url"]) continue - if hrow["host"] in ("www.google.com", "books.google.com"): + if hrow["host"] in ( + "www.google.com", + "books.google.com", + "www.loc.gov", + "search.ebscohost.com", + "bibpurl.oclc.org", + "catalog.hathitrust.org", + "www.thefreelibrary.com", + "goo.gl", + "dx.doi.org", + ): # individual books or google searches, not journal/conference homepages + # LOC scanned newspapers continue if "/oai/request" in hrow["url"]: # OAI-PMH endpoints, not homepages -- cgit v1.2.3