diff options
-rw-r--r-- | chocula/database.py | 16 |
1 file changed, 15 insertions, 1 deletion
diff --git a/chocula/database.py b/chocula/database.py index 4606212..2588f60 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -48,6 +48,7 @@ class HomepageUrl: not url or "mailto:" in url.lower() or url.lower() in ("http://n/a", "http://na/", "http://na") + or "LOCKSS_RESOLVER" in url ): return None if url.startswith("www."): @@ -681,12 +682,25 @@ class ChoculaDatabase: continue if "://www.ncbi.nlm.nih.gov/" in hrow["url"]: continue + if "LOCKSS_RESOLVER" in hrow["url"]: + continue if "web.archive.org/web" in hrow["url"]: webarchive_urls.append(hrow["url"]) urls.append(hrow["url"]) continue - if hrow["host"] in ("www.google.com", "books.google.com"): + if hrow["host"] in ( + "www.google.com", + "books.google.com", + "www.loc.gov", + "search.ebscohost.com", + "bibpurl.oclc.org", + "catalog.hathitrust.org", + "www.thefreelibrary.com", + "goo.gl", + "dx.doi.org", + ): # individual books or google searches, not journal/conference homepages + # LOC scanned newspapers continue if "/oai/request" in hrow["url"]: # OAI-PMH endpoints, not homepages |