diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-23 15:01:20 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-23 15:01:20 -0700 |
commit | 8c23bec37e410defa219650e13bb5b2aa3b3c974 (patch) | |
tree | 1f738d05f47825734eb32eed7702f8c127b15123 | |
parent | c7c51a5e6fb17f04a96b2c7536c6acccf929865f (diff) | |
download | chocula-8c23bec37e410defa219650e13bb5b2aa3b3c974.tar.gz chocula-8c23bec37e410defa219650e13bb5b2aa3b3c974.zip |
filter out more meta/index URL hosts
-rw-r--r-- | chocula/database.py | 16 |
1 file changed, 15 insertions, 1 deletion
diff --git a/chocula/database.py b/chocula/database.py index 4606212..2588f60 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -48,6 +48,7 @@ class HomepageUrl: not url or "mailto:" in url.lower() or url.lower() in ("http://n/a", "http://na/", "http://na") + or "LOCKSS_RESOLVER" in url ): return None if url.startswith("www."): @@ -681,12 +682,25 @@ class ChoculaDatabase: continue if "://www.ncbi.nlm.nih.gov/" in hrow["url"]: continue + if "LOCKSS_RESOLVER" in hrow["url"]: + continue if "web.archive.org/web" in hrow["url"]: webarchive_urls.append(hrow["url"]) urls.append(hrow["url"]) continue - if hrow["host"] in ("www.google.com", "books.google.com"): + if hrow["host"] in ( + "www.google.com", + "books.google.com", + "www.loc.gov", + "search.ebscohost.com", + "bibpurl.oclc.org", + "catalog.hathitrust.org", + "www.thefreelibrary.com", + "goo.gl", + "dx.doi.org", + ): # individual books or google searches, not journal/conference homepages + # LOC scanned newspapers continue if "/oai/request" in hrow["url"]: # OAI-PMH endpoints, not homepages |