about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-23 15:01:20 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-23 15:01:20 -0700
commit 8c23bec37e410defa219650e13bb5b2aa3b3c974 (patch)
tree 1f738d05f47825734eb32eed7702f8c127b15123
parent c7c51a5e6fb17f04a96b2c7536c6acccf929865f (diff)
download chocula-8c23bec37e410defa219650e13bb5b2aa3b3c974.tar.gz
chocula-8c23bec37e410defa219650e13bb5b2aa3b3c974.zip
filter out more meta/index URL hosts
-rw-r--r-- chocula/database.py 16
1 file changed, 15 insertions, 1 deletion
diff --git a/chocula/database.py b/chocula/database.py
index 4606212..2588f60 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -48,6 +48,7 @@ class HomepageUrl:
not url
or "mailto:" in url.lower()
or url.lower() in ("http://n/a", "http://na/", "http://na")
+ or "LOCKSS_RESOLVER" in url
):
return None
if url.startswith("www."):
@@ -681,12 +682,25 @@ class ChoculaDatabase:
continue
if "://www.ncbi.nlm.nih.gov/" in hrow["url"]:
continue
+ if "LOCKSS_RESOLVER" in hrow["url"]:
+ continue
if "web.archive.org/web" in hrow["url"]:
webarchive_urls.append(hrow["url"])
urls.append(hrow["url"])
continue
- if hrow["host"] in ("www.google.com", "books.google.com"):
+ if hrow["host"] in (
+ "www.google.com",
+ "books.google.com",
+ "www.loc.gov",
+ "search.ebscohost.com",
+ "bibpurl.oclc.org",
+ "catalog.hathitrust.org",
+ "www.thefreelibrary.com",
+ "goo.gl",
+ "dx.doi.org",
+ ):
# individual books or google searches, not journal/conference homepages
+ # LOC scanned newspapers
continue
if "/oai/request" in hrow["url"]:
# OAI-PMH endpoints, not homepages