about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-15 14:13:30 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-15 14:13:30 -0700
commit 1690a1ef4704da2c71fbda9b83cd7b1f7e7199d8 (patch)
tree 3cead02b24ff8407cfa4a2d971779aa7336da01d
parent 04e7348866ed01e890572650951fd1e7fed108e7 (diff)
download sandcrawler-1690a1ef4704da2c71fbda9b83cd7b1f7e7199d8.tar.gz
sandcrawler-1690a1ef4704da2c71fbda9b83cd7b1f7e7199d8.zip
ingest: more bogus domain patterns
-rw-r--r-- python/sandcrawler/ingest_file.py 3
1 files changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 1626292..cf87fff 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -116,6 +116,8 @@ class IngestFileWorker(SandcrawlerWorker):
self.max_html_resources = 200
self.base_url_blocklist = [
+ "://localhost/",
+ "://127.0.0.1/",
# robot blocking / rate-limited
"://hkvalidate.perfdrive.com/",
"://ieeexplore.ieee.org/",
@@ -127,6 +129,7 @@ class IngestFileWorker(SandcrawlerWorker):
"://openlibrary.org/",
"://www.openlibrary.org/",
"://fatcat.wiki/",
+ "://scholar.archive.org/",
"://orcid.org/",
# Domain squats
"://bartandjones.com",