From 1690a1ef4704da2c71fbda9b83cd7b1f7e7199d8 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 15 Jul 2022 14:13:30 -0700
Subject: ingest: more bogus domain patterns

---
 python/sandcrawler/ingest_file.py | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 1626292..cf87fff 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -116,6 +116,8 @@ class IngestFileWorker(SandcrawlerWorker):
         self.max_html_resources = 200
 
         self.base_url_blocklist = [
+            "://localhost/",
+            "://127.0.0.1/",
             # robot blocking / rate-limited
             "://hkvalidate.perfdrive.com/",
             "://ieeexplore.ieee.org/",
@@ -127,6 +129,7 @@ class IngestFileWorker(SandcrawlerWorker):
             "://openlibrary.org/",
             "://www.openlibrary.org/",
             "://fatcat.wiki/",
+            "://scholar.archive.org/",
             "://orcid.org/",
             # Domain squats
             "://bartandjones.com",
-- 
cgit v1.2.3