diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 14:13:30 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 14:13:30 -0700 |
commit | 1690a1ef4704da2c71fbda9b83cd7b1f7e7199d8 (patch) | |
tree | 3cead02b24ff8407cfa4a2d971779aa7336da01d /python | |
parent | 04e7348866ed01e890572650951fd1e7fed108e7 (diff) | |
download | sandcrawler-1690a1ef4704da2c71fbda9b83cd7b1f7e7199d8.tar.gz sandcrawler-1690a1ef4704da2c71fbda9b83cd7b1f7e7199d8.zip |
ingest: more bogus domain patterns
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 3 |
1 file changed, 3 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 1626292..cf87fff 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -116,6 +116,8 @@ class IngestFileWorker(SandcrawlerWorker):
         self.max_html_resources = 200
 
         self.base_url_blocklist = [
+            "://localhost/",
+            "://127.0.0.1/",
             # robot blocking / rate-limited
             "://hkvalidate.perfdrive.com/",
             "://ieeexplore.ieee.org/",
@@ -127,6 +129,7 @@ class IngestFileWorker(SandcrawlerWorker):
             "://openlibrary.org/",
             "://www.openlibrary.org/",
             "://fatcat.wiki/",
+            "://scholar.archive.org/",
             "://orcid.org/",
             # Domain squats
             "://bartandjones.com",
```