diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2022-07-07 13:17:49 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-07 13:17:49 -0700 |
| commit | 695a80a64f02f4c23bb938ecfffeef146344841f (patch) | |
| tree | 8a8aef3b3e9880bb6404039f0423b22bcdb0b7aa | |
| parent | fcc5a1648d2e49e7002ca569ed668d3318a75584 (diff) | |
| download | sandcrawler-695a80a64f02f4c23bb938ecfffeef146344841f.tar.gz sandcrawler-695a80a64f02f4c23bb938ecfffeef146344841f.zip | |
ingest: IEEE domain is blocking us
| -rw-r--r-- | python/sandcrawler/ingest_file.py | 3 |
1 file changed, 2 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index eca8bf8..c79973f 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -116,8 +116,9 @@ class IngestFileWorker(SandcrawlerWorker):
         self.max_html_resources = 200
 
         self.base_url_blocklist = [
-            # robot blocking
+            # robot blocking / rate-limited
             "://hkvalidate.perfdrive.com/",
+            "://ieeexplore.ieee.org/",
             # temporary, until we implement specific fetch and 'petabox' output
             "://archive.org/",
             "://www.archive.org/",
```
