diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 15:44:47 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 15:44:47 -0800 |
commit | fd2d3f95fcdb10084d8c9e52f3696c3095e9dde7 (patch) | |
tree | c44b2a26d495dffa2456007184e74ed2b3c4a52b | |
parent | 55815b2e62a3ce53d5e71d0c6fd676b6cbf5baca (diff) | |
download | sandcrawler-fd2d3f95fcdb10084d8c9e52f3696c3095e9dde7.tar.gz sandcrawler-fd2d3f95fcdb10084d8c9e52f3696c3095e9dde7.zip |
ingest: add more IA domains
-rw-r--r-- | python/sandcrawler/ingest.py | 2 |
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 6d8b162..35b37fc 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -76,8 +76,10 @@ class IngestFileWorker(SandcrawlerWorker):
 
             # temporary, until we implement specific fetch and 'petabox' output
             "://archive.org/",
+            "://www.archive.org/",
             "://web.archive.org/web/",
             "://openlibrary.org/",
+            "://www.openlibrary.org/",
             "://fatcat.wiki/",
 
             # Domain squats