aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-03 15:44:47 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-03 15:44:47 -0800
commit fd2d3f95fcdb10084d8c9e52f3696c3095e9dde7 (patch)
tree c44b2a26d495dffa2456007184e74ed2b3c4a52b /python
parent 55815b2e62a3ce53d5e71d0c6fd676b6cbf5baca (diff)
download sandcrawler-fd2d3f95fcdb10084d8c9e52f3696c3095e9dde7.tar.gz
sandcrawler-fd2d3f95fcdb10084d8c9e52f3696c3095e9dde7.zip
ingest: add more IA domains
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 6d8b162..35b37fc 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -76,8 +76,10 @@ class IngestFileWorker(SandcrawlerWorker):
# temporary, until we implement specific fetch and 'petabox' output
"://archive.org/",
+ "://www.archive.org/",
"://web.archive.org/web/",
"://openlibrary.org/",
+ "://www.openlibrary.org/",
"://fatcat.wiki/",
# Domain squats