aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-06 19:12:05 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit: 93adbbb98eff0c148bada84f794783b733c58f73 (patch)
tree: 0d6e121d7b84238734c5e45e90897c061066a7dd /python/sandcrawler/fileset_strategies.py
parent: bff5da3971aa3ad458da048926da5c35252f1fb9 (diff)
download: sandcrawler-93adbbb98eff0c148bada84f794783b733c58f73.tar.gz
sandcrawler-93adbbb98eff0c148bada84f794783b733c58f73.zip
improvements to platform helpers
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- python/sandcrawler/fileset_strategies.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 5ee4cc9..6bda9b4 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -82,6 +82,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
if existing:
return existing
+ if item.platform_name == 'archiveorg':
+ raise ValueError("should't download archive.org into itself")
+
local_dir = self.working_dir + item.archiveorg_item_name
assert local_dir.startswith('/')
assert local_dir.count('/') > 2
@@ -193,7 +196,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
self.try_spn2 = True
self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
- # XXX: this is copypasta
+ # XXX: this is copypasta, and also should be part of SPN client, not here
self.spn2_simple_get_domains = [
# direct PDF links
"://arxiv.org/pdf/",