diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-06 19:12:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:29 -0700 |
commit | 93adbbb98eff0c148bada84f794783b733c58f73 (patch) | |
tree | 0d6e121d7b84238734c5e45e90897c061066a7dd /python/sandcrawler/fileset_strategies.py | |
parent | bff5da3971aa3ad458da048926da5c35252f1fb9 (diff) | |
download | sandcrawler-93adbbb98eff0c148bada84f794783b733c58f73.tar.gz sandcrawler-93adbbb98eff0c148bada84f794783b733c58f73.zip |
improvements to platform helpers
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 5 |
1 files changed, 4 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 5ee4cc9..6bda9b4 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -82,6 +82,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         if existing:
             return existing
 
+        if item.platform_name == 'archiveorg':
+            raise ValueError("should't download archive.org into itself")
+
         local_dir = self.working_dir + item.archiveorg_item_name
         assert local_dir.startswith('/')
         assert local_dir.count('/') > 2
@@ -193,7 +196,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
         self.try_spn2 = True
         self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
 
-        # XXX: this is copypasta
+        # XXX: this is copypasta, and also should be part of SPN client, not here
         self.spn2_simple_get_domains = [
             # direct PDF links
             "://arxiv.org/pdf/",
```