From 93adbbb98eff0c148bada84f794783b733c58f73 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 6 Oct 2021 19:12:05 -0700
Subject: improvements to platform helpers

---
 python/sandcrawler/fileset_strategies.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'python/sandcrawler/fileset_strategies.py')

diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 5ee4cc9..6bda9b4 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -82,6 +82,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         if existing:
             return existing
 
+        if item.platform_name == 'archiveorg':
+            raise ValueError("should't download archive.org into itself")
+
         local_dir = self.working_dir + item.archiveorg_item_name
         assert local_dir.startswith('/')
         assert local_dir.count('/') > 2
@@ -193,7 +196,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
         self.try_spn2 = True
         self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
 
-        # XXX: this is copypasta
+        # XXX: this is copypasta, and also should be part of SPN client, not here
         self.spn2_simple_get_domains = [
             # direct PDF links
             "://arxiv.org/pdf/",
-- 
cgit v1.2.3