aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-15 13:30:37 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit ba324ae5a6051c47d4cf7524c28caeda7abd6fc5 (patch)
tree 8dfd640ddec952b37b9ad0438ee54cb7594d6b85 /python/sandcrawler/fileset_strategies.py
parent 6cccac03451f46cb59897871e6631debca558771 (diff)
download sandcrawler-ba324ae5a6051c47d4cf7524c28caeda7abd6fc5.tar.gz
sandcrawler-ba324ae5a6051c47d4cf7524c28caeda7abd6fc5.zip
move SPNv2 'simple_get' logic to SPN client
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- python/sandcrawler/fileset_strategies.py 24
1 file changed, 1 insertion, 23 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 2577d2b..d1193ee 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -197,23 +197,6 @@ class WebFilesetStrategy(FilesetIngestStrategy):
self.try_spn2 = True
self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
- # XXX: this is copypasta, and also should be part of SPN client, not here
- self.spn2_simple_get_domains = [
- # direct PDF links
- "://arxiv.org/pdf/",
- "://europepmc.org/backend/ptpmcrender.fcgi",
- "://pdfs.semanticscholar.org/",
- "://res.mdpi.com/",
-
- # platform sites
- "://zenodo.org/",
- "://figshare.org/",
- "://springernature.figshare.com/",
-
- # popular simple cloud storage or direct links
- "://s3-eu-west-1.amazonaws.com/",
- ]
-
def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
"""
For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
@@ -234,12 +217,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')):
via = "spn2"
- force_simple_get = 0
- for domain in self.spn2_simple_get_domains:
- if domain in fetch_url:
- force_simple_get = 1
- break
- resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=force_simple_get)
+ resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client)
print("[FETCH {:>6}] {} {}".format(
via,