about summary refs log tree commit diff stats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-26 11:50:26 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-26 11:50:26 -0700
commit f9a263a1c0155fc59386fc36f7f4ce25dfc7b23c (patch)
tree 2baa3cb10971bf3549df04266c2b4e8e3a879650 /python/sandcrawler/fileset_strategies.py
parent 12d041b781912dc376444198c920ade2d6cee7c8 (diff)
download sandcrawler-f9a263a1c0155fc59386fc36f7f4ce25dfc7b23c.tar.gz
sandcrawler-f9a263a1c0155fc59386fc36f7f4ce25dfc7b23c.zip
more small fileset ingest tweaks
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- python/sandcrawler/fileset_strategies.py 9
1 files changed, 7 insertions, 2 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index f2f2fcc..d12fc15 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -32,6 +32,7 @@ class FilesetIngestStrategy():
class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
def __init__(self, **kwargs):
+ super().__init__()
self.ingest_strategy = IngestStrategy.ArchiveorgFileset
# TODO: enable cleanup when confident (eg, safe path parsing)
@@ -195,10 +196,12 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
class WebFilesetStrategy(FilesetIngestStrategy):
def __init__(self, **kwargs):
+ super().__init__()
self.ingest_strategy = IngestStrategy.WebFileset
self.wayback_client = WaybackClient()
self.try_spn2 = True
self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
+ self.max_spn_manifest = 20
def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
"""
@@ -219,10 +222,12 @@ class WebFilesetStrategy(FilesetIngestStrategy):
via = "wayback"
resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
-
if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')):
+ if len(item.manifest) > self.max_spn_manifest:
+ m.status = 'too-much-spn'
+ continue
via = "spn2"
- resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client)
+ resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=True)
print("[FETCH {:>6}] {} {}".format(
via,