From f9a263a1c0155fc59386fc36f7f4ce25dfc7b23c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 Oct 2021 11:50:26 -0700 Subject: more small fileset ingest tweaks --- python/sandcrawler/fileset_strategies.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'python/sandcrawler/fileset_strategies.py') diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index f2f2fcc..d12fc15 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -32,6 +32,7 @@ class FilesetIngestStrategy(): class ArchiveorgFilesetStrategy(FilesetIngestStrategy): def __init__(self, **kwargs): + super().__init__() self.ingest_strategy = IngestStrategy.ArchiveorgFileset # TODO: enable cleanup when confident (eg, safe path parsing) @@ -195,10 +196,12 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy): class WebFilesetStrategy(FilesetIngestStrategy): def __init__(self, **kwargs): + super().__init__() self.ingest_strategy = IngestStrategy.WebFileset self.wayback_client = WaybackClient() self.try_spn2 = True self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) + self.max_spn_manifest = 20 def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult: """ @@ -219,10 +222,12 @@ class WebFilesetStrategy(FilesetIngestStrategy): via = "wayback" resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype) - if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')): + if len(item.manifest) > self.max_spn_manifest: + m.status = 'too-much-spn' + continue via = "spn2" - resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client) + resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=True) print("[FETCH {:>6}] {} {}".format( via, -- cgit v1.2.3