diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-12-11 20:22:10 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-12-11 20:22:10 -0800 |
commit | c48eb50782854f95fa16fc0fdf5339c52c830314 (patch) | |
tree | e047ab7bef6e6b2189694adb3ea72ef5e9757935 | |
parent | e5c021bfeb03c50924160616dc64d44617d45933 (diff) | |
download | sandcrawler-c48eb50782854f95fa16fc0fdf5339c52c830314.tar.gz sandcrawler-c48eb50782854f95fa16fc0fdf5339c52c830314.zip |
fileset ingest: actually use spn2 CLI flag
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 5 |
2 files changed, 4 insertions, 3 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index cc88da2..6e4ad10 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -222,7 +222,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
         super().__init__()
         self.ingest_strategy = IngestStrategy.WebFileset
         self.wayback_client = WaybackClient()
-        self.try_spn2 = True
+        self.try_spn2 = kwargs.get("try_spn2", True)
         self.spn_client = SavePageNowClient(
             spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
         )
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 227f511..732a6ab 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -57,6 +57,7 @@ class IngestFilesetWorker(IngestFileWorker):
     def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
         super().__init__(sink=None, **kwargs)
 
+        self.try_spn2 = kwargs.get("try_spn2", True)
         self.sink = sink
         self.dataset_platform_helpers = {
             "dataverse": DataverseHelper(),
@@ -67,8 +68,8 @@ class IngestFilesetWorker(IngestFileWorker):
         self.dataset_strategy_archivers = {
             IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
             IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
-            IngestStrategy.WebFileset: WebFilesetStrategy(),
-            IngestStrategy.WebFile: WebFileStrategy(),
+            IngestStrategy.WebFileset: WebFilesetStrategy(try_spn2=self.try_spn2),
+            IngestStrategy.WebFile: WebFileStrategy(try_spn2=self.try_spn2),
         }
 
         self.max_total_size = kwargs.get("max_total_size", 64 * 1024 * 1024 * 1024)