about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-12-11 20:22:10 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-12-11 20:22:10 -0800
commit: c48eb50782854f95fa16fc0fdf5339c52c830314 (patch)
tree: e047ab7bef6e6b2189694adb3ea72ef5e9757935
parent: e5c021bfeb03c50924160616dc64d44617d45933 (diff)
download: sandcrawler-c48eb50782854f95fa16fc0fdf5339c52c830314.tar.gz
sandcrawler-c48eb50782854f95fa16fc0fdf5339c52c830314.zip
fileset ingest: actually use spn2 CLI flag
-rw-r--r-- python/sandcrawler/fileset_strategies.py 2
-rw-r--r-- python/sandcrawler/ingest_fileset.py 5
2 files changed, 4 insertions, 3 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index cc88da2..6e4ad10 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -222,7 +222,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
super().__init__()
self.ingest_strategy = IngestStrategy.WebFileset
self.wayback_client = WaybackClient()
- self.try_spn2 = True
+ self.try_spn2 = kwargs.get("try_spn2", True)
self.spn_client = SavePageNowClient(
spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
)
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 227f511..732a6ab 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -57,6 +57,7 @@ class IngestFilesetWorker(IngestFileWorker):
def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
super().__init__(sink=None, **kwargs)
+ self.try_spn2 = kwargs.get("try_spn2", True)
self.sink = sink
self.dataset_platform_helpers = {
"dataverse": DataverseHelper(),
@@ -67,8 +68,8 @@ class IngestFilesetWorker(IngestFileWorker):
self.dataset_strategy_archivers = {
IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
- IngestStrategy.WebFileset: WebFilesetStrategy(),
- IngestStrategy.WebFile: WebFileStrategy(),
+ IngestStrategy.WebFileset: WebFilesetStrategy(try_spn2=self.try_spn2),
+ IngestStrategy.WebFile: WebFileStrategy(try_spn2=self.try_spn2),
}
self.max_total_size = kwargs.get("max_total_size", 64 * 1024 * 1024 * 1024)