From 50270a9152c8e88e66187ce755920e35c31bd0b5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 27 Oct 2021 13:41:21 -0700 Subject: fileset: refactor out tables of helpers Having these helper objects instantiated in module-level tables resulted in a whole bunch of objects (including children) getting initialized as soon as the modules were imported, which seems like the wrong thing to do. Defer this until the actual ingest fileset worker is initialized. --- python/sandcrawler/fileset_platforms.py | 8 -------- python/sandcrawler/fileset_strategies.py | 8 -------- python/sandcrawler/ingest_fileset.py | 24 +++++++++++++++++++----- 3 files changed, 19 insertions(+), 21 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py index 6ab4781..86e3ff2 100644 --- a/python/sandcrawler/fileset_platforms.py +++ b/python/sandcrawler/fileset_platforms.py @@ -748,11 +748,3 @@ class ArchiveOrgHelper(FilesetPlatformHelper): return IngestStrategy.ArchiveorgFileset else: raise NotImplementedError("empty dataset") - - -DATASET_PLATFORM_HELPER_TABLE = { - 'dataverse': DataverseHelper(), - 'figshare': FigshareHelper(), - 'zenodo': ZenodoHelper(), - 'archiveorg': ArchiveOrgHelper(), -} diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 4e44d97..9d3bae3 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -294,11 +294,3 @@ class WebFileStrategy(WebFilesetStrategy): super().__init__(**kwargs) self.ingest_strategy = IngestStrategy.WebFile self.success_status = "success-file" - - -FILESET_STRATEGY_HELPER_TABLE = { - IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(), - IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(), - IngestStrategy.WebFileset: WebFilesetStrategy(), - IngestStrategy.WebFile: WebFileStrategy(), -} diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 47a19b8..5728e24 100644 --- 
a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -6,9 +6,12 @@ from typing import Any, Dict, Optional import requests from selectolax.parser import HTMLParser -from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE -from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE -from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError +from sandcrawler.fileset_platforms import (ArchiveOrgHelper, DataverseHelper, FigshareHelper, + ZenodoHelper) +from sandcrawler.fileset_strategies import (ArchiveorgFilesetStrategy, ArchiveorgFileStrategy, + WebFilesetStrategy, WebFileStrategy) +from sandcrawler.fileset_types import (IngestStrategy, PlatformRestrictedError, + PlatformScopeError) from sandcrawler.html_metadata import html_extract_biblio from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError, WaybackError, cdx_to_dict, fix_transfer_encoding) @@ -36,8 +39,19 @@ class IngestFilesetWorker(IngestFileWorker): super().__init__(sink=None, **kwargs) self.sink = sink - self.dataset_platform_helpers = DATASET_PLATFORM_HELPER_TABLE - self.dataset_strategy_archivers = FILESET_STRATEGY_HELPER_TABLE + self.dataset_platform_helpers = { + 'dataverse': DataverseHelper(), + 'figshare': FigshareHelper(), + 'zenodo': ZenodoHelper(), + 'archiveorg': ArchiveOrgHelper(), + } + self.dataset_strategy_archivers = { + IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(), + IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(), + IngestStrategy.WebFileset: WebFilesetStrategy(), + IngestStrategy.WebFile: WebFileStrategy(), + } + self.max_total_size = kwargs.get('max_total_size', 64 * 1024 * 1024 * 1024) self.max_file_count = kwargs.get('max_file_count', 200) self.ingest_file_result_sink = kwargs.get('ingest_file_result_sink') -- cgit v1.2.3