diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 13:41:21 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 13:41:23 -0700 | 
| commit | 50270a9152c8e88e66187ce755920e35c31bd0b5 (patch) | |
| tree | a412a64b8b0ac138155cdae805f3603c87a3c720 | |
| parent | 69cfb2c38f68fc009d6c7f5107fc36cd7168e69e (diff) | |
| download | sandcrawler-50270a9152c8e88e66187ce755920e35c31bd0b5.tar.gz sandcrawler-50270a9152c8e88e66187ce755920e35c31bd0b5.zip | |
fileset: refactor out tables of helpers
Having these helper objects instantiated in module-level tables meant that a
whole bunch of objects (including their children) got initialized as soon as
the modules were imported, which seems like the wrong thing to do. Defer this
until the actual ingest fileset worker is initialized.
| -rw-r--r-- | python/sandcrawler/fileset_platforms.py | 8 | ||||
| -rw-r--r-- | python/sandcrawler/fileset_strategies.py | 8 | ||||
| -rw-r--r-- | python/sandcrawler/ingest_fileset.py | 24 | 
3 files changed, 19 insertions, 21 deletions
```diff
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 6ab4781..86e3ff2 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -748,11 +748,3 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
             return IngestStrategy.ArchiveorgFileset
         else:
             raise NotImplementedError("empty dataset")
-
-
-DATASET_PLATFORM_HELPER_TABLE = {
-    'dataverse': DataverseHelper(),
-    'figshare': FigshareHelper(),
-    'zenodo': ZenodoHelper(),
-    'archiveorg': ArchiveOrgHelper(),
-}
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 4e44d97..9d3bae3 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -294,11 +294,3 @@ class WebFileStrategy(WebFilesetStrategy):
         super().__init__(**kwargs)
         self.ingest_strategy = IngestStrategy.WebFile
         self.success_status = "success-file"
-
-
-FILESET_STRATEGY_HELPER_TABLE = {
-    IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
-    IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
-    IngestStrategy.WebFileset: WebFilesetStrategy(),
-    IngestStrategy.WebFile: WebFileStrategy(),
-}
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 47a19b8..5728e24 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -6,9 +6,12 @@ from typing import Any, Dict, Optional
 import requests
 from selectolax.parser import HTMLParser
 
-from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.fileset_platforms import (ArchiveOrgHelper, DataverseHelper, FigshareHelper,
+                                           ZenodoHelper)
+from sandcrawler.fileset_strategies import (ArchiveorgFilesetStrategy, ArchiveorgFileStrategy,
+                                            WebFilesetStrategy, WebFileStrategy)
+from sandcrawler.fileset_types import (IngestStrategy, PlatformRestrictedError,
+                                       PlatformScopeError)
 from sandcrawler.html_metadata import html_extract_biblio
 from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError,
                             WaybackError, cdx_to_dict, fix_transfer_encoding)
@@ -36,8 +39,19 @@ class IngestFilesetWorker(IngestFileWorker):
         super().__init__(sink=None, **kwargs)
 
         self.sink = sink
-        self.dataset_platform_helpers = DATASET_PLATFORM_HELPER_TABLE
-        self.dataset_strategy_archivers = FILESET_STRATEGY_HELPER_TABLE
+        self.dataset_platform_helpers = {
+            'dataverse': DataverseHelper(),
+            'figshare': FigshareHelper(),
+            'zenodo': ZenodoHelper(),
+            'archiveorg': ArchiveOrgHelper(),
+        }
+        self.dataset_strategy_archivers = {
+            IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+            IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
+            IngestStrategy.WebFileset: WebFilesetStrategy(),
+            IngestStrategy.WebFile: WebFileStrategy(),
+        }
+
         self.max_total_size = kwargs.get('max_total_size', 64 * 1024 * 1024 * 1024)
         self.max_file_count = kwargs.get('max_file_count', 200)
         self.ingest_file_result_sink = kwargs.get('ingest_file_result_sink')
```
