fileset: refactor out tables of helpers

Instantiating these helper objects in module-level tables meant that a whole bunch of objects (including their children) got initialized as soon as the modules were imported, which seems like the wrong thing to do. Defer the instantiation until the ingest fileset worker itself is initialized.
author: Bryan Newbold <bnewbold@archive.org> 2021-10-27 13:41:21 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-27 13:41:23 -0700
commit: 50270a9152c8e88e66187ce755920e35c31bd0b5 (patch)
tree: a412a64b8b0ac138155cdae805f3603c87a3c720
parent: 69cfb2c38f68fc009d6c7f5107fc36cd7168e69e (diff)
download: sandcrawler-50270a9152c8e88e66187ce755920e35c31bd0b5.tar.gz
sandcrawler-50270a9152c8e88e66187ce755920e35c31bd0b5.zip
3 files changed, 19 insertions, 21 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 6ab4781..86e3ff2 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -748,11 +748,3 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
             return IngestStrategy.ArchiveorgFileset
         else:
             raise NotImplementedError("empty dataset")
-
-
-DATASET_PLATFORM_HELPER_TABLE = {
-    'dataverse': DataverseHelper(),
-    'figshare': FigshareHelper(),
-    'zenodo': ZenodoHelper(),
-    'archiveorg': ArchiveOrgHelper(),
-}
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 4e44d97..9d3bae3 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -294,11 +294,3 @@ class WebFileStrategy(WebFilesetStrategy):
         super().__init__(**kwargs)
         self.ingest_strategy = IngestStrategy.WebFile
         self.success_status = "success-file"
-
-
-FILESET_STRATEGY_HELPER_TABLE = {
-    IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
-    IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
-    IngestStrategy.WebFileset: WebFilesetStrategy(),
-    IngestStrategy.WebFile: WebFileStrategy(),
-}
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 47a19b8..5728e24 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -6,9 +6,12 @@ from typing import Any, Dict, Optional
 import requests
 from selectolax.parser import HTMLParser
 
-from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.fileset_platforms import (ArchiveOrgHelper, DataverseHelper, FigshareHelper,
+                                           ZenodoHelper)
+from sandcrawler.fileset_strategies import (ArchiveorgFilesetStrategy, ArchiveorgFileStrategy,
+                                            WebFilesetStrategy, WebFileStrategy)
+from sandcrawler.fileset_types import (IngestStrategy, PlatformRestrictedError,
+                                       PlatformScopeError)
 from sandcrawler.html_metadata import html_extract_biblio
 from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError,
                             WaybackError, cdx_to_dict, fix_transfer_encoding)
@@ -36,8 +39,19 @@ class IngestFilesetWorker(IngestFileWorker):
         super().__init__(sink=None, **kwargs)
 
         self.sink = sink
-        self.dataset_platform_helpers = DATASET_PLATFORM_HELPER_TABLE
-        self.dataset_strategy_archivers = FILESET_STRATEGY_HELPER_TABLE
+        self.dataset_platform_helpers = {
+            'dataverse': DataverseHelper(),
+            'figshare': FigshareHelper(),
+            'zenodo': ZenodoHelper(),
+            'archiveorg': ArchiveOrgHelper(),
+        }
+        self.dataset_strategy_archivers = {
+            IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+            IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
+            IngestStrategy.WebFileset: WebFilesetStrategy(),
+            IngestStrategy.WebFile: WebFileStrategy(),
+        }
+
         self.max_total_size = kwargs.get('max_total_size', 64 * 1024 * 1024 * 1024)
         self.max_file_count = kwargs.get('max_file_count', 200)
         self.ingest_file_result_sink = kwargs.get('ingest_file_result_sink')
author	Bryan Newbold <bnewbold@archive.org>	2021-10-27 13:41:21 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-27 13:41:23 -0700
commit	50270a9152c8e88e66187ce755920e35c31bd0b5 (patch)
tree	a412a64b8b0ac138155cdae805f3603c87a3c720
parent	69cfb2c38f68fc009d6c7f5107fc36cd7168e69e (diff)
download	sandcrawler-50270a9152c8e88e66187ce755920e35c31bd0b5.tar.gz sandcrawler-50270a9152c8e88e66187ce755920e35c31bd0b5.zip