about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/fileset_platforms.py | 8
-rw-r--r-- python/sandcrawler/fileset_strategies.py | 8
-rw-r--r-- python/sandcrawler/ingest_fileset.py | 24
3 files changed, 19 insertions, 21 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 6ab4781..86e3ff2 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -748,11 +748,3 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
return IngestStrategy.ArchiveorgFileset
else:
raise NotImplementedError("empty dataset")
-
-
-DATASET_PLATFORM_HELPER_TABLE = {
- 'dataverse': DataverseHelper(),
- 'figshare': FigshareHelper(),
- 'zenodo': ZenodoHelper(),
- 'archiveorg': ArchiveOrgHelper(),
-}
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 4e44d97..9d3bae3 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -294,11 +294,3 @@ class WebFileStrategy(WebFilesetStrategy):
super().__init__(**kwargs)
self.ingest_strategy = IngestStrategy.WebFile
self.success_status = "success-file"
-
-
-FILESET_STRATEGY_HELPER_TABLE = {
- IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
- IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
- IngestStrategy.WebFileset: WebFilesetStrategy(),
- IngestStrategy.WebFile: WebFileStrategy(),
-}
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 47a19b8..5728e24 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -6,9 +6,12 @@ from typing import Any, Dict, Optional
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.fileset_platforms import (ArchiveOrgHelper, DataverseHelper, FigshareHelper,
+ ZenodoHelper)
+from sandcrawler.fileset_strategies import (ArchiveorgFilesetStrategy, ArchiveorgFileStrategy,
+ WebFilesetStrategy, WebFileStrategy)
+from sandcrawler.fileset_types import (IngestStrategy, PlatformRestrictedError,
+ PlatformScopeError)
from sandcrawler.html_metadata import html_extract_biblio
from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError,
WaybackError, cdx_to_dict, fix_transfer_encoding)
@@ -36,8 +39,19 @@ class IngestFilesetWorker(IngestFileWorker):
super().__init__(sink=None, **kwargs)
self.sink = sink
- self.dataset_platform_helpers = DATASET_PLATFORM_HELPER_TABLE
- self.dataset_strategy_archivers = FILESET_STRATEGY_HELPER_TABLE
+ self.dataset_platform_helpers = {
+ 'dataverse': DataverseHelper(),
+ 'figshare': FigshareHelper(),
+ 'zenodo': ZenodoHelper(),
+ 'archiveorg': ArchiveOrgHelper(),
+ }
+ self.dataset_strategy_archivers = {
+ IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+ IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
+ IngestStrategy.WebFileset: WebFilesetStrategy(),
+ IngestStrategy.WebFile: WebFileStrategy(),
+ }
+
self.max_total_size = kwargs.get('max_total_size', 64 * 1024 * 1024 * 1024)
self.max_file_count = kwargs.get('max_file_count', 200)
self.ingest_file_result_sink = kwargs.get('ingest_file_result_sink')