about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-04 16:13:55 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:25 -0700
commit: b3447503c0aa2e326ce1e46c993be28f907ec23b (patch)
tree: 4bd3de9016ecc95e38de5c75e6fd69b5ce26f74c /python/sandcrawler/fileset_strategies.py
parent: 147319ae00a6b788104209083f65cbaa4329c862 (diff)
download: sandcrawler-b3447503c0aa2e326ce1e46c993be28f907ec23b.tar.gz
sandcrawler-b3447503c0aa2e326ce1e46c993be28f907ec23b.zip
progress on dataset ingest
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- python/sandcrawler/fileset_strategies.py 57
1 files changed, 51 insertions, 6 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 592b475..26bc5ad 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -6,17 +6,62 @@ import time
from collections import namedtuple
from typing import Optional, Tuple, Any, Dict, List
+import internetarchive
+
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem
+from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult
+
+
+class FilesetIngestStrategy():
+
+ def __init__(self):
+ #self.ingest_strategy = 'unknown'
+ pass
+
+ def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ raise NotImplementedError()
+
+ def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
+ raise NotImplementedError()
+
+
+class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self):
+ self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+ self.session = internetarchive.get_session()
-class FilesetIngestStrategy(class):
+ def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ """
+ use API to check for item with all the files in the manifest
+ TODO: this naive comparison is quadratic in number of files, aka O(N^2)
- def __init__():
- self.ingest_strategy = 'unknown'
+ XXX: should this verify sha256 and/or mimetype?
+ """
+ ia_item = self.session.get_item(item.archiveorg_item_name)
+ item_files = ia_item.get_files(on_the_fly=False)
+ for wanted in item.manifest:
+ found = False
+ for existing in item_files:
+ if existing.sha1 == wanted.sha1 and existing.name == wanted.path and existing.size == wanted.size:
+ found = True
+ break
+ if not found:
+ print(f" didn't find at least one file: {wanted}", file=sys.stderr)
+ return None
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status='success-existing',
+ manifest=item.manifest,
+ )
- def check_existing(): # XXX: -> Any:
+ def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
+ existing = self.check_existing(item)
+ if existing:
+ return existing
raise NotImplementedError()
- def process(item: DatasetPlatformItem):
+FILESET_STRATEGY_HELPER_TABLE = {
+ IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+}