diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-04 16:13:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:25 -0700 |
commit | b3447503c0aa2e326ce1e46c993be28f907ec23b (patch) | |
tree | 4bd3de9016ecc95e38de5c75e6fd69b5ce26f74c /python/sandcrawler/fileset_strategies.py | |
parent | 147319ae00a6b788104209083f65cbaa4329c862 (diff) | |
download | sandcrawler-b3447503c0aa2e326ce1e46c993be28f907ec23b.tar.gz sandcrawler-b3447503c0aa2e326ce1e46c993be28f907ec23b.zip |
progress on dataset ingest
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 57 |
1 file changed, 51 insertions, 6 deletions
```diff
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 592b475..26bc5ad 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -6,17 +6,62 @@ import time
 from collections import namedtuple
 from typing import Optional, Tuple, Any, Dict, List
 
+import internetarchive
+
 from sandcrawler.html_metadata import BiblioMetadata
 from sandcrawler.ia import ResourceResult
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem
+from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult
+
+
+class FilesetIngestStrategy():
+
+    def __init__(self):
+        #self.ingest_strategy = 'unknown'
+        pass
+
+    def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
+        raise NotImplementedError()
+
+    def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
+        raise NotImplementedError()
+
+
+class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
 
+    def __init__(self):
+        self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+        self.session = internetarchive.get_session()
 
-class FilesetIngestStrategy(class):
+    def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
+        """
+        use API to check for item with all the files in the manifest
+        TODO: this naive comparison is quadratic in number of files, aka O(N^2)
 
-    def __init__():
-        self.ingest_strategy = 'unknown'
+        XXX: should this verify sha256 and/or mimetype?
+        """
+        ia_item = self.session.get_item(item.archiveorg_item_name)
+        item_files = ia_item.get_files(on_the_fly=False)
+        for wanted in item.manifest:
+            found = False
+            for existing in item_files:
+                if existing.sha1 == wanted.sha1 and existing.name == wanted.path and existing.size == wanted.size:
+                    found = True
+                    break
+            if not found:
+                print(f"  didn't find at least one file: {wanted}", file=sys.stderr)
+                return None
+        return ArchiveStrategyResult(
+            ingest_strategy=self.ingest_strategy,
+            status='success-existing',
+            manifest=item.manifest,
+        )
 
-    def check_existing(): # XXX: -> Any:
+    def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
+        existing = self.check_existing(item)
+        if existing:
+            return existing
         raise NotImplementedError()
 
-    def process(item: DatasetPlatformItem):
+FILESET_STRATEGY_HELPER_TABLE = {
+    IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+}
```