aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_strategies.py
blob: 26bc5ad5c9090352ad8098af78726dcf7e22ce5f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

import sys
import json
import gzip
import time
from collections import namedtuple
from typing import Optional, Tuple, Any, Dict, List

import internetarchive

from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult


class FilesetIngestStrategy:
    """
    Common interface for fileset ingest strategies.

    Both `check_existing()` and `process()` raise NotImplementedError here;
    concrete strategies are expected to override them.
    """

    def __init__(self):
        # The base class holds no state of its own. It deliberately does not
        # set an `ingest_strategy` attribute; subclasses assign their own.
        return None

    def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
        """
        Look for a previously archived copy of `item`.

        Not implemented in the base class.
        """
        raise NotImplementedError

    def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
        """
        Run this strategy against `item`.

        Not implemented in the base class.
        """
        raise NotImplementedError


class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
    """
    Strategy for filesets kept as files inside a single archive.org item.

    Only the "is it already there?" check is implemented so far: `process()`
    returns the existing result when every manifest file is already present
    in the item, and raises NotImplementedError otherwise.
    """

    def __init__(self):
        super().__init__()
        self.ingest_strategy = IngestStrategy.ArchiveorgFileset
        # One session is created up front and reused for every item lookup.
        self.session = internetarchive.get_session()

    def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
        """
        Use the archive.org API to check whether the target item already
        contains every file listed in the manifest.

        A manifest entry counts as present when some file in the item has the
        same name (compared against the manifest `path`), the same SHA-1, and
        the same size.

        Returns an ArchiveStrategyResult with status 'success-existing' when
        all manifest entries are present. Otherwise prints the first missing
        entry to stderr and returns None.

        XXX: should this verify sha256 and/or mimetype?
        """
        ia_item = self.session.get_item(item.archiveorg_item_name)
        # Index the existing files once, so each manifest entry is a single
        # set-membership test instead of a scan over every file in the item.
        # NOTE(review): assumes name/sha1/size are hashable scalars (str/int
        # or None) -- confirm against the internetarchive File object.
        existing_keys = {
            (existing.name, existing.sha1, existing.size)
            for existing in ia_item.get_files(on_the_fly=False)
        }
        for wanted in item.manifest:
            if (wanted.path, wanted.sha1, wanted.size) not in existing_keys:
                print(f"  didn't find at least one file: {wanted}", file=sys.stderr)
                return None
        return ArchiveStrategyResult(
            ingest_strategy=self.ingest_strategy,
            status='success-existing',
            manifest=item.manifest,
        )

    def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
        """
        Return the existing-archive result for `item` if there is one.

        Raises NotImplementedError when the files are not already archived,
        because uploading is not implemented yet.
        """
        existing = self.check_existing(item)
        if existing:
            return existing
        raise NotImplementedError()

# Lookup table from IngestStrategy value to a strategy helper instance.
# NOTE: the helper is instantiated right here, so importing this module runs
# ArchiveorgFilesetStrategy.__init__, which calls internetarchive.get_session().
FILESET_STRATEGY_HELPER_TABLE = {
    IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
}