1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
import sys
import json
import gzip
import time
from collections import namedtuple
from typing import Optional, Tuple, Any, Dict, List
import internetarchive
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult
class FilesetIngestStrategy:
    """
    Base class for fileset ingest strategies.

    Defines the two hooks a strategy exposes, check_existing() and process().
    Neither is implemented here: both raise NotImplementedError and are meant
    to be overridden by concrete strategies.
    """

    def __init__(self):
        """Set up the strategy; the base class stores no state."""
        # NOTE: no `ingest_strategy` attribute is assigned on the base class.

    def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
        """
        Look for an existing result for `item`.

        Per the signature, implementations return an ArchiveStrategyResult or
        None. The base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
        """
        Run the strategy on `item` and return its ArchiveStrategyResult.

        The base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()
class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
    """
    Fileset strategy that stores a whole fileset in one archive.org item.

    Only the "already archived" check is implemented; process() raises
    NotImplementedError when the item is not already complete.
    """

    def __init__(self):
        super().__init__()
        self.ingest_strategy = IngestStrategy.ArchiveorgFileset
        # Session used for all archive.org API lookups made by this strategy.
        self.session = internetarchive.get_session()

    def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
        """
        Use the API to check for an item holding all the files in the manifest.

        A manifest file counts as present when some file in the archive.org
        item has the same name (the manifest `path`), sha1, and size.

        Returns an ArchiveStrategyResult with status 'success-existing' when
        every manifest file is present, or None as soon as one is missing
        (the missing entry is reported on stderr).

        XXX: should this verify sha256 and/or mimetype?
        """
        ia_item = self.session.get_item(item.archiveorg_item_name)

        # Index the item's file listing exactly once. This keeps the check
        # linear in the number of files, and stays correct if get_files()
        # hands back a one-shot iterator: re-scanning a partly consumed
        # iterator for each manifest entry would skip files already passed
        # over and report false "missing" results.
        existing_keys = {
            (existing.name, existing.sha1, existing.size)
            for existing in ia_item.get_files(on_the_fly=False)
        }

        for wanted in item.manifest:
            if (wanted.path, wanted.sha1, wanted.size) not in existing_keys:
                print(f" didn't find at least one file: {wanted}", file=sys.stderr)
                return None

        return ArchiveStrategyResult(
            ingest_strategy=self.ingest_strategy,
            status='success-existing',
            manifest=item.manifest,
        )

    def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
        """
        Return the existing result if `item` is already fully archived.

        Raises NotImplementedError otherwise: nothing beyond the existence
        check is implemented yet.
        """
        existing = self.check_existing(item)
        if existing is not None:
            return existing
        raise NotImplementedError()
# Lookup table from ingest strategy to the single, shared strategy instance
# that handles it. NOTE: each instance is constructed when this module is
# imported, so the strategy's __init__ runs at import time.
FILESET_STRATEGY_HELPER_TABLE: Dict[IngestStrategy, FilesetIngestStrategy] = {
    IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
}
|