aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-15 13:17:23 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-15 18:15:29 -0700
commit6cccac03451f46cb59897871e6631debca558771 (patch)
treee846a634d5a69584aab1902455dab1cf59f7b28c /python/sandcrawler/fileset_strategies.py
parent84179e60f747070f7a2424e4deccaee2eb096605 (diff)
downloadsandcrawler-6cccac03451f46cb59897871e6631debca558771.tar.gz
sandcrawler-6cccac03451f46cb59897871e6631debca558771.zip
filesets: iteration of implementation and docs
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r--python/sandcrawler/fileset_strategies.py23
1 files changed, 13 insertions, 10 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 43f1a53..2577d2b 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -12,7 +12,7 @@ import internetarchive
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult, PlatformScopeError
+from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, FilesetPlatformItem, ArchiveStrategyResult, PlatformScopeError
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path
@@ -22,10 +22,10 @@ class FilesetIngestStrategy():
#self.ingest_strategy = 'unknown'
pass
- def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
raise NotImplementedError()
- def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
raise NotImplementedError()
@@ -44,7 +44,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
self.ia_session = internetarchive.get_session()
- def check_existing(self, item: DatasetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
"""
use API to check for item with all the files in the manifest
@@ -52,8 +52,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
"""
ia_item = self.ia_session.get_item(item.archiveorg_item_name)
if not ia_item.exists:
- return False
+ return None
item_files = ia_item.get_files(on_the_fly=False)
+ assert item.manifest
for wanted in item.manifest:
found = False
for existing in item_files:
@@ -74,7 +75,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
manifest=item.manifest,
)
- def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
"""
May require extra context to pass along to archive.org item creation.
"""
@@ -94,6 +95,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
pass
# 1. download all files locally
+ assert item.manifest
for m in item.manifest:
# XXX: enforce safe/sane filename
@@ -143,7 +145,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
m.status = 'verified-local'
# 2. upload all files, with metadata
- assert item.archiveorg_item_meta['collection']
+ assert item.archiveorg_item_meta and item.archiveorg_item_meta['collection']
item_files = []
for m in item.manifest:
local_path = local_dir + '/' + m.path
@@ -212,7 +214,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
"://s3-eu-west-1.amazonaws.com/",
]
- def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
"""
For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
@@ -220,6 +222,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
- full fetch_resource() method which can do SPN requests
"""
+ assert item.manifest
for m in item.manifest:
fetch_url = m.platform_url
if not fetch_url:
@@ -241,7 +244,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
print("[FETCH {:>6}] {} {}".format(
via,
(resource and resource.status),
- (resource and resource.terminal_url) or url),
+ (resource and resource.terminal_url) or fetch_url),
file=sys.stderr)
m.terminal_url = resource.terminal_url
@@ -268,7 +271,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
overall_status = "success"
for m in item.manifest:
if m.status != 'success':
- overall_status = m.status
+ overall_status = m.status or 'not-processed'
break
if not item.manifest:
overall_status = 'empty-manifest'