about summary refs log tree commit diff stats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- python/sandcrawler/fileset_strategies.py 55
1 files changed, 40 insertions, 15 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 9696f3c..1d84ce5 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -158,22 +158,29 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
print(f" verifying {m.path}", file=sys.stderr)
file_meta = gen_file_metadata_path(local_path, allow_empty=True)
- assert (
- file_meta["size_bytes"] == m.size
- ), f"expected: {m.size} found: {file_meta['size_bytes']}"
+ if file_meta["size_bytes"] != m.size:
+ print(f" expected: {m.size} found: {file_meta['size_bytes']}", file=sys.stderr)
+ m.status = "mismatch-size"
+ continue
if m.sha1:
- assert file_meta["sha1hex"] == m.sha1
+ if file_meta["sha1hex"] != m.sha1:
+ m.status = "mismatch-sha1"
+ continue
else:
m.sha1 = file_meta["sha1hex"]
if m.sha256:
- assert file_meta["sha256hex"] == m.sha256
+ if file_meta["sha256hex"] != m.sha256:
+ m.status = "mismatch-sha256"
+ continue
else:
m.sha256 = file_meta["sha256hex"]
if m.md5:
- assert file_meta["md5hex"] == m.md5
+ if file_meta["md5hex"] != m.md5:
+ m.status = "mismatch-md5"
+ continue
else:
m.md5 = file_meta["md5hex"]
@@ -194,17 +201,27 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
m.mimetype = file_meta["mimetype"]
m.status = "verified-local"
+ # if verification failed for any individual files, bail out
+ for m in item.manifest:
+ if m.status != "verified-local":
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status=m.status,
+ )
+
# 2. upload all files, with metadata
assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
- item_files = []
+ item_files = {}
for m in item.manifest:
local_path = local_dir + "/" + m.path
- item_files.append(
- {
- "name": local_path,
- "remote_name": m.path,
- }
- )
+ if m.path == "name":
+ raise NotImplementedError(
+ "fileset file path is 'name', which is a reserved keyword"
+ )
+ item_files[m.path] = local_path
+ if len(item_files) != len(item.manifest):
+ raise NotImplementedError("file/manifest length mismatch: duplicated file paths?")
print(
f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
@@ -317,8 +334,16 @@ class WebFilesetStrategy(FilesetIngestStrategy):
else:
assert resource.terminal_status_code == 200
+ if not resource.body:
+ m.status = "empty-blob"
+ continue
+
file_meta = gen_file_metadata(resource.body)
- file_meta, html_resource = fix_transfer_encoding(file_meta, resource)
+ try:
+ file_meta, _html_resource = fix_transfer_encoding(file_meta, resource)
+ except Exception:
+ m.status = "transfer-encoding-error"
+ continue
if self.ingest_strategy == "web-file":
file_file_meta = file_meta
@@ -332,7 +357,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
continue
m.md5 = m.md5 or file_meta["md5hex"]
- m.sha1 = m.sha1 or file_meta["md5hex"]
+ m.sha1 = m.sha1 or file_meta["sha1hex"]
m.sha256 = m.sha256 or file_meta["sha256hex"]
m.mimetype = m.mimetype or file_meta["mimetype"]