aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-03-29 19:15:53 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-03-29 19:15:53 -0700
commit 51193896a4958025f1bf045b0ba908ab5d4d0553 (patch)
tree 21e6a7397a171c21e8a17a665325215569264860
parent ee16a661cab4220d3ae4535b40c01adbf3bd5f86 (diff)
download sandcrawler-51193896a4958025f1bf045b0ba908ab5d4d0553.tar.gz
sandcrawler-51193896a4958025f1bf045b0ba908ab5d4d0553.zip
filesets: fix archive.org path naming
-rw-r--r-- python/sandcrawler/fileset_strategies.py 15
1 files changed, 8 insertions, 7 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index d2a8e99..1d84ce5 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -212,15 +212,16 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
# 2. upload all files, with metadata
assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
- item_files = []
+ item_files = {}
for m in item.manifest:
local_path = local_dir + "/" + m.path
- item_files.append(
- {
- "name": local_path,
- "remote_name": m.path,
- }
- )
+ if m.path == "name":
+ raise NotImplementedError(
+ "fileset file path is 'name', which is a reserved keyword"
+ )
+ item_files[m.path] = local_path
+ if len(item_files) != len(item.manifest):
+ raise NotImplementedError("file/manifest length mismatch: duplicated file paths?")
print(
f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",