diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-03-29 19:15:53 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-03-29 19:15:53 -0700 |
commit | 51193896a4958025f1bf045b0ba908ab5d4d0553 (patch) | |
tree | 21e6a7397a171c21e8a17a665325215569264860 | |
parent | ee16a661cab4220d3ae4535b40c01adbf3bd5f86 (diff) | |
download | sandcrawler-51193896a4958025f1bf045b0ba908ab5d4d0553.tar.gz sandcrawler-51193896a4958025f1bf045b0ba908ab5d4d0553.zip |
filesets: fix archive.org path naming
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 15 |
1 file changed, 8 insertions, 7 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index d2a8e99..1d84ce5 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -212,15 +212,16 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         # 2. upload all files, with metadata
         assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
-        item_files = []
+        item_files = {}
         for m in item.manifest:
             local_path = local_dir + "/" + m.path
-            item_files.append(
-                {
-                    "name": local_path,
-                    "remote_name": m.path,
-                }
-            )
+            if m.path == "name":
+                raise NotImplementedError(
+                    "fileset file path is 'name', which is a reserved keyword"
+                )
+            item_files[m.path] = local_path
+        if len(item_files) != len(item.manifest):
+            raise NotImplementedError("file/manifest length mismatch: duplicated file paths?")
         print(
             f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",