diff options
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 55 |
1 files changed, 40 insertions, 15 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 9696f3c..1d84ce5 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -158,22 +158,29 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): print(f" verifying {m.path}", file=sys.stderr) file_meta = gen_file_metadata_path(local_path, allow_empty=True) - assert ( - file_meta["size_bytes"] == m.size - ), f"expected: {m.size} found: {file_meta['size_bytes']}" + if file_meta["size_bytes"] != m.size: + print(f" expected: {m.size} found: {file_meta['size_bytes']}", file=sys.stderr) + m.status = "mismatch-size" + continue if m.sha1: - assert file_meta["sha1hex"] == m.sha1 + if file_meta["sha1hex"] != m.sha1: + m.status = "mismatch-sha1" + continue else: m.sha1 = file_meta["sha1hex"] if m.sha256: - assert file_meta["sha256hex"] == m.sha256 + if file_meta["sha256hex"] != m.sha256: + m.status = "mismatch-sha256" + continue else: m.sha256 = file_meta["sha256hex"] if m.md5: - assert file_meta["md5hex"] == m.md5 + if file_meta["md5hex"] != m.md5: + m.status = "mismatch-md5" + continue else: m.md5 = file_meta["md5hex"] @@ -194,17 +201,27 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): m.mimetype = file_meta["mimetype"] m.status = "verified-local" + # if verification failed for any individual files, bail out + for m in item.manifest: + if m.status != "verified-local": + return ArchiveStrategyResult( + ingest_strategy=self.ingest_strategy, + manifest=item.manifest, + status=m.status, + ) + # 2. upload all files, with metadata assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"] - item_files = [] + item_files = {} for m in item.manifest: local_path = local_dir + "/" + m.path - item_files.append( - { - "name": local_path, - "remote_name": m.path, - } - ) + if m.path == "name": + raise NotImplementedError( + "fileset file path is 'name', which is a reserved keyword" + ) + item_files[m.path] = local_path + if len(item_files) != len(item.manifest): + raise NotImplementedError("file/manifest length mismatch: duplicated file paths?") print( f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...", @@ -317,8 +334,16 @@ class WebFilesetStrategy(FilesetIngestStrategy): else: assert resource.terminal_status_code == 200 + if not resource.body: + m.status = "empty-blob" + continue + file_meta = gen_file_metadata(resource.body) - file_meta, html_resource = fix_transfer_encoding(file_meta, resource) + try: + file_meta, _html_resource = fix_transfer_encoding(file_meta, resource) + except Exception: + m.status = "transfer-encoding-error" + continue if self.ingest_strategy == "web-file": file_file_meta = file_meta @@ -332,7 +357,7 @@ class WebFilesetStrategy(FilesetIngestStrategy): continue m.md5 = m.md5 or file_meta["md5hex"] - m.sha1 = m.sha1 or file_meta["md5hex"] + m.sha1 = m.sha1 or file_meta["sha1hex"] m.sha256 = m.sha256 or file_meta["sha256hex"] m.mimetype = m.mimetype or file_meta["mimetype"] |