about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/ingest_fileset.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest_fileset.py')
-rw-r--r-- python/sandcrawler/ingest_fileset.py 40
1 files changed, 31 insertions, 9 deletions
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 542dfbc..3acbece 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -146,11 +146,10 @@ class IngestFilesetWorker(IngestFileWorker):
result["status"] = "wayback-content-error"
result["error_message"] = str(e)[:1600]
return result
- except NotImplementedError:
- # result['status'] = 'not-implemented'
- # result['error_message'] = str(e)[:1600]
- # return result
- resource = None
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
html_biblio = None
if resource:
@@ -180,7 +179,7 @@ class IngestFilesetWorker(IngestFileWorker):
return result
if not resource.body:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -196,7 +195,7 @@ class IngestFilesetWorker(IngestFileWorker):
return result
if not resource.body or file_meta["size_bytes"] == 0:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
# here we split based on ingest type to try and extract a next hop
@@ -256,7 +255,7 @@ class IngestFilesetWorker(IngestFileWorker):
result["status"] = "wrong-mimetype"
return result
else:
- # raise NotImplementedError()
+ # eg, datasets, components, etc
pass
result["_html_biblio"] = html_biblio
@@ -378,7 +377,30 @@ class IngestFilesetWorker(IngestFileWorker):
return result
# 3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata.
- archive_result = strategy_helper.process(dataset_meta)
+ try:
+ archive_result = strategy_helper.process(dataset_meta)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
# 4. Summarize status and return structured result metadata.
result["status"] = archive_result.status