diff options
Diffstat (limited to 'python/sandcrawler/ingest_fileset.py')
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 40 |
1 files changed, 31 insertions, 9 deletions
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 542dfbc..3acbece 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -146,11 +146,10 @@ class IngestFilesetWorker(IngestFileWorker): result["status"] = "wayback-content-error" result["error_message"] = str(e)[:1600] return result - except NotImplementedError: - # result['status'] = 'not-implemented' - # result['error_message'] = str(e)[:1600] - # return result - resource = None + except NotImplementedError as e: + result["status"] = "not-implemented" + result["error_message"] = str(e)[:1600] + return result html_biblio = None if resource: @@ -180,7 +179,7 @@ class IngestFilesetWorker(IngestFileWorker): return result if not resource.body: - result["status"] = "null-body" + result["status"] = "empty-blob" return result if len(resource.body) > MAX_BODY_SIZE_BYTES: @@ -196,7 +195,7 @@ class IngestFilesetWorker(IngestFileWorker): return result if not resource.body or file_meta["size_bytes"] == 0: - result["status"] = "null-body" + result["status"] = "empty-blob" return result # here we split based on ingest type to try and extract a next hop @@ -256,7 +255,7 @@ class IngestFilesetWorker(IngestFileWorker): result["status"] = "wrong-mimetype" return result else: - # raise NotImplementedError() + # eg, datasets, components, etc pass result["_html_biblio"] = html_biblio @@ -378,7 +377,30 @@ class IngestFilesetWorker(IngestFileWorker): return result # 3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata. - archive_result = strategy_helper.process(dataset_meta) + try: + archive_result = strategy_helper.process(dataset_meta) + except SavePageNowError as e: + result["status"] = "spn2-error" + result["error_message"] = str(e)[:1600] + return result + except PetaboxError as e: + result["status"] = "petabox-error" + result["error_message"] = str(e)[:1600] + return result + except CdxApiError as e: + result["status"] = "cdx-error" + result["error_message"] = str(e)[:1600] + # add a sleep in cdx-error path as a slow-down + time.sleep(2.0) + return result + except WaybackError as e: + result["status"] = "wayback-error" + result["error_message"] = str(e)[:1600] + return result + except WaybackContentError as e: + result["status"] = "wayback-content-error" + result["error_message"] = str(e)[:1600] + return result # 4. Summarize status and return structured result metadata. result["status"] = archive_result.status |