author | Bryan Newbold <bnewbold@archive.org> | 2021-10-11 11:01:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:29 -0700 |
commit | 2e285e469251125ee70bc4c3408dbbcad8701b2c (patch) | |
tree | 6902a89a29d547cad1e7362ffef2cfe565e52d44 | |
parent | 7a98b12907cff5f6d4a56898b49703289127df21 (diff) | |
download | sandcrawler-2e285e469251125ee70bc4c3408dbbcad8701b2c.tar.gz sandcrawler-2e285e469251125ee70bc4c3408dbbcad8701b2c.zip |
fileset ingest: improve error handling
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 82
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 7
-rw-r--r-- | python/sandcrawler/fileset_types.py | 18
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 47
4 files changed, 106 insertions, 48 deletions
```diff
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 73b1676..15976f5 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -40,7 +40,7 @@ class DatasetPlatformHelper():
         # XXX: while developing ArchiveorgFileset path
         #return IngestStrategy.ArchiveorgFileset
         if len(item.manifest) == 1:
-            if total_size < 64*1024*1024: 
+            if total_size < 64*1024*1024:
                 return IngestStrategy.WebFile
             else:
                 return IngestStrategy.ArchiveorgFile
@@ -101,38 +101,33 @@ class DataverseHelper(DatasetPlatformHelper):
         dataset_version = params.get('version')
         platform_id = params.get('persistentId')
         if not (platform_id and platform_id[0]):
-            raise ValueError("Expected a Dataverse persistentId in URL")
+            raise PlatformScopeError("Expected a Dataverse persistentId in URL")
         else:
             platform_id = platform_id[0]
+        if type(dataset_version) == list:
+            dataset_version = dataset_version[0]
 
-        if not platform_domain in self.dataverse_domain_allowlist:
-            raise ValueError(f"unexpected dataverse domain: {platform_domain}")
-
-        # for both handle (hdl:) and DOI (doi:) identifiers, the norm is for
-        # dataverse persistetId is to be structured like:
-        # <prefix> / <shoulder> / <dataset-id> / <file-id>
-        if not (platform_id.startswith('doi:10.') or platform_id.startswith('hdl:')):
-            raise NotImplementedError(f"unsupported dataverse persistentId format: {platform_id}")
-        dataverse_type = None
-        if platform_id.count('/') == 2:
-            dataverse_type = 'dataset'
-        elif platform_id.count('/') == 3:
-            dataverse_type = 'file'
-        else:
-            raise NotImplementedError(f"unsupported dataverse persistentId format: {platform_id}")
+        try:
+            parsed_id = self.parse_dataverse_persistentid(platform_id)
+        except ValueError:
+            raise PlatformScopeError(f"not actually in scope")
 
-        if dataverse_type != 'dataset':
-            # XXX
-            raise NotImplementedError(f"only entire dataverse datasets can be archived with this tool")
+        if parsed_id['file_id']:
+            # XXX: maybe we could support this?
+            raise PlatformScopeError(f"only entire dataverse datasets can be archived with this tool")
 
         # 1b. if we didn't get a version number from URL, fetch it from API
         if not dataset_version:
-            obj = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}").json()
+            resp = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}")
+            resp.raise_for_status()
+            obj = resp.json()
             obj_latest = obj['data']['latestVersion']
             dataset_version = f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
 
         # 2. API fetch
-        obj = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}").json()
+        resp = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}")
+        resp.raise_for_status()
+        obj = resp.json()
 
         obj_latest= obj['data']['latestVersion']
         assert dataset_version == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
@@ -180,7 +175,9 @@ class DataverseHelper(DatasetPlatformHelper):
             elif block['typeName'] == 'dsDescription':
                 archiveorg_item_meta['description'] = block['value'][0]['dsDescriptionValue']['value']
 
-        archiveorg_item_meta['description'] = archiveorg_item_meta.get('description', '') + '\n<br>\n' + obj_latest['termsOfUse']
+        archiveorg_item_meta['description'] = archiveorg_item_meta.get('description', '')
+        if obj_latest.get('termsOfUse'):
+            archiveorg_item_meta['description'] += '\n<br>\n' + obj_latest['termsOfUse']
 
         return DatasetPlatformItem(
             platform_name=self.platform_name,
@@ -228,25 +225,26 @@ class FigshareHelper(DatasetPlatformHelper):
         # 1. extract domain, PID, and version from URL
         components = urllib.parse.urlparse(url)
         platform_domain = components.netloc.split(':')[0].lower()
-        if len(components.path.split('/')) < 6:
-            raise ValueError("Expected a complete, versioned figshare URL")
-        platform_id = components.path.split('/')[4]
-        dataset_version = components.path.split('/')[5]
+        (platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
 
         assert platform_id.isdigit(), f"expected numeric: {platform_id}"
         assert dataset_version.isdigit(), f"expected numeric: {dataset_version}"
 
-        if not 'figshare' in platform_domain:
-            raise ValueError(f"unexpected figshare domain: {platform_domain}")
-
         # 1b. if we didn't get a version number from URL, fetch it from API
-        # XXX
+        # TODO: implement this code path
 
         # 2. API fetch
-        obj = self.session.get(f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}").json()
+        resp = self.session.get(f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}")
+        resp.raise_for_status()
+        obj = resp.json()
 
         figshare_type = obj['defined_type_name']
 
+        if not obj['is_public']:
+            raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
+        if obj['is_embargoed']:
+            raise PlatformRestrictedError(f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})')
+
         manifest = []
         for row in obj['files']:
             manifest.append(FilesetManifestFile(
@@ -318,29 +316,39 @@ class ZenodoHelper(DatasetPlatformHelper):
         else:
             url = request['base_url']
 
+        # XXX: also look in base_url and resource-non-terminal for ident? to
+        # check for work-level redirects
+
         # 1. extract identifier from URL
         # eg: https://zenodo.org/record/5230255
         components = urllib.parse.urlparse(url)
         platform_domain = components.netloc.split(':')[0].lower()
         if len(components.path.split('/')) < 2:
-            raise ValueError("Expected a complete, versioned figshare URL")
+            raise PlatformScopeError("Expected a complete, versioned figshare URL")
         platform_id = components.path.split('/')[2]
         assert platform_id.isdigit(), f"expected numeric: {platform_id}"
         if not 'zenodo.org' in platform_domain:
-            raise ValueError(f"unexpected zenodo.org domain: {platform_domain}")
+            raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
 
         # 2. API fetch
-        obj = self.session.get(f"https://zenodo.org/api/records/{platform_id}").json()
+        resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}")
+        if resp.status_code == 410:
+            raise PlatformRestrictedError('record deleted')
+        resp.raise_for_status()
+        obj = resp.json()
 
         assert obj['id'] == int(platform_id)
-        work_id = obj['conceptdoi']
+        work_id = obj['conceptrecid']
         if work_id == obj['id']:
-            raise ValueError("got a work-level zenodo record, not a versioned record: {work_id}")
+            raise PlatformScopeError("got a work-level zenodo record, not a versioned record: {work_id}")
 
         zenodo_type = obj['metadata']['resource_type']['type']
 
+        if obj['metadata']['access_right'] != 'open':
+            raise PlatformRestrictedError("not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}")
+
         manifest = []
         for row in obj['files']:
             mf = FilesetManifestFile(
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 6bda9b4..43f1a53 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -12,7 +12,7 @@ import internetarchive
 
 from sandcrawler.html_metadata import BiblioMetadata
 from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult
+from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult, PlatformScopeError
 from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path
 
@@ -83,7 +83,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
             return existing
 
         if item.platform_name == 'archiveorg':
-            raise ValueError("should't download archive.org into itself")
+            raise PlatformScopeError("should't download archive.org into itself")
 
         local_dir = self.working_dir + item.archiveorg_item_name
         assert local_dir.startswith('/')
@@ -142,9 +142,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
                 m.mimetype = file_meta['mimetype']
                 m.status = 'verified-local'
 
-        # 2. setup archive.org item metadata
+        # 2. upload all files, with metadata
         assert item.archiveorg_item_meta['collection']
-        # 3. upload all files
         item_files = []
         for m in item.manifest:
             local_path = local_dir + '/' + m.path
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index 51000d7..9fe8b0d 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -42,3 +42,21 @@ class ArchiveStrategyResult(BaseModel):
     ingest_strategy: str
     status: str
     manifest: List[FilesetManifestFile]
+
+class PlatformScopeError(Exception):
+    """
+    For incidents where platform helper discovers that the fileset/dataset is
+    out-of-cope after already starting to process it.
+
+    For example, attempting to ingest:
+
+    - a 'latest version' record, when the platform has version-specific records
+    - a single file within a dataset for a platform which has file-level identifiers
+    """
+    pass
+
+class PlatformRestrictedError(Exception):
+    """
+    When datasets are not publicly available on a platform (yet)
+    """
+    pass
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index f69fff4..ce6cdca 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -22,6 +22,7 @@ from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.ingest_file import IngestFileWorker
 from sandcrawler.fileset_platforms import DatasetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE
 from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE
+from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError
 
 MAX_BODY_SIZE_BYTES = 128*1024*1024
 
@@ -46,6 +47,8 @@ class IngestFilesetWorker(IngestFileWorker):
         self.sink = sink
         self.dataset_platform_helpers = DATASET_PLATFORM_HELPER_TABLE
         self.dataset_strategy_archivers = FILESET_STRATEGY_HELPER_TABLE
+        self.max_total_size = 100*1024*1024*1024
+        self.max_file_count = 500
 
     def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
 
@@ -263,7 +266,29 @@ class IngestFilesetWorker(IngestFileWorker):
         terminal_url = base_url
         if resource:
             terminal_url = resource.terminal_url
-        dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+
+        try:
+            dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+        except PlatformScopeError as e:
+            result['status'] = 'platform-scope'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except PlatformRestrictedError as e:
+            result['status'] = 'platform-restricted'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except NotImplementedError as e:
+            result['status'] = 'not-implemented'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 404:
+                result['status'] = 'platform-404'
+                result['error_message'] = str(e)[:1600]
+                return result
+            else:
+                raise e
+
         #print(dataset_meta, file=sys.stderr)
         platform = dataset_meta.platform_name
         result['platform'] = dataset_meta.platform_name
@@ -271,12 +296,20 @@ class IngestFilesetWorker(IngestFileWorker):
         result['item_name'] = dataset_meta.archiveorg_item_name
         result['item_meta'] = dataset_meta.archiveorg_item_meta
 
-        if dataset_meta.manifest:
-            result['manifest'] = [m.dict() for m in dataset_meta.manifest]
-            result['file_count'] = len(dataset_meta.manifest)
-            result['total_size'] = sum([m.size for m in dataset_meta.manifest if m.size])
-        else:
-            result['status'] = 'no-manifest'
+        if not dataset_meta.manifest:
+            result['status'] = 'empty-manifest'
+            return result
+
+        # these will get confirmed/updated after ingest
+        result['manifest'] = [m.dict() for m in dataset_meta.manifest]
+        result['file_count'] = len(dataset_meta.manifest)
+        result['total_size'] = sum([m.size for m in dataset_meta.manifest if m.size])
+
+        if result['total_size'] > self.max_total_size:
+            result['status'] = 'too-large-size'
+            return result
+        if result['file_count'] > self.max_file_count:
+            result['status'] = 'too-many-files'
             return result
 
         ingest_strategy = platform_helper.chose_strategy(dataset_meta)
```
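The Dataverse changes above replace inline persistentId validation with a call to `self.parse_dataverse_persistentid()`, a helper that is not part of this diff. Judging from how it is used here (it raises `ValueError` on malformed identifiers and returns a mapping with a `file_id` entry), a minimal sketch might look like the following; the return shape and key names are assumptions based on the validation logic this commit removes, not the actual sandcrawler implementation.

```python
from typing import Optional


def parse_dataverse_persistentid(pid: str) -> dict:
    """
    Illustrative sketch only. Parse a Dataverse persistentId of the form:

        <scheme>:<prefix>/<shoulder>/<dataset-id>[/<file-id>]

    Raises ValueError if the identifier is not a DOI/handle-style persistentId,
    mirroring the doi:/hdl: prefix and slash-count checks removed above.
    """
    if not (pid.startswith('doi:10.') or pid.startswith('hdl:')):
        raise ValueError(f"unsupported dataverse persistentId format: {pid}")

    file_id: Optional[str] = None
    if pid.count('/') == 2:
        dataset_id = pid
    elif pid.count('/') == 3:
        dataset_id, file_id = pid.rsplit('/', 1)
    else:
        raise ValueError(f"unsupported dataverse persistentId format: {pid}")

    return {
        'scheme': pid.split(':', 1)[0],  # 'doi' or 'hdl' (assumed key name)
        'dataset_id': dataset_id,
        'file_id': file_id,
    }
```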
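Similarly, the Figshare helper now delegates URL parsing to `self.parse_figshare_url_path()`, which is also defined outside this diff. A hedged sketch, assuming it mirrors the positional parsing this commit removes (path component 4 is the article id, component 5 the version, as in `/articles/dataset/<title>/<id>/<version>`):

```python
from typing import Tuple


def parse_figshare_url_path(path: str) -> Tuple[str, str]:
    """
    Illustrative sketch only. Extract (article_id, version) from a versioned
    figshare URL path, eg:

        /articles/dataset/Some_Title/12345/2  ->  ("12345", "2")

    Raises ValueError if the path does not look complete and versioned.
    """
    comp = path.split('/')
    if len(comp) < 6 or not (comp[4].isdigit() and comp[5].isdigit()):
        raise ValueError(f"not a complete, versioned figshare URL path: {path}")
    return (comp[4], comp[5])
```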
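Taken together, `PlatformScopeError` and `PlatformRestrictedError` form the contract between platform helpers and `IngestFilesetWorker`, which maps them to the `platform-scope` and `platform-restricted` result statuses. A sketch of how a hypothetical new platform helper might use them; the class, its checks, and `fetch_record()` are illustrative only, not part of this commit:

```python
from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError


class ExamplePlatformHelper:
    """Hypothetical helper showing only the error-handling contract."""

    def process_request(self, request: dict, resource, html_biblio):
        url = request['base_url']

        # out-of-scope cases: IngestFilesetWorker records status 'platform-scope'
        if '/files/' in url:
            raise PlatformScopeError("individual files are not archived by this tool")

        record = self.fetch_record(url)

        # access-restricted cases: worker records status 'platform-restricted'
        if record.get('access_right') != 'open':
            raise PlatformRestrictedError(f"not publicly available: {url}")

        # a real helper would build and return a DatasetPlatformItem here
        return record

    def fetch_record(self, url: str) -> dict:
        # stub standing in for a platform API fetch
        return {'access_right': 'open'}
```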