 python/sandcrawler/fileset_platforms.py  | 82
 python/sandcrawler/fileset_strategies.py |  7
 python/sandcrawler/fileset_types.py      | 18
 python/sandcrawler/ingest_fileset.py     | 47
 4 files changed, 106 insertions(+), 48 deletions(-)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 73b1676..15976f5 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -40,7 +40,7 @@ class DatasetPlatformHelper():
# XXX: while developing ArchiveorgFileset path
#return IngestStrategy.ArchiveorgFileset
if len(item.manifest) == 1:
- if total_size < 64*1024*1024:
+ if total_size < 64*1024*1024:
return IngestStrategy.WebFile
else:
return IngestStrategy.ArchiveorgFile
@@ -101,38 +101,33 @@ class DataverseHelper(DatasetPlatformHelper):
dataset_version = params.get('version')
platform_id = params.get('persistentId')
if not (platform_id and platform_id[0]):
- raise ValueError("Expected a Dataverse persistentId in URL")
+ raise PlatformScopeError("Expected a Dataverse persistentId in URL")
else:
platform_id = platform_id[0]
+        if isinstance(dataset_version, list):
+            dataset_version = dataset_version[0]
- if not platform_domain in self.dataverse_domain_allowlist:
- raise ValueError(f"unexpected dataverse domain: {platform_domain}")
-
- # for both handle (hdl:) and DOI (doi:) identifiers, the norm is for
- # dataverse persistetId is to be structured like:
- # <prefix> / <shoulder> / <dataset-id> / <file-id>
- if not (platform_id.startswith('doi:10.') or platform_id.startswith('hdl:')):
- raise NotImplementedError(f"unsupported dataverse persistentId format: {platform_id}")
- dataverse_type = None
- if platform_id.count('/') == 2:
- dataverse_type = 'dataset'
- elif platform_id.count('/') == 3:
- dataverse_type = 'file'
- else:
- raise NotImplementedError(f"unsupported dataverse persistentId format: {platform_id}")
+ try:
+ parsed_id = self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+            raise PlatformScopeError(f"not a recognized dataverse persistentId: {platform_id}")
- if dataverse_type != 'dataset':
- # XXX
- raise NotImplementedError(f"only entire dataverse datasets can be archived with this tool")
+ if parsed_id['file_id']:
+ # XXX: maybe we could support this?
+            raise PlatformScopeError("only entire dataverse datasets can be archived with this tool")
# 1b. if we didn't get a version number from URL, fetch it from API
if not dataset_version:
- obj = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}").json()
+ resp = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}")
+ resp.raise_for_status()
+ obj = resp.json()
obj_latest = obj['data']['latestVersion']
dataset_version = f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
# 2. API fetch
- obj = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}").json()
+ resp = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}")
+ resp.raise_for_status()
+ obj = resp.json()
        obj_latest = obj['data']['latestVersion']
assert dataset_version == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
@@ -180,7 +175,9 @@ class DataverseHelper(DatasetPlatformHelper):
elif block['typeName'] == 'dsDescription':
archiveorg_item_meta['description'] = block['value'][0]['dsDescriptionValue']['value']
- archiveorg_item_meta['description'] = archiveorg_item_meta.get('description', '') + '\n<br>\n' + obj_latest['termsOfUse']
+ archiveorg_item_meta['description'] = archiveorg_item_meta.get('description', '')
+ if obj_latest.get('termsOfUse'):
+ archiveorg_item_meta['description'] += '\n<br>\n' + obj_latest['termsOfUse']
return DatasetPlatformItem(
platform_name=self.platform_name,
@@ -228,25 +225,26 @@ class FigshareHelper(DatasetPlatformHelper):
# 1. extract domain, PID, and version from URL
components = urllib.parse.urlparse(url)
platform_domain = components.netloc.split(':')[0].lower()
- if len(components.path.split('/')) < 6:
- raise ValueError("Expected a complete, versioned figshare URL")
- platform_id = components.path.split('/')[4]
- dataset_version = components.path.split('/')[5]
+ (platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
assert platform_id.isdigit(), f"expected numeric: {platform_id}"
assert dataset_version.isdigit(), f"expected numeric: {dataset_version}"
- if not 'figshare' in platform_domain:
- raise ValueError(f"unexpected figshare domain: {platform_domain}")
-
# 1b. if we didn't get a version number from URL, fetch it from API
- # XXX
+ # TODO: implement this code path
# 2. API fetch
- obj = self.session.get(f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}").json()
+ resp = self.session.get(f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}")
+ resp.raise_for_status()
+ obj = resp.json()
figshare_type = obj['defined_type_name']
+ if not obj['is_public']:
+ raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
+ if obj['is_embargoed']:
+ raise PlatformRestrictedError(f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})')
+
manifest = []
for row in obj['files']:
manifest.append(FilesetManifestFile(
@@ -318,29 +316,39 @@ class ZenodoHelper(DatasetPlatformHelper):
else:
url = request['base_url']
+        # XXX: also look in base_url and the non-terminal resource URL for the
+        # ident, to check for work-level redirects
+
# 1. extract identifier from URL
# eg: https://zenodo.org/record/5230255
components = urllib.parse.urlparse(url)
platform_domain = components.netloc.split(':')[0].lower()
if len(components.path.split('/')) < 2:
- raise ValueError("Expected a complete, versioned figshare URL")
+            raise PlatformScopeError("Expected a complete, versioned zenodo URL")
platform_id = components.path.split('/')[2]
assert platform_id.isdigit(), f"expected numeric: {platform_id}"
if not 'zenodo.org' in platform_domain:
- raise ValueError(f"unexpected zenodo.org domain: {platform_domain}")
+ raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
# 2. API fetch
- obj = self.session.get(f"https://zenodo.org/api/records/{platform_id}").json()
+ resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}")
+ if resp.status_code == 410:
+ raise PlatformRestrictedError('record deleted')
+ resp.raise_for_status()
+ obj = resp.json()
assert obj['id'] == int(platform_id)
- work_id = obj['conceptdoi']
+ work_id = obj['conceptrecid']
if work_id == obj['id']:
- raise ValueError("got a work-level zenodo record, not a versioned record: {work_id}")
+            raise PlatformScopeError(f"got a work-level zenodo record, not a versioned record: {work_id}")
zenodo_type = obj['metadata']['resource_type']['type']
+ if obj['metadata']['access_right'] != 'open':
+            raise PlatformRestrictedError(f"not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}")
+
manifest = []
for row in obj['files']:
mf = FilesetManifestFile(
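
Note: the rewritten Dataverse hunk delegates to self.parse_dataverse_persistentid(), which this commit adds elsewhere in fileset_platforms.py and is not shown in these hunks. A minimal sketch of what such a parser could look like, assuming the <prefix>/<shoulder>/<dataset-id>[/<file-id>] structure described in the removed comments (the helper actually added in the commit may differ):

    def parse_dataverse_persistentid(persistentid: str) -> dict:
        """
        Parse a dataverse persistentId into components; raises ValueError
        if the format is not recognized.

        For both handle (hdl:) and DOI (doi:) identifiers, the norm is:

            <prefix>/<shoulder>/<dataset-id>[/<file-id>]
        """
        # both 'doi:' and 'hdl:' prefixes are exactly four characters
        if persistentid.startswith('doi:10.') or persistentid.startswith('hdl:'):
            components = persistentid[4:].split('/')
        else:
            raise ValueError(f"unsupported persistentId scheme: {persistentid}")
        if len(components) == 3:
            file_id = None
        elif len(components) == 4:
            file_id = components[3]
        else:
            raise ValueError(f"unsupported persistentId structure: {persistentid}")
        return dict(
            prefix=components[0],
            shoulder=components[1],
            dataset_id=components[2],
            file_id=file_id,
        )

With that shape, the try/except in the hunk above maps any malformed identifier to PlatformScopeError, and the file_id check rejects file-level identifiers before any API calls are made.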
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 6bda9b4..43f1a53 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -12,7 +12,7 @@ import internetarchive
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult
+from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult, PlatformScopeError
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path
@@ -83,7 +83,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
return existing
if item.platform_name == 'archiveorg':
- raise ValueError("should't download archive.org into itself")
+            raise PlatformScopeError("shouldn't download archive.org into itself")
local_dir = self.working_dir + item.archiveorg_item_name
assert local_dir.startswith('/')
@@ -142,9 +142,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
m.mimetype = file_meta['mimetype']
m.status = 'verified-local'
- # 2. setup archive.org item metadata
+ # 2. upload all files, with metadata
assert item.archiveorg_item_meta['collection']
- # 3. upload all files
item_files = []
for m in item.manifest:
local_path = local_dir + '/' + m.path
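
The renumbered comment ("2. upload all files, with metadata") collapses what were previously separate metadata-setup and upload steps. The upload itself goes through the internetarchive library further down in this method; a rough sketch of that pattern, assuming item_files maps paths inside the archive.org item to the verified local copies (illustrative, not the exact code, and the real call may pass extra options such as checksumming):

    import internetarchive

    # remote path within the item -> verified local copy on disk;
    # `item` and `local_dir` are assumed to be in scope as in the method above
    item_files = {m.path: local_dir + '/' + m.path for m in item.manifest}
    responses = internetarchive.upload(
        item.archiveorg_item_name,
        files=item_files,
        metadata=item.archiveorg_item_meta,  # must include 'collection'
    )
    for resp in responses:
        resp.raise_for_status()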
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index 51000d7..9fe8b0d 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -42,3 +42,21 @@ class ArchiveStrategyResult(BaseModel):
ingest_strategy: str
status: str
manifest: List[FilesetManifestFile]
+
+class PlatformScopeError(Exception):
+ """
+    For incidents where a platform helper discovers that the fileset/dataset
+    is out-of-scope after it has already started processing.
+
+ For example, attempting to ingest:
+
+ - a 'latest version' record, when the platform has version-specific records
+ - a single file within a dataset for a platform which has file-level identifiers
+ """
+ pass
+
+class PlatformRestrictedError(Exception):
+ """
+ When datasets are not publicly available on a platform (yet)
+ """
+ pass
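
A small illustration of the intended split between the two new exception types, using a condensed version of the checks this commit adds to ZenodoHelper (the check_zenodo_record() helper here is hypothetical, for illustration only):

    import pytest
    from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError

    def check_zenodo_record(obj: dict) -> None:
        # hypothetical condensation of the checks added in fileset_platforms.py
        if obj['conceptrecid'] == obj['id']:
            raise PlatformScopeError("work-level record, not a versioned record")
        if obj['metadata']['access_right'] != 'open':
            raise PlatformRestrictedError("record not publicly available")

    def test_check_zenodo_record():
        ok = {'id': 2, 'conceptrecid': 1, 'metadata': {'access_right': 'open'}}
        check_zenodo_record(ok)  # versioned and public: no exception
        with pytest.raises(PlatformScopeError):
            check_zenodo_record({**ok, 'id': 1})
        with pytest.raises(PlatformRestrictedError):
            check_zenodo_record({**ok, 'metadata': {'access_right': 'embargoed'}})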
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index f69fff4..ce6cdca 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -22,6 +22,7 @@ from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.ingest_file import IngestFileWorker
from sandcrawler.fileset_platforms import DatasetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE
from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE
+from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError
MAX_BODY_SIZE_BYTES = 128*1024*1024
@@ -46,6 +47,8 @@ class IngestFilesetWorker(IngestFileWorker):
self.sink = sink
self.dataset_platform_helpers = DATASET_PLATFORM_HELPER_TABLE
self.dataset_strategy_archivers = FILESET_STRATEGY_HELPER_TABLE
+ self.max_total_size = 100*1024*1024*1024
+ self.max_file_count = 500
def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
@@ -263,7 +266,29 @@ class IngestFilesetWorker(IngestFileWorker):
terminal_url = base_url
if resource:
terminal_url = resource.terminal_url
- dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+
+ try:
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ except PlatformScopeError as e:
+ result['status'] = 'platform-scope'
+ result['error_message'] = str(e)[:1600]
+ return result
+ except PlatformRestrictedError as e:
+ result['status'] = 'platform-restricted'
+ result['error_message'] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result['status'] = 'not-implemented'
+ result['error_message'] = str(e)[:1600]
+ return result
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 404:
+ result['status'] = 'platform-404'
+ result['error_message'] = str(e)[:1600]
+ return result
+ else:
+                raise
+
#print(dataset_meta, file=sys.stderr)
platform = dataset_meta.platform_name
result['platform'] = dataset_meta.platform_name
@@ -271,12 +296,20 @@ class IngestFilesetWorker(IngestFileWorker):
result['item_name'] = dataset_meta.archiveorg_item_name
result['item_meta'] = dataset_meta.archiveorg_item_meta
- if dataset_meta.manifest:
- result['manifest'] = [m.dict() for m in dataset_meta.manifest]
- result['file_count'] = len(dataset_meta.manifest)
- result['total_size'] = sum([m.size for m in dataset_meta.manifest if m.size])
- else:
- result['status'] = 'no-manifest'
+ if not dataset_meta.manifest:
+ result['status'] = 'empty-manifest'
+ return result
+
+ # these will get confirmed/updated after ingest
+ result['manifest'] = [m.dict() for m in dataset_meta.manifest]
+ result['file_count'] = len(dataset_meta.manifest)
+ result['total_size'] = sum([m.size for m in dataset_meta.manifest if m.size])
+
+ if result['total_size'] > self.max_total_size:
+ result['status'] = 'too-large-size'
+ return result
+ if result['file_count'] > self.max_file_count:
+ result['status'] = 'too-many-files'
return result
ingest_strategy = platform_helper.chose_strategy(dataset_meta)
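
With these changes, process() can return several terminal statuses before any ingest strategy runs. A hedged sketch of the set a downstream consumer might now branch on (status strings taken from this diff; the retry policy shown is illustrative, not part of the commit):

    # statuses introduced or formalized by this commit
    TERMINAL_PLATFORM_STATUSES = {
        'platform-scope',       # PlatformScopeError: out of scope for the helper
        'platform-restricted',  # PlatformRestrictedError: embargoed or not public
        'platform-404',         # platform API returned HTTP 404
        'not-implemented',      # helper code path not written yet
        'empty-manifest',       # platform returned no files
        'too-large-size',       # total_size over the 100 GiB default cap
        'too-many-files',       # file_count over the 500-file default cap
    }

    def should_retry_later(result: dict) -> bool:
        # restricted or embargoed records may become public later; scope,
        # size, and file-count failures are permanent for this tool
        return result['status'] == 'platform-restricted'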