about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-07 15:57:42 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit 7a98b12907cff5f6d4a56898b49703289127df21 (patch)
tree f5861d624583c6595f72e5978bec4add764ef1e2
parent 9409b9436c12c6a7a9956a0a20139d8d6776715a (diff)
download sandcrawler-7a98b12907cff5f6d4a56898b49703289127df21.tar.gz
sandcrawler-7a98b12907cff5f6d4a56898b49703289127df21.zip
initial implementation of zenodo platform import
-rw-r--r-- python/sandcrawler/fileset_platforms.py 100
1 files changed, 100 insertions, 0 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 79f4033..73b1676 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -289,6 +289,105 @@ class FigshareHelper(DatasetPlatformHelper):
extra=dict(version=dataset_version),
)
class ZenodoHelper(DatasetPlatformHelper):
    """
    Platform helper for datasets hosted on zenodo.org.

    Recognizes record URLs (eg, https://zenodo.org/record/5230255) and turns
    the matching Zenodo API record into a file manifest plus archive.org item
    metadata.
    """

    def __init__(self):
        self.platform_name = 'zenodo'
        # one HTTP session, re-used for every API request made by this helper
        self.session = requests.Session()

    def match_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
        """
        Return True if this request points at a zenodo.org record page.

        The terminal URL of `resource` is preferred when available; otherwise
        the request's 'base_url' is used. `html_biblio` is not consulted.
        """

        if resource and resource.terminal_url:
            url = resource.terminal_url
        else:
            url = request['base_url']

        components = urllib.parse.urlparse(url)
        # strip any explicit port and normalize case before comparing
        platform_domain = components.netloc.split(':')[0].lower()
        return platform_domain == 'zenodo.org' and '/record/' in components.path

    def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
        """
        Fetch platform-specific metadata for this request (eg, via API calls)

        Extracts the numeric record identifier from the URL, fetches the
        record from the Zenodo API, and builds the per-file manifest and the
        archive.org item metadata from the response.

        Raises ValueError if the URL is not a complete zenodo.org record URL
        or if the API returns a work-level (un-versioned) record.
        Raises AssertionError if the identifier is not numeric or the API
        returns a different record than requested.
        Raises requests.HTTPError if the API responds with an error status.
        """

        if resource and resource.terminal_url:
            url = resource.terminal_url
        else:
            url = request['base_url']

        # 1. extract identifier from URL
        # eg: https://zenodo.org/record/5230255
        components = urllib.parse.urlparse(url)
        platform_domain = components.netloc.split(':')[0].lower()
        # "/record/5230255" splits into ['', 'record', '5230255'], so at
        # least three parts are needed before indexing [2] below
        path_parts = components.path.split('/')
        if len(path_parts) < 3:
            raise ValueError("Expected a complete zenodo record URL")

        platform_id = path_parts[2]
        assert platform_id.isdigit(), f"expected numeric: {platform_id}"

        if 'zenodo.org' not in platform_domain:
            raise ValueError(f"unexpected zenodo.org domain: {platform_domain}")

        # 2. API fetch
        resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}")
        # fail here on HTTP errors, instead of with a KeyError further down
        # when the error body lacks the expected record fields
        resp.raise_for_status()
        obj = resp.json()

        assert obj['id'] == int(platform_id)
        work_id = obj['conceptdoi']
        # A concept DOI can never equal the numeric record id, so compare it
        # against the record's own DOI.
        # NOTE(review): assumes a work-level record reports its concept DOI
        # as its 'doi' -- confirm against the Zenodo API
        if work_id == obj['doi']:
            raise ValueError(f"got a work-level zenodo record, not a versioned record: {work_id}")

        # NOTE(review): not used below yet, but the lookup still requires
        # the API response to carry a resource type
        zenodo_type = obj['metadata']['resource_type']['type']

        manifest = []
        for row in obj['files']:
            mf = FilesetManifestFile(
                path=row['key'],
                size=row['size'],
                platform_url=row['links']['self'],
            )
            checksum = row['checksum']
            # eg: md5:35ffcab905f8224556dba76648cb7dad
            if checksum.startswith('md5:'):
                mf.md5 = checksum[len('md5:'):]
            elif checksum.startswith('sha1:'):
                mf.sha1 = checksum[len('sha1:'):]
            manifest.append(mf)

        authors = [author['name'] for author in obj['metadata']['creators']]
        archiveorg_item_name = f"{platform_domain}-{platform_id}"
        archiveorg_item_meta = dict(
            # XXX: collection=platform_domain,
            collection="datasets",
            creator=authors,
            doi=obj['doi'],
            title=obj['metadata']['title'],
            date=obj['metadata']['publication_date'],
            source=obj['links']['html'],
            description=obj['metadata']['description'],
            license=obj['metadata']['license']['id'],
            version=obj['revision'],
            # obj['metadata']['version'] is, eg, git version tag
        )

        return DatasetPlatformItem(
            platform_name=self.platform_name,
            platform_status='success',
            manifest=manifest,
            platform_domain=platform_domain,
            platform_id=platform_id,
            archiveorg_item_name=archiveorg_item_name,
            archiveorg_item_meta=archiveorg_item_meta,
            # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
            extra=dict(version=obj['revision']),
        )

+
class ArchiveOrgHelper(DatasetPlatformHelper):
@@ -436,5 +535,6 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
# Registry of dataset platform helpers: maps each platform name to a single
# helper instance, created when this table is built.
DATASET_PLATFORM_HELPER_TABLE = {
'dataverse': DataverseHelper(),
'figshare': FigshareHelper(),
+ 'zenodo': ZenodoHelper(),
'archiveorg': ArchiveOrgHelper(),
}