aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_platforms.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-06 19:12:29 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-15 18:15:29 -0700
commit9409b9436c12c6a7a9956a0a20139d8d6776715a (patch)
tree8b45eda41a496c510ddb42257122b63e4b68e858 /python/sandcrawler/fileset_platforms.py
parent93adbbb98eff0c148bada84f794783b733c58f73 (diff)
downloadsandcrawler-9409b9436c12c6a7a9956a0a20139d8d6776715a.tar.gz
sandcrawler-9409b9436c12c6a7a9956a0a20139d8d6776715a.zip
initial figshare platform helper
Diffstat (limited to 'python/sandcrawler/fileset_platforms.py')
-rw-r--r--python/sandcrawler/fileset_platforms.py95
1 files changed, 95 insertions, 0 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index ac8a3af..79f4033 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -195,6 +195,100 @@ class DataverseHelper(DatasetPlatformHelper):
extra=dict(version=dataset_version),
)
class FigshareHelper(DatasetPlatformHelper):
    """
    Platform helper for figshare.com datasets.

    Only handles complete, versioned figshare article URLs (paths like
    /articles/<type>/<title>/<article_id>/<version>); metadata and the file
    manifest are fetched from the public figshare v2 API.
    """

    def __init__(self):
        self.platform_name = 'figshare'
        self.session = requests.Session()

    def match_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
        """
        Returns True if the terminal (or, failing that, base) URL looks like
        a full, versioned figshare article URL.
        """

        if resource and resource.terminal_url:
            url = resource.terminal_url
        else:
            url = request['base_url']

        components = urllib.parse.urlparse(url)
        platform_domain = components.netloc.split(':')[0].lower()
        # only work with full, versioned figshare URLs
        if 'figshare.com' in platform_domain and '/articles/' in components.path and len(components.path.split('/')) >= 6:
            return True
        return False

    def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
        """
        Fetch platform-specific metadata for this request (eg, via API calls)

        Raises ValueError for URLs that are not complete, versioned figshare
        article URLs, and requests.HTTPError if the figshare API call fails.
        """

        if resource and resource.terminal_url:
            url = resource.terminal_url
        else:
            url = request['base_url']

        # 1. extract domain, PID, and version from URL
        components = urllib.parse.urlparse(url)
        platform_domain = components.netloc.split(':')[0].lower()
        # validate the domain up front, before bothering to parse the path
        if 'figshare' not in platform_domain:
            raise ValueError(f"unexpected figshare domain: {platform_domain}")

        path_parts = components.path.split('/')
        if len(path_parts) < 6:
            raise ValueError("Expected a complete, versioned figshare URL")

        platform_id = path_parts[4]
        dataset_version = path_parts[5]
        # explicit raises (not `assert`) so validation survives `python -O`
        if not platform_id.isdigit():
            raise ValueError(f"expected numeric figshare article id: {platform_id}")
        if not dataset_version.isdigit():
            raise ValueError(f"expected numeric figshare version: {dataset_version}")

        # 1b. if we didn't get a version number from URL, fetch it from API
        # XXX (TODO: not yet implemented; un-versioned URLs are rejected above)

        # 2. API fetch; fail loudly on HTTP errors instead of trying to parse
        # an error page as JSON
        resp = self.session.get(f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}")
        resp.raise_for_status()
        obj = resp.json()

        manifest = []
        for row in obj['files']:
            manifest.append(FilesetManifestFile(
                path=row['name'],
                size=row['size'],
                md5=row['computed_md5'],
                # NOTE: don't get: sha1, sha256, mimetype
                platform_url=row['download_url'],
                #extra=dict(),
            ))
            # link-only "files" have no downloadable content; refuse to emit a
            # partial manifest (was a bare assert, stripped under `python -O`)
            if row.get('is_link_only'):
                raise ValueError(f"figshare file is link-only: {row['name']}")

        authors = [a['full_name'] for a in obj['authors']]
        archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
        archiveorg_item_meta = dict(
            # XXX: collection=platform_domain,
            collection="datasets",
            creator=authors,
            doi=obj['doi'],
            title=obj['title'],
            date=obj['published_date'],
            source=obj['url_public_html'],
            description=obj['description'],
            license=obj['license']['url'],
            version=obj['version'],
        )

        return DatasetPlatformItem(
            platform_name=self.platform_name,
            platform_status='success',
            manifest=manifest,
            platform_domain=platform_domain,
            platform_id=platform_id,
            archiveorg_item_name=archiveorg_item_name,
            archiveorg_item_meta=archiveorg_item_meta,
            web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
            # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
            extra=dict(version=dataset_version),
        )
+
class ArchiveOrgHelper(DatasetPlatformHelper):
@@ -341,5 +435,6 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
# Registry mapping a platform key (as selected during ingest) to a shared
# helper instance for that dataset platform.
DATASET_PLATFORM_HELPER_TABLE = {
    'dataverse': DataverseHelper(),
+    'figshare': FigshareHelper(),
    'archiveorg': ArchiveOrgHelper(),
}