author    Bryan Newbold <bnewbold@archive.org>    2021-10-27 18:50:17 -0700
committer Bryan Newbold <bnewbold@archive.org>    2021-10-27 18:50:17 -0700
commit    826c7538e091fac14d987a3cd654975da964e240 (patch)
tree      90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/fileset_platforms.py
parent    020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
make fmt (black 21.9b0)
Diffstat (limited to 'python/sandcrawler/fileset_platforms.py')
-rw-r--r--  python/sandcrawler/fileset_platforms.py  463
1 file changed, 262 insertions, 201 deletions
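The diff below is the mechanical output of black 21.9b0: single-quoted strings are normalized to double quotes, long signatures and import lists are exploded one element per line with a trailing comma, and the old hanging-indent wrapping is replaced by black's closing-bracket style. A minimal before/after sketch, using the match_request signature from this file (assuming the repository's `make fmt` target simply invokes black with its default 88-column line length):

    # before: hanging indent, single-quoted strings
    def match_request(self, request: dict, resource: Optional[ResourceResult],
                      html_biblio: Optional[BiblioMetadata]) -> bool:
        url = request['base_url']

    # after black 21.9b0: one parameter per line, trailing comma added,
    # string quotes normalized to double quotes
    def match_request(
        self,
        request: dict,
        resource: Optional[ResourceResult],
        html_biblio: Optional[BiblioMetadata],
    ) -> bool:
        url = request["base_url"]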
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 86e3ff2..fbe8066 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -4,25 +4,38 @@ from typing import Optional, Tuple
import internetarchive
import requests
-from sandcrawler.fileset_types import (FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
- PlatformRestrictedError, PlatformScopeError)
+from sandcrawler.fileset_types import (
+ FilesetManifestFile,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
-class FilesetPlatformHelper():
+class FilesetPlatformHelper:
def __init__(self):
- self.platform_name = 'unknown'
-
- def match_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> bool:
+ self.platform_name = "unknown"
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
"""
Does this request look like it matches this platform?
"""
raise NotImplementedError()
- def process_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
"""
Fetch platform-specific metadata for this request (eg, via API calls)
"""
@@ -47,7 +60,7 @@ class FilesetPlatformHelper():
class DataverseHelper(FilesetPlatformHelper):
def __init__(self):
super().__init__()
- self.platform_name = 'dataverse'
+ self.platform_name = "dataverse"
self.session = requests.Session()
@staticmethod
@@ -69,16 +82,16 @@ class DataverseHelper(FilesetPlatformHelper):
If there is an error parsing, raises a ValueError
"""
id_type = None
- if pid.startswith('doi:10.'):
- id_type = 'doi'
+ if pid.startswith("doi:10."):
+ id_type = "doi"
pid = pid[4:]
- elif pid.startswith('hdl:'):
- id_type = 'hdl'
+ elif pid.startswith("hdl:"):
+ id_type = "hdl"
pid = pid[4:]
else:
raise ValueError(f"unknown dataverse persistentId format: {pid}")
- comp = pid.split('/')
+ comp = pid.split("/")
if len(comp) < 2:
raise ValueError(f"unknown dataverse persistentId format: {pid}")
@@ -114,19 +127,23 @@ class DataverseHelper(FilesetPlatformHelper):
"file_id": file_id,
}
- def match_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> bool:
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
if resource and resource.terminal_url:
url = resource.terminal_url
else:
- url = request['base_url']
+ url = request["base_url"]
# TODO: could also do HTML platform detection or something?
components = urllib.parse.urlparse(url)
# platform_domain = components.netloc.split(':')[0].lower()
params = urllib.parse.parse_qs(components.query)
- id_param = params.get('persistentId')
+ id_param = params.get("persistentId")
if not id_param:
return False
platform_id = id_param[0]
@@ -138,8 +155,12 @@ class DataverseHelper(FilesetPlatformHelper):
return True
- def process_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
"""
Fetch platform-specific metadata for this request (eg, via API calls)
@@ -150,17 +171,17 @@ class DataverseHelper(FilesetPlatformHelper):
if resource and resource.terminal_url:
url = resource.terminal_url
else:
- url = request['base_url']
+ url = request["base_url"]
# 1. extract domain, PID, and version from URL
components = urllib.parse.urlparse(url)
- platform_domain = components.netloc.split(':')[0].lower()
+ platform_domain = components.netloc.split(":")[0].lower()
params = urllib.parse.parse_qs(components.query)
- id_param = params.get('persistentId')
+ id_param = params.get("persistentId")
if not (id_param and id_param[0]):
raise PlatformScopeError("Expected a Dataverse persistentId in URL")
platform_id = id_param[0]
- version_param = params.get('version')
+ version_param = params.get("version")
dataset_version = None
if version_param:
dataset_version = version_param[0]
@@ -170,10 +191,11 @@ class DataverseHelper(FilesetPlatformHelper):
except ValueError:
raise PlatformScopeError("not actually in scope")
- if parsed_id['file_id']:
+ if parsed_id["file_id"]:
# TODO: maybe we could support this?
raise PlatformScopeError(
- "only entire dataverse datasets can be archived with this tool")
+ "only entire dataverse datasets can be archived with this tool"
+ )
# 1b. if we didn't get a version number from URL, fetch it from API
if not dataset_version:
@@ -182,8 +204,10 @@ class DataverseHelper(FilesetPlatformHelper):
)
resp.raise_for_status()
obj = resp.json()
- obj_latest = obj['data']['latestVersion']
- dataset_version = f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ obj_latest = obj["data"]["latestVersion"]
+ dataset_version = (
+ f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
# 2. API fetch
resp = self.session.get(
@@ -192,69 +216,72 @@ class DataverseHelper(FilesetPlatformHelper):
resp.raise_for_status()
obj = resp.json()
- obj_latest = obj['data']['latestVersion']
- assert dataset_version == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
- assert platform_id == obj_latest['datasetPersistentId']
+ obj_latest = obj["data"]["latestVersion"]
+ assert (
+ dataset_version
+ == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+ assert platform_id == obj_latest["datasetPersistentId"]
manifest = []
- for row in obj_latest['files']:
- df = row['dataFile']
- df_persistent_id = df['persistentId']
+ for row in obj_latest["files"]:
+ df = row["dataFile"]
+ df_persistent_id = df["persistentId"]
platform_url = f"https://{platform_domain}/api/access/datafile/:persistentId/?persistentId={df_persistent_id}"
- if df.get('originalFileName'):
- platform_url += '&format=original'
+ if df.get("originalFileName"):
+ platform_url += "&format=original"
extra = dict()
# TODO: always save the version field?
- if row.get('version') != 1:
- extra['version'] = row['version']
- if 'description' in df:
- extra['description'] = df['description']
+ if row.get("version") != 1:
+ extra["version"] = row["version"]
+ if "description" in df:
+ extra["description"] = df["description"]
manifest.append(
FilesetManifestFile(
- path=df.get('originalFileName') or df['filename'],
- size=df.get('originalFileSize') or df['filesize'],
- md5=df['md5'],
+ path=df.get("originalFileName") or df["filename"],
+ size=df.get("originalFileSize") or df["filesize"],
+ md5=df["md5"],
# NOTE: don't get: sha1, sha256
- mimetype=df['contentType'],
+ mimetype=df["contentType"],
platform_url=platform_url,
extra=extra or None,
- ))
+ )
+ )
- platform_sub_id = platform_id.split('/')[-1]
+ platform_sub_id = platform_id.split("/")[-1]
archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}"
archiveorg_item_meta = dict(
# TODO: collection=platform_domain,
collection="datasets",
- date=obj_latest['releaseTime'].split('T')[0],
- source=
- f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
+ date=obj_latest["releaseTime"].split("T")[0],
+ source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
)
- if platform_id.startswith('doi:10.'):
- archiveorg_item_meta['doi'] = platform_id.replace('doi:', '')
- for block in obj_latest['metadataBlocks']['citation']['fields']:
- if block['typeName'] == 'title':
- archiveorg_item_meta['title'] = block['value']
- elif block['typeName'] == 'depositor':
- archiveorg_item_meta['creator'] = block['value']
- elif block['typeName'] == 'dsDescription':
- archiveorg_item_meta['description'] = block['value'][0]['dsDescriptionValue'][
- 'value']
-
- archiveorg_item_meta['description'] = archiveorg_item_meta.get('description', '')
- if obj_latest.get('termsOfUse'):
- archiveorg_item_meta['description'] += '\n<br>\n' + obj_latest['termsOfUse']
+ if platform_id.startswith("doi:10."):
+ archiveorg_item_meta["doi"] = platform_id.replace("doi:", "")
+ for block in obj_latest["metadataBlocks"]["citation"]["fields"]:
+ if block["typeName"] == "title":
+ archiveorg_item_meta["title"] = block["value"]
+ elif block["typeName"] == "depositor":
+ archiveorg_item_meta["creator"] = block["value"]
+ elif block["typeName"] == "dsDescription":
+ archiveorg_item_meta["description"] = block["value"][0]["dsDescriptionValue"][
+ "value"
+ ]
+
+ archiveorg_item_meta["description"] = archiveorg_item_meta.get("description", "")
+ if obj_latest.get("termsOfUse"):
+ archiveorg_item_meta["description"] += "\n<br>\n" + obj_latest["termsOfUse"]
return FilesetPlatformItem(
platform_name=self.platform_name,
- platform_status='success',
+ platform_status="success",
manifest=manifest,
platform_domain=platform_domain,
platform_id=platform_id,
archiveorg_item_name=archiveorg_item_name,
archiveorg_item_meta=archiveorg_item_meta,
- web_bundle_url=
- f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
+ web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
# TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
extra=dict(version=dataset_version),
)
@@ -301,7 +328,7 @@ def test_parse_dataverse_persistentid() -> None:
}
invalid = [
- #"doi:10.5072/FK2/J8SJZB/LL6WXZ",
+ # "doi:10.5072/FK2/J8SJZB/LL6WXZ",
"doi:10.25625/abcd",
"other:10.25625/LL6WXZ",
"10.25625/LL6WXZ",
@@ -322,7 +349,7 @@ def test_parse_dataverse_persistentid() -> None:
class FigshareHelper(FilesetPlatformHelper):
def __init__(self):
super().__init__()
- self.platform_name = 'figshare'
+ self.platform_name = "figshare"
self.session = requests.Session()
@staticmethod
@@ -337,13 +364,13 @@ class FigshareHelper(FilesetPlatformHelper):
# eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
# /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4
- comp = path.split('/')
- if len(comp) < 4 or comp[1] != 'articles':
+ comp = path.split("/")
+ if len(comp) < 4 or comp[1] != "articles":
raise ValueError(f"not a figshare URL: {path}")
comp = comp[2:]
if comp[0] in [
- 'dataset',
+ "dataset",
]:
comp = comp[1:]
@@ -354,19 +381,23 @@ class FigshareHelper(FilesetPlatformHelper):
else:
raise ValueError(f"couldn't find figshare identiier: {path}")
- def match_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> bool:
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
if resource and resource.terminal_url:
url = resource.terminal_url
else:
- url = request['base_url']
+ url = request["base_url"]
components = urllib.parse.urlparse(url)
- platform_domain = components.netloc.split(':')[0].lower()
+ platform_domain = components.netloc.split(":")[0].lower()
# only work with full, versioned figshare.com URLs
- if 'figshare.com' not in platform_domain:
+ if "figshare.com" not in platform_domain:
return False
try:
@@ -380,8 +411,12 @@ class FigshareHelper(FilesetPlatformHelper):
return False
- def process_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
"""
Fetch platform-specific metadata for this request (eg, via API calls)
"""
@@ -389,15 +424,16 @@ class FigshareHelper(FilesetPlatformHelper):
if resource and resource.terminal_url:
url = resource.terminal_url
else:
- url = request['base_url']
+ url = request["base_url"]
# 1. extract domain, PID, and version from URL
components = urllib.parse.urlparse(url)
- platform_domain = components.netloc.split(':')[0].lower()
+ platform_domain = components.netloc.split(":")[0].lower()
(platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
assert platform_id.isdigit(), f"expected numeric: {platform_id}"
- assert dataset_version and dataset_version.isdigit(
+ assert (
+ dataset_version and dataset_version.isdigit()
), f"expected numeric: {dataset_version}"
# 1b. if we didn't get a version number from URL, fetch it from API
@@ -405,59 +441,60 @@ class FigshareHelper(FilesetPlatformHelper):
# 2. API fetch
resp = self.session.get(
- f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}")
+ f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}"
+ )
resp.raise_for_status()
obj = resp.json()
# figshare_type = obj['defined_type_name']
- if not obj['is_public']:
- raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
- if obj['is_embargoed']:
+ if not obj["is_public"]:
+ raise PlatformRestrictedError(f"record not public: {platform_id} {dataset_version}")
+ if obj["is_embargoed"]:
raise PlatformRestrictedError(
f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})'
)
manifest = []
- for row in obj['files']:
+ for row in obj["files"]:
manifest.append(
FilesetManifestFile(
- path=row['name'],
- size=row['size'],
- md5=row['computed_md5'],
+ path=row["name"],
+ size=row["size"],
+ md5=row["computed_md5"],
# NOTE: don't get: sha1, sha256, mimetype
- platform_url=row['download_url'],
- #extra=dict(),
- ))
- assert not row.get('is_link_only')
+ platform_url=row["download_url"],
+ # extra=dict(),
+ )
+ )
+ assert not row.get("is_link_only")
authors = []
- for author in obj['authors']:
- authors.append(author['full_name'])
+ for author in obj["authors"]:
+ authors.append(author["full_name"])
archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
archiveorg_item_meta = dict(
# TODO: collection=platform_domain,
collection="datasets",
creator=authors,
- doi=obj['doi'],
- title=obj['title'],
- date=obj['published_date'],
- source=obj['url_public_html'],
- description=obj['description'],
- license=obj['license']['url'],
- version=obj['version'],
+ doi=obj["doi"],
+ title=obj["title"],
+ date=obj["published_date"],
+ source=obj["url_public_html"],
+ description=obj["description"],
+ license=obj["license"]["url"],
+ version=obj["version"],
)
return FilesetPlatformItem(
platform_name=self.platform_name,
- platform_status='success',
+ platform_status="success",
manifest=manifest,
platform_domain=platform_domain,
platform_id=platform_id,
archiveorg_item_name=archiveorg_item_name,
archiveorg_item_meta=archiveorg_item_meta,
- web_bundle_url=
- f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
# TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
extra=dict(version=dataset_version),
)
@@ -466,13 +503,19 @@ class FigshareHelper(FilesetPlatformHelper):
def test_parse_figshare_url_path() -> None:
valid = {
- "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1":
- ("8987858", "1"),
- "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858":
- ("8987858", None),
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": (
+ "8987858",
+ "1",
+ ),
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": (
+ "8987858",
+ None,
+ ),
"/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
- "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4":
- ("12127176", "4"),
+ "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": (
+ "12127176",
+ "4",
+ ),
}
invalid = [
@@ -493,25 +536,33 @@ def test_parse_figshare_url_path() -> None:
class ZenodoHelper(FilesetPlatformHelper):
def __init__(self):
super().__init__()
- self.platform_name = 'zenodo'
+ self.platform_name = "zenodo"
self.session = requests.Session()
- def match_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> bool:
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
if resource and resource.terminal_url:
url = resource.terminal_url
else:
- url = request['base_url']
+ url = request["base_url"]
components = urllib.parse.urlparse(url)
- platform_domain = components.netloc.split(':')[0].lower()
- if platform_domain == 'zenodo.org' and '/record/' in components.path:
+ platform_domain = components.netloc.split(":")[0].lower()
+ if platform_domain == "zenodo.org" and "/record/" in components.path:
return True
return False
- def process_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
"""
Fetch platform-specific metadata for this request (eg, via API calls)
"""
@@ -519,7 +570,7 @@ class ZenodoHelper(FilesetPlatformHelper):
if resource and resource.terminal_url:
url = resource.terminal_url
else:
- url = request['base_url']
+ url = request["base_url"]
# TODO: also look in base_url and resource-non-terminal for ident? to
# check for work-level redirects
@@ -527,118 +578,118 @@ class ZenodoHelper(FilesetPlatformHelper):
# 1. extract identifier from URL
# eg: https://zenodo.org/record/5230255
components = urllib.parse.urlparse(url)
- platform_domain = components.netloc.split(':')[0].lower()
- if len(components.path.split('/')) < 2:
+ platform_domain = components.netloc.split(":")[0].lower()
+ if len(components.path.split("/")) < 2:
raise PlatformScopeError("Expected a complete, versioned figshare URL")
- platform_id = components.path.split('/')[2]
+ platform_id = components.path.split("/")[2]
assert platform_id.isdigit(), f"expected numeric: {platform_id}"
- if 'zenodo.org' not in platform_domain:
+ if "zenodo.org" not in platform_domain:
raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
# 2. API fetch
resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}")
if resp.status_code == 410:
- raise PlatformRestrictedError('record deleted')
+ raise PlatformRestrictedError("record deleted")
resp.raise_for_status()
obj = resp.json()
- assert obj['id'] == int(platform_id)
- work_id = obj['conceptrecid']
- if work_id == obj['id']:
+ assert obj["id"] == int(platform_id)
+ work_id = obj["conceptrecid"]
+ if work_id == obj["id"]:
raise PlatformScopeError(
- "got a work-level zenodo record, not a versioned record: {work_id}")
+ "got a work-level zenodo record, not a versioned record: {work_id}"
+ )
# zenodo_type = obj['metadata']['resource_type']['type']
- if obj['metadata']['access_right'] != 'open':
+ if obj["metadata"]["access_right"] != "open":
raise PlatformRestrictedError(
"not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}"
)
manifest = []
- for row in obj['files']:
+ for row in obj["files"]:
mf = FilesetManifestFile(
- path=row['key'],
- size=row['size'],
- platform_url=row['links']['self'],
- #extra=dict(),
+ path=row["key"],
+ size=row["size"],
+ platform_url=row["links"]["self"],
+ # extra=dict(),
)
- checksum = row['checksum']
+ checksum = row["checksum"]
# eg: md5:35ffcab905f8224556dba76648cb7dad
- if checksum.startswith('md5:'):
+ if checksum.startswith("md5:"):
mf.md5 = checksum[4:]
- elif checksum.startswith('sha1:'):
+ elif checksum.startswith("sha1:"):
mf.sha1 = checksum[5:]
manifest.append(mf)
authors = []
- for author in obj['metadata']['creators']:
- authors.append(author['name'])
+ for author in obj["metadata"]["creators"]:
+ authors.append(author["name"])
archiveorg_item_name = f"{platform_domain}-{platform_id}"
archiveorg_item_meta = dict(
# TODO: collection=platform_domain,
collection="datasets",
creator=authors,
- doi=obj['doi'],
- title=obj['metadata']['title'],
- date=obj['metadata']['publication_date'],
- source=obj['links']['html'],
- description=obj['metadata']['description'],
- license=obj['metadata']['license']['id'],
- version=obj['revision'],
+ doi=obj["doi"],
+ title=obj["metadata"]["title"],
+ date=obj["metadata"]["publication_date"],
+ source=obj["links"]["html"],
+ description=obj["metadata"]["description"],
+ license=obj["metadata"]["license"]["id"],
+ version=obj["revision"],
# obj['metadata']['version'] is, eg, git version tag
)
return FilesetPlatformItem(
platform_name=self.platform_name,
- platform_status='success',
+ platform_status="success",
manifest=manifest,
platform_domain=platform_domain,
platform_id=platform_id,
archiveorg_item_name=archiveorg_item_name,
archiveorg_item_meta=archiveorg_item_meta,
- #web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
# TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
- extra=dict(version=obj['revision']),
+ extra=dict(version=obj["revision"]),
)
class ArchiveOrgHelper(FilesetPlatformHelper):
FORMAT_TO_MIMETYPE = {
- 'BZIP': 'application/x-bzip',
- 'BZIP2': 'application/x-bzip2',
- 'ZIP': 'application/zip',
- 'GZIP': 'application/gzip',
- 'RAR': 'application/vnd.rar',
- 'TAR': 'application/x-tar',
- '7z': 'application/x-7z-compressed',
- 'HTML': 'text/html',
- 'Text': 'text/plain',
- 'PDF': 'application/pdf',
- 'CSV': 'text/csv',
- 'XML': 'application/xml',
- 'JSON': 'application/json',
-
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
#'application/msword (.doc)', # .doc
#'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
#'application/vnd.ms-excel', # .xls
#'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
- 'MP3': 'audio/mpeg', # .mp3
- 'MP4': 'video/mp4', # .mp4
- 'MPEG': 'video/mpeg', # .mpeg
- 'JPEG': 'image/jpeg',
- 'GIF': 'image/gif',
- 'PNG': 'image/png',
- 'TIFF': 'image/tiff',
- 'Unknown': None,
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
}
def __init__(self):
super().__init__()
- self.platform_name = 'archiveorg'
+ self.platform_name = "archiveorg"
self.session = internetarchive.get_session()
@staticmethod
@@ -646,69 +697,79 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
"""
Filters IA API files
"""
- if f.source != 'original':
+ if f.source != "original":
return False
for suffix in [
- '_meta.sqlite',
- '_archive.torrent',
- '_itemimage.jpg',
- '_meta.xml',
- '_thumb.png',
- '_files.xml',
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
]:
if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
return False
- if f.name.startswith('_'):
+ if f.name.startswith("_"):
return False
- if item_name.startswith('academictorrents_'):
+ if item_name.startswith("academictorrents_"):
for suffix in [
- '_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib'
+ "_academictorrents.torrent",
+ "_academictorrents_torrent.txt",
+ ".bib",
]:
if f.name == item_name + suffix:
return False
return True
- def match_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> bool:
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
if resource and resource.terminal_url:
url = resource.terminal_url
else:
- url = request['base_url']
+ url = request["base_url"]
patterns = [
- '://archive.org/details/',
- '://archive.org/download/',
+ "://archive.org/details/",
+ "://archive.org/download/",
]
for p in patterns:
if p in url:
return True
return False
- def process_request(self, request: dict, resource: Optional[ResourceResult],
- html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
"""
Fetch platform-specific metadata for this request (eg, via API calls)
"""
- base_url_split = request['base_url'].split('/')
- #print(base_url_split, file=sys.stderr)
+ base_url_split = request["base_url"].split("/")
+ # print(base_url_split, file=sys.stderr)
assert len(base_url_split) in [5, 6]
- assert base_url_split[0] in ['http:', 'https:']
- assert base_url_split[2] == 'archive.org'
- assert base_url_split[3] in ['details', 'download']
+ assert base_url_split[0] in ["http:", "https:"]
+ assert base_url_split[2] == "archive.org"
+ assert base_url_split[3] in ["details", "download"]
item_name = base_url_split[4]
if len(base_url_split) == 6 and base_url_split[5]:
raise PlatformScopeError(
"got an archive.org file path, not download/details page; individual files not handled yet"
)
- #print(f" archiveorg processing item={item_name}", file=sys.stderr)
+ # print(f" archiveorg processing item={item_name}", file=sys.stderr)
item = self.session.get_item(item_name)
item_name = item.identifier
- item_collection = item.metadata['collection']
+ item_collection = item.metadata["collection"]
if type(item_collection) == list:
item_collection = item_collection[0]
- assert item.metadata['mediatype'] not in ['collection', 'web']
+ assert item.metadata["mediatype"] not in ["collection", "web"]
item_files = item.get_files(on_the_fly=False)
item_files = [f for f in item_files if self.want_item_file(f, item_name)]
manifest = []
@@ -727,9 +788,9 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
return FilesetPlatformItem(
platform_name=self.platform_name,
- platform_status='success',
+ platform_status="success",
manifest=manifest,
- platform_domain='archive.org',
+ platform_domain="archive.org",
platform_id=item_name,
archiveorg_item_name=item_name,
archiveorg_meta=dict(collection=item_collection),