From 826c7538e091fac14d987a3cd654975da964e240 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 27 Oct 2021 18:50:17 -0700
Subject: make fmt (black 21.9b0)

---
 python/sandcrawler/fileset_platforms.py | 463 ++++++++++++++++++--------------
 1 file changed, 262 insertions(+), 201 deletions(-)

diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 86e3ff2..fbe8066 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -4,25 +4,38 @@ from typing import Optional, Tuple
 import internetarchive
 import requests
 
-from sandcrawler.fileset_types import (FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
-                                       PlatformRestrictedError, PlatformScopeError)
+from sandcrawler.fileset_types import (
+    FilesetManifestFile,
+    FilesetPlatformItem,
+    IngestStrategy,
+    PlatformRestrictedError,
+    PlatformScopeError,
+)
 from sandcrawler.html_metadata import BiblioMetadata
 from sandcrawler.ia import ResourceResult
 
 
-class FilesetPlatformHelper():
+class FilesetPlatformHelper:
     def __init__(self):
-        self.platform_name = 'unknown'
-
-    def match_request(self, request: dict, resource: Optional[ResourceResult],
-                      html_biblio: Optional[BiblioMetadata]) -> bool:
+        self.platform_name = "unknown"
+
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
         """
         Does this request look like it matches this platform?
         """
         raise NotImplementedError()
 
-    def process_request(self, request: dict, resource: Optional[ResourceResult],
-                        html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
         """
         Fetch platform-specific metadata for this request (eg, via API calls)
         """
@@ -47,7 +60,7 @@ class FilesetPlatformHelper():
 class DataverseHelper(FilesetPlatformHelper):
     def __init__(self):
         super().__init__()
-        self.platform_name = 'dataverse'
+        self.platform_name = "dataverse"
         self.session = requests.Session()
 
     @staticmethod
@@ -69,16 +82,16 @@ class DataverseHelper(FilesetPlatformHelper):
         If there is an error parsing, raises a ValueError
         """
         id_type = None
-        if pid.startswith('doi:10.'):
-            id_type = 'doi'
+        if pid.startswith("doi:10."):
+            id_type = "doi"
             pid = pid[4:]
-        elif pid.startswith('hdl:'):
-            id_type = 'hdl'
+        elif pid.startswith("hdl:"):
+            id_type = "hdl"
             pid = pid[4:]
         else:
             raise ValueError(f"unknown dataverse persistentId format: {pid}")
 
-        comp = pid.split('/')
+        comp = pid.split("/")
         if len(comp) < 2:
             raise ValueError(f"unknown dataverse persistentId format: {pid}")
 
@@ -114,19 +127,23 @@ class DataverseHelper(FilesetPlatformHelper):
             "file_id": file_id,
         }
 
-    def match_request(self, request: dict, resource: Optional[ResourceResult],
-                      html_biblio: Optional[BiblioMetadata]) -> bool:
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
         if resource and resource.terminal_url:
             url = resource.terminal_url
         else:
-            url = request['base_url']
+            url = request["base_url"]
 
         # TODO: could also do HTML platform detection or something?
         components = urllib.parse.urlparse(url)
         # platform_domain = components.netloc.split(':')[0].lower()
         params = urllib.parse.parse_qs(components.query)
-        id_param = params.get('persistentId')
+        id_param = params.get("persistentId")
         if not id_param:
             return False
         platform_id = id_param[0]
@@ -138,8 +155,12 @@ class DataverseHelper(FilesetPlatformHelper):
 
         return True
 
-    def process_request(self, request: dict, resource: Optional[ResourceResult],
-                        html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
         """
         Fetch platform-specific metadata for this request (eg, via API calls)
 
@@ -150,17 +171,17 @@ class DataverseHelper(FilesetPlatformHelper):
         if resource and resource.terminal_url:
             url = resource.terminal_url
         else:
-            url = request['base_url']
+            url = request["base_url"]
 
         # 1. extract domain, PID, and version from URL
         components = urllib.parse.urlparse(url)
-        platform_domain = components.netloc.split(':')[0].lower()
+        platform_domain = components.netloc.split(":")[0].lower()
         params = urllib.parse.parse_qs(components.query)
-        id_param = params.get('persistentId')
+        id_param = params.get("persistentId")
         if not (id_param and id_param[0]):
             raise PlatformScopeError("Expected a Dataverse persistentId in URL")
         platform_id = id_param[0]
-        version_param = params.get('version')
+        version_param = params.get("version")
         dataset_version = None
         if version_param:
             dataset_version = version_param[0]
@@ -170,10 +191,11 @@ class DataverseHelper(FilesetPlatformHelper):
         except ValueError:
             raise PlatformScopeError("not actually in scope")
 
-        if parsed_id['file_id']:
+        if parsed_id["file_id"]:
             # TODO: maybe we could support this?
             raise PlatformScopeError(
-                "only entire dataverse datasets can be archived with this tool")
+                "only entire dataverse datasets can be archived with this tool"
+            )
 
         # 1b. if we didn't get a version number from URL, fetch it from API
         if not dataset_version:
@@ -182,8 +204,10 @@ class DataverseHelper(FilesetPlatformHelper):
             )
             resp.raise_for_status()
             obj = resp.json()
-            obj_latest = obj['data']['latestVersion']
-            dataset_version = f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+            obj_latest = obj["data"]["latestVersion"]
+            dataset_version = (
+                f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+            )
 
         # 2. API fetch
         resp = self.session.get(
@@ -192,69 +216,72 @@ class DataverseHelper(FilesetPlatformHelper):
         resp.raise_for_status()
         obj = resp.json()
 
-        obj_latest = obj['data']['latestVersion']
-        assert dataset_version == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
-        assert platform_id == obj_latest['datasetPersistentId']
+        obj_latest = obj["data"]["latestVersion"]
+        assert (
+            dataset_version
+            == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+        )
+        assert platform_id == obj_latest["datasetPersistentId"]
 
         manifest = []
-        for row in obj_latest['files']:
-            df = row['dataFile']
-            df_persistent_id = df['persistentId']
+        for row in obj_latest["files"]:
+            df = row["dataFile"]
+            df_persistent_id = df["persistentId"]
             platform_url = f"https://{platform_domain}/api/access/datafile/:persistentId/?persistentId={df_persistent_id}"
-            if df.get('originalFileName'):
-                platform_url += '&format=original'
+            if df.get("originalFileName"):
+                platform_url += "&format=original"
 
             extra = dict()
             # TODO: always save the version field?
-            if row.get('version') != 1:
-                extra['version'] = row['version']
-            if 'description' in df:
-                extra['description'] = df['description']
+            if row.get("version") != 1:
+                extra["version"] = row["version"]
+            if "description" in df:
+                extra["description"] = df["description"]
             manifest.append(
                 FilesetManifestFile(
-                    path=df.get('originalFileName') or df['filename'],
-                    size=df.get('originalFileSize') or df['filesize'],
-                    md5=df['md5'],
+                    path=df.get("originalFileName") or df["filename"],
+                    size=df.get("originalFileSize") or df["filesize"],
+                    md5=df["md5"],
                     # NOTE: don't get: sha1, sha256
-                    mimetype=df['contentType'],
+                    mimetype=df["contentType"],
                     platform_url=platform_url,
                     extra=extra or None,
-                ))
+                )
+            )
 
-        platform_sub_id = platform_id.split('/')[-1]
+        platform_sub_id = platform_id.split("/")[-1]
         archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}"
         archiveorg_item_meta = dict(
             # TODO: collection=platform_domain,
             collection="datasets",
-            date=obj_latest['releaseTime'].split('T')[0],
-            source=
-            f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
+            date=obj_latest["releaseTime"].split("T")[0],
+            source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
         )
-        if platform_id.startswith('doi:10.'):
-            archiveorg_item_meta['doi'] = platform_id.replace('doi:', '')
-        for block in obj_latest['metadataBlocks']['citation']['fields']:
-            if block['typeName'] == 'title':
-                archiveorg_item_meta['title'] = block['value']
-            elif block['typeName'] == 'depositor':
-                archiveorg_item_meta['creator'] = block['value']
-            elif block['typeName'] == 'dsDescription':
-                archiveorg_item_meta['description'] = block['value'][0]['dsDescriptionValue'][
-                    'value']
-
-        archiveorg_item_meta['description'] = archiveorg_item_meta.get('description', '')
-        if obj_latest.get('termsOfUse'):
-            archiveorg_item_meta['description'] += '\n\n' + obj_latest['termsOfUse']
+        if platform_id.startswith("doi:10."):
+            archiveorg_item_meta["doi"] = platform_id.replace("doi:", "")
+        for block in obj_latest["metadataBlocks"]["citation"]["fields"]:
+            if block["typeName"] == "title":
+                archiveorg_item_meta["title"] = block["value"]
+            elif block["typeName"] == "depositor":
+                archiveorg_item_meta["creator"] = block["value"]
+            elif block["typeName"] == "dsDescription":
+                archiveorg_item_meta["description"] = block["value"][0]["dsDescriptionValue"][
+                    "value"
+                ]
+
+        archiveorg_item_meta["description"] = archiveorg_item_meta.get("description", "")
+        if obj_latest.get("termsOfUse"):
+            archiveorg_item_meta["description"] += "\n\n" + obj_latest["termsOfUse"]
 
         return FilesetPlatformItem(
             platform_name=self.platform_name,
-            platform_status='success',
+            platform_status="success",
             manifest=manifest,
             platform_domain=platform_domain,
             platform_id=platform_id,
             archiveorg_item_name=archiveorg_item_name,
             archiveorg_item_meta=archiveorg_item_meta,
-            web_bundle_url=
-            f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
+            web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
             # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
             extra=dict(version=dataset_version),
         )
@@ -301,7 +328,7 @@ def test_parse_dataverse_persistentid() -> None:
     }
 
     invalid = [
-        #"doi:10.5072/FK2/J8SJZB/LL6WXZ",
+        # "doi:10.5072/FK2/J8SJZB/LL6WXZ",
         "doi:10.25625/abcd",
         "other:10.25625/LL6WXZ",
         "10.25625/LL6WXZ",
@@ -322,7 +349,7 @@ def test_parse_dataverse_persistentid() -> None:
 class FigshareHelper(FilesetPlatformHelper):
     def __init__(self):
         super().__init__()
-        self.platform_name = 'figshare'
+        self.platform_name = "figshare"
         self.session = requests.Session()
 
     @staticmethod
@@ -337,13 +364,13 @@ class FigshareHelper(FilesetPlatformHelper):
         # eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
         # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4
 
-        comp = path.split('/')
-        if len(comp) < 4 or comp[1] != 'articles':
+        comp = path.split("/")
+        if len(comp) < 4 or comp[1] != "articles":
             raise ValueError(f"not a figshare URL: {path}")
 
         comp = comp[2:]
         if comp[0] in [
-                'dataset',
+            "dataset",
         ]:
             comp = comp[1:]
 
@@ -354,19 +381,23 @@ class FigshareHelper(FilesetPlatformHelper):
         else:
             raise ValueError(f"couldn't find figshare identifier: {path}")
 
-    def match_request(self, request: dict, resource: Optional[ResourceResult],
-                      html_biblio: Optional[BiblioMetadata]) -> bool:
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
         if resource and resource.terminal_url:
             url = resource.terminal_url
         else:
-            url = request['base_url']
+            url = request["base_url"]
 
         components = urllib.parse.urlparse(url)
-        platform_domain = components.netloc.split(':')[0].lower()
+        platform_domain = components.netloc.split(":")[0].lower()
 
         # only work with full, versioned figshare.com URLs
-        if 'figshare.com' not in platform_domain:
+        if "figshare.com" not in platform_domain:
             return False
 
         try:
@@ -380,8 +411,12 @@ class FigshareHelper(FilesetPlatformHelper):
 
         return False
 
-    def process_request(self, request: dict, resource: Optional[ResourceResult],
-                        html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
         """
         Fetch platform-specific metadata for this request (eg, via API calls)
         """
@@ -389,15 +424,16 @@ class FigshareHelper(FilesetPlatformHelper):
         if resource and resource.terminal_url:
             url = resource.terminal_url
         else:
-            url = request['base_url']
+            url = request["base_url"]
 
         # 1. extract domain, PID, and version from URL
         components = urllib.parse.urlparse(url)
-        platform_domain = components.netloc.split(':')[0].lower()
+        platform_domain = components.netloc.split(":")[0].lower()
         (platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
         assert platform_id.isdigit(), f"expected numeric: {platform_id}"
-        assert dataset_version and dataset_version.isdigit(
+        assert (
+            dataset_version and dataset_version.isdigit()
         ), f"expected numeric: {dataset_version}"
 
         # 1b. if we didn't get a version number from URL, fetch it from API
@@ -405,59 +441,60 @@ class FigshareHelper(FilesetPlatformHelper):
 
         # 2. API fetch
         resp = self.session.get(
-            f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}")
+            f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}"
+        )
         resp.raise_for_status()
         obj = resp.json()
 
         # figshare_type = obj['defined_type_name']
 
-        if not obj['is_public']:
-            raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
-        if obj['is_embargoed']:
+        if not obj["is_public"]:
+            raise PlatformRestrictedError(f"record not public: {platform_id} {dataset_version}")
+        if obj["is_embargoed"]:
             raise PlatformRestrictedError(
                 f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})'
             )
 
         manifest = []
-        for row in obj['files']:
+        for row in obj["files"]:
             manifest.append(
                 FilesetManifestFile(
-                    path=row['name'],
-                    size=row['size'],
-                    md5=row['computed_md5'],
+                    path=row["name"],
+                    size=row["size"],
+                    md5=row["computed_md5"],
                     # NOTE: don't get: sha1, sha256, mimetype
-                    platform_url=row['download_url'],
-                    #extra=dict(),
-                ))
-            assert not row.get('is_link_only')
+                    platform_url=row["download_url"],
+                    # extra=dict(),
+                )
+            )
+            assert not row.get("is_link_only")
 
         authors = []
-        for author in obj['authors']:
-            authors.append(author['full_name'])
+        for author in obj["authors"]:
+            authors.append(author["full_name"])
         archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
         archiveorg_item_meta = dict(
             # TODO: collection=platform_domain,
             collection="datasets",
             creator=authors,
-            doi=obj['doi'],
-            title=obj['title'],
-            date=obj['published_date'],
-            source=obj['url_public_html'],
-            description=obj['description'],
-            license=obj['license']['url'],
-            version=obj['version'],
+            doi=obj["doi"],
+            title=obj["title"],
+            date=obj["published_date"],
+            source=obj["url_public_html"],
+            description=obj["description"],
+            license=obj["license"]["url"],
+            version=obj["version"],
         )
 
         return FilesetPlatformItem(
             platform_name=self.platform_name,
-            platform_status='success',
+            platform_status="success",
             manifest=manifest,
             platform_domain=platform_domain,
             platform_id=platform_id,
             archiveorg_item_name=archiveorg_item_name,
             archiveorg_item_meta=archiveorg_item_meta,
-            web_bundle_url=
-            f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+            web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
             # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
             extra=dict(version=dataset_version),
         )
@@ -466,13 +503,19 @@ def test_parse_figshare_url_path() -> None:
     valid = {
-        "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1":
-        ("8987858", "1"),
-        "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858":
-        ("8987858", None),
+        "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": (
+            "8987858",
+            "1",
+        ),
+        "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": (
+            "8987858",
+            None,
+        ),
         "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
-        "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4":
-        ("12127176", "4"),
+        "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": (
+            "12127176",
+            "4",
+        ),
     }
 
     invalid = [
@@ -493,25 +536,33 @@ def test_parse_figshare_url_path() -> None:
 class ZenodoHelper(FilesetPlatformHelper):
     def __init__(self):
         super().__init__()
-        self.platform_name = 'zenodo'
+        self.platform_name = "zenodo"
         self.session = requests.Session()
 
-    def match_request(self, request: dict, resource: Optional[ResourceResult],
-                      html_biblio: Optional[BiblioMetadata]) -> bool:
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
         if resource and resource.terminal_url:
             url = resource.terminal_url
         else:
-            url = request['base_url']
+            url = request["base_url"]
 
         components = urllib.parse.urlparse(url)
-        platform_domain = components.netloc.split(':')[0].lower()
-        if platform_domain == 'zenodo.org' and '/record/' in components.path:
+        platform_domain = components.netloc.split(":")[0].lower()
+        if platform_domain == "zenodo.org" and "/record/" in components.path:
             return True
         return False
 
-    def process_request(self, request: dict, resource: Optional[ResourceResult],
-                        html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
         """
         Fetch platform-specific metadata for this request (eg, via API calls)
         """
@@ -519,7 +570,7 @@ class ZenodoHelper(FilesetPlatformHelper):
         if resource and resource.terminal_url:
             url = resource.terminal_url
         else:
-            url = request['base_url']
+            url = request["base_url"]
 
         # TODO: also look in base_url and resource-non-terminal for ident? to
         # check for work-level redirects
@@ -527,118 +578,118 @@ class ZenodoHelper(FilesetPlatformHelper):
         # 1. extract identifier from URL
         # eg: https://zenodo.org/record/5230255
         components = urllib.parse.urlparse(url)
-        platform_domain = components.netloc.split(':')[0].lower()
-        if len(components.path.split('/')) < 2:
+        platform_domain = components.netloc.split(":")[0].lower()
+        if len(components.path.split("/")) < 2:
             raise PlatformScopeError("Expected a complete, versioned zenodo URL")
-        platform_id = components.path.split('/')[2]
+        platform_id = components.path.split("/")[2]
         assert platform_id.isdigit(), f"expected numeric: {platform_id}"
-        if 'zenodo.org' not in platform_domain:
+        if "zenodo.org" not in platform_domain:
             raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
 
         # 2. API fetch
         resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}")
         if resp.status_code == 410:
-            raise PlatformRestrictedError('record deleted')
+            raise PlatformRestrictedError("record deleted")
         resp.raise_for_status()
         obj = resp.json()
 
-        assert obj['id'] == int(platform_id)
-        work_id = obj['conceptrecid']
-        if work_id == obj['id']:
+        assert obj["id"] == int(platform_id)
+        work_id = obj["conceptrecid"]
+        if work_id == obj["id"]:
             raise PlatformScopeError(
-                f"got a work-level zenodo record, not a versioned record: {work_id}")
+                f"got a work-level zenodo record, not a versioned record: {work_id}"
+            )
 
         # zenodo_type = obj['metadata']['resource_type']['type']
 
-        if obj['metadata']['access_right'] != 'open':
+        if obj["metadata"]["access_right"] != "open":
             raise PlatformRestrictedError(
                 f"not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}"
             )
 
         manifest = []
-        for row in obj['files']:
+        for row in obj["files"]:
             mf = FilesetManifestFile(
-                path=row['key'],
-                size=row['size'],
-                platform_url=row['links']['self'],
-                #extra=dict(),
+                path=row["key"],
+                size=row["size"],
+                platform_url=row["links"]["self"],
+                # extra=dict(),
             )
-            checksum = row['checksum']
+            checksum = row["checksum"]
             # eg: md5:35ffcab905f8224556dba76648cb7dad
-            if checksum.startswith('md5:'):
+            if checksum.startswith("md5:"):
                 mf.md5 = checksum[4:]
-            elif checksum.startswith('sha1:'):
+            elif checksum.startswith("sha1:"):
                 mf.sha1 = checksum[5:]
             manifest.append(mf)
 
         authors = []
-        for author in obj['metadata']['creators']:
-            authors.append(author['name'])
+        for author in obj["metadata"]["creators"]:
+            authors.append(author["name"])
         archiveorg_item_name = f"{platform_domain}-{platform_id}"
         archiveorg_item_meta = dict(
             # TODO: collection=platform_domain,
             collection="datasets",
             creator=authors,
-            doi=obj['doi'],
-            title=obj['metadata']['title'],
-            date=obj['metadata']['publication_date'],
-            source=obj['links']['html'],
-            description=obj['metadata']['description'],
-            license=obj['metadata']['license']['id'],
-            version=obj['revision'],
+            doi=obj["doi"],
+            title=obj["metadata"]["title"],
+            date=obj["metadata"]["publication_date"],
+            source=obj["links"]["html"],
+            description=obj["metadata"]["description"],
+            license=obj["metadata"]["license"]["id"],
+            version=obj["revision"],
             # obj['metadata']['version'] is, eg, git version tag
         )
 
         return FilesetPlatformItem(
             platform_name=self.platform_name,
-            platform_status='success',
+            platform_status="success",
             manifest=manifest,
             platform_domain=platform_domain,
             platform_id=platform_id,
             archiveorg_item_name=archiveorg_item_name,
             archiveorg_item_meta=archiveorg_item_meta,
-            #web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+            # web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
             # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
-            extra=dict(version=obj['revision']),
+            extra=dict(version=obj["revision"]),
         )
 
 
 class ArchiveOrgHelper(FilesetPlatformHelper):
 
     FORMAT_TO_MIMETYPE = {
-        'BZIP': 'application/x-bzip',
-        'BZIP2': 'application/x-bzip2',
-        'ZIP': 'application/zip',
-        'GZIP': 'application/gzip',
-        'RAR': 'application/vnd.rar',
-        'TAR': 'application/x-tar',
-        '7z': 'application/x-7z-compressed',
-        'HTML': 'text/html',
-        'Text': 'text/plain',
-        'PDF': 'application/pdf',
-        'CSV': 'text/csv',
-        'XML': 'application/xml',
-        'JSON': 'application/json',
-
+        "BZIP": "application/x-bzip",
+        "BZIP2": "application/x-bzip2",
+        "ZIP": "application/zip",
+        "GZIP": "application/gzip",
+        "RAR": "application/vnd.rar",
+        "TAR": "application/x-tar",
+        "7z": "application/x-7z-compressed",
+        "HTML": "text/html",
+        "Text": "text/plain",
+        "PDF": "application/pdf",
+        "CSV": "text/csv",
+        "XML": "application/xml",
+        "JSON": "application/json",
        #'application/msword (.doc)', # .doc
        #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
        #'application/vnd.ms-excel', # .xls
        #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
-        'MP3': 'audio/mpeg', # .mp3
-        'MP4': 'video/mp4', # .mp4
-        'MPEG': 'video/mpeg', # .mpeg
-        'JPEG': 'image/jpeg',
-        'GIF': 'image/gif',
-        'PNG': 'image/png',
-        'TIFF': 'image/tiff',
-        'Unknown': None,
+        "MP3": "audio/mpeg",  # .mp3
+        "MP4": "video/mp4",  # .mp4
+        "MPEG": "video/mpeg",  # .mpeg
+        "JPEG": "image/jpeg",
+        "GIF": "image/gif",
+        "PNG": "image/png",
+        "TIFF": "image/tiff",
+        "Unknown": None,
     }
 
     def __init__(self):
         super().__init__()
-        self.platform_name = 'archiveorg'
+        self.platform_name = "archiveorg"
        self.session = internetarchive.get_session()
 
     @staticmethod
@@ -646,69 +697,79 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
         """
         Filters IA API files
         """
-        if f.source != 'original':
+        if f.source != "original":
             return False
         for suffix in [
-                '_meta.sqlite',
-                '_archive.torrent',
-                '_itemimage.jpg',
-                '_meta.xml',
-                '_thumb.png',
-                '_files.xml',
+            "_meta.sqlite",
+            "_archive.torrent",
+            "_itemimage.jpg",
+            "_meta.xml",
+            "_thumb.png",
+            "_files.xml",
         ]:
             if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
                 return False
-        if f.name.startswith('_'):
+        if f.name.startswith("_"):
             return False
-        if item_name.startswith('academictorrents_'):
+        if item_name.startswith("academictorrents_"):
             for suffix in [
-                    '_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib'
+                "_academictorrents.torrent",
+                "_academictorrents_torrent.txt",
+                ".bib",
             ]:
                 if f.name == item_name + suffix:
                     return False
         return True
 
-    def match_request(self, request: dict, resource: Optional[ResourceResult],
-                      html_biblio: Optional[BiblioMetadata]) -> bool:
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
         if resource and resource.terminal_url:
             url = resource.terminal_url
         else:
-            url = request['base_url']
+            url = request["base_url"]
         patterns = [
-            '://archive.org/details/',
-            '://archive.org/download/',
+            "://archive.org/details/",
+            "://archive.org/download/",
         ]
         for p in patterns:
             if p in url:
                 return True
         return False
 
-    def process_request(self, request: dict, resource: Optional[ResourceResult],
-                        html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem:
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
         """
         Fetch platform-specific metadata for this request (eg, via API calls)
         """
 
-        base_url_split = request['base_url'].split('/')
-        #print(base_url_split, file=sys.stderr)
+        base_url_split = request["base_url"].split("/")
+        # print(base_url_split, file=sys.stderr)
         assert len(base_url_split) in [5, 6]
-        assert base_url_split[0] in ['http:', 'https:']
-        assert base_url_split[2] == 'archive.org'
-        assert base_url_split[3] in ['details', 'download']
+        assert base_url_split[0] in ["http:", "https:"]
+        assert base_url_split[2] == "archive.org"
+        assert base_url_split[3] in ["details", "download"]
         item_name = base_url_split[4]
         if len(base_url_split) == 6 and base_url_split[5]:
             raise PlatformScopeError(
                 "got an archive.org file path, not download/details page; individual files not handled yet"
             )
 
-        #print(f"  archiveorg processing item={item_name}", file=sys.stderr)
+        # print(f"  archiveorg processing item={item_name}", file=sys.stderr)
         item = self.session.get_item(item_name)
         item_name = item.identifier
-        item_collection = item.metadata['collection']
+        item_collection = item.metadata["collection"]
         if type(item_collection) == list:
             item_collection = item_collection[0]
-        assert item.metadata['mediatype'] not in ['collection', 'web']
+        assert item.metadata["mediatype"] not in ["collection", "web"]
         item_files = item.get_files(on_the_fly=False)
         item_files = [f for f in item_files if self.want_item_file(f, item_name)]
         manifest = []
@@ -727,9 +788,9 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
 
         return FilesetPlatformItem(
             platform_name=self.platform_name,
-            platform_status='success',
+            platform_status="success",
             manifest=manifest,
-            platform_domain='archive.org',
+            platform_domain="archive.org",
             platform_id=item_name,
             archiveorg_item_name=item_name,
             archiveorg_meta=dict(collection=item_collection),
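
Usage note (a sketch, not part of the patch): the helper API being reformatted above is a two-step protocol. match_request() is a cheap URL check that decides whether a request belongs to a platform; process_request() then calls the platform API and builds a file manifest. A minimal dispatch loop over the helpers might look like the following; find_platform_item() and the helper ordering are illustrative assumptions, not code from this repository.

    # Hypothetical dispatch sketch; the real sandcrawler ingest worker is not
    # shown in this diff. Classes and exception types come from the modules above.
    from typing import Optional

    from sandcrawler.fileset_platforms import (
        ArchiveOrgHelper,
        DataverseHelper,
        FigshareHelper,
        ZenodoHelper,
    )
    from sandcrawler.fileset_types import (
        FilesetPlatformItem,
        PlatformRestrictedError,
        PlatformScopeError,
    )

    # assumed ordering: most specific platforms first
    HELPERS = [DataverseHelper(), FigshareHelper(), ZenodoHelper(), ArchiveOrgHelper()]

    def find_platform_item(request: dict) -> Optional[FilesetPlatformItem]:
        for helper in HELPERS:
            # cheap URL check; no API calls yet
            if not helper.match_request(request, None, None):
                continue
            try:
                # hits the platform API and builds a manifest
                return helper.process_request(request, None, None)
            except (PlatformScopeError, PlatformRestrictedError):
                # matched the platform, but not an archivable dataset URL
                return None
        return None

    item = find_platform_item({"base_url": "https://zenodo.org/record/5230255"})
    if item:
        print(item.platform_name, item.archiveorg_item_name)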
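The Zenodo hunk also shows checksums arriving as prefixed strings (eg "md5:35ffcab905f8224556dba76648cb7dad") that the helper slices apart with hard-coded prefix lengths. A more defensive way to split such strings is str.partition(); the parse_checksum() helper below is an illustrative sketch, not sandcrawler code.

    # Illustrative only; splits "algo:hexdigest" strings like the ones the
    # Zenodo files API returns, without hard-coded prefix lengths.
    from typing import Optional, Tuple

    def parse_checksum(checksum: str) -> Tuple[Optional[str], str]:
        algo, sep, digest = checksum.partition(":")
        if not sep:
            # no "algo:" prefix at all; algorithm unknown
            return (None, checksum)
        return (algo, digest)

    assert parse_checksum("md5:35ffcab905f8224556dba76648cb7dad") == (
        "md5",
        "35ffcab905f8224556dba76648cb7dad",
    )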