Diffstat (limited to 'python/sandcrawler/fileset_platforms.py')
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 832 |
1 files changed, 832 insertions, 0 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
new file mode 100644
index 0000000..5c13318
--- /dev/null
+++ b/python/sandcrawler/fileset_platforms.py
@@ -0,0 +1,832 @@
+import urllib.parse
+from typing import Optional, Tuple
+
+import internetarchive
+
+from sandcrawler.fileset_types import (
+    FilesetManifestFile,
+    FilesetPlatformItem,
+    IngestStrategy,
+    PlatformRestrictedError,
+    PlatformScopeError,
+)
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.misc import requests_retry_session
+
+
+class FilesetPlatformHelper:
+    def __init__(self):
+        self.platform_name = "unknown"
+
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
+        """
+        Does this request look like it matches this platform?
+        """
+        raise NotImplementedError()
+
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
+        """
+        Fetch platform-specific metadata for this request (eg, via API calls)
+        """
+        raise NotImplementedError()
+
+    def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+        assert item.manifest
+        total_size = sum([m.size or 0 for m in item.manifest]) or 0
+        largest_size = max([m.size or 0 for m in item.manifest]) or 0
+        if len(item.manifest) == 1:
+            if total_size < 64 * 1024 * 1024:
+                return IngestStrategy.WebFile
+            else:
+                return IngestStrategy.ArchiveorgFile
+        else:
+            if largest_size < 64 * 1024 * 1024 and total_size < 128 * 1024 * 1024 * 1024:
+                return IngestStrategy.WebFileset
+            else:
+                return IngestStrategy.ArchiveorgFileset
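+
+# For orientation, the default thresholds above map out roughly as follows
+# (illustrative sizes, not taken from real records):
+#   - a single 10 MiB file                            -> IngestStrategy.WebFile
+#   - a single 200 MiB file                           -> IngestStrategy.ArchiveorgFile
+#   - ten 10 MiB files                                -> IngestStrategy.WebFileset
+#   - any member >= 64 MiB, or >= 128 GiB in total    -> IngestStrategy.ArchiveorgFileset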
+
+
+class DataverseHelper(FilesetPlatformHelper):
+    def __init__(self):
+        super().__init__()
+        self.platform_name = "dataverse"
+        self.session = requests_retry_session()
+
+    @staticmethod
+    def parse_dataverse_persistentid(pid: str) -> dict:
+        """
+        Parses a persistentId into 5 sections:
+
+        - type (doi or hdl)
+        - authority (eg, DOI prefix)
+        - shoulder (optional, eg 'DVN')
+        - dataset_id (6-digit)
+        - file_id
+
+        The returned dict always has all components; optional ones are None.
+
+        This is possible because the dataverse software only supports a
+        handful of configurations and persistent identifier types.
+
+        If there is an error parsing, raises a ValueError
+        """
+        id_type = None
+        if pid.startswith("doi:10."):
+            id_type = "doi"
+            pid = pid[4:]
+        elif pid.startswith("hdl:"):
+            id_type = "hdl"
+            pid = pid[4:]
+        else:
+            raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+        comp = pid.split("/")
+        if len(comp) < 2:
+            raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+        authority = comp[0]
+        shoulder = None
+        dataset_id = None
+        file_id = None
+        if len(comp[1]) != 6 and len(comp) == 3:
+            shoulder = comp[1]
+            dataset_id = comp[2]
+        elif len(comp[1]) != 6 and len(comp) == 4:
+            shoulder = comp[1]
+            dataset_id = comp[2]
+            file_id = comp[3]
+        elif len(comp[1]) == 6 and len(comp) == 2:
+            dataset_id = comp[1]
+        elif len(comp[1]) == 6 and len(comp) == 3:
+            dataset_id = comp[1]
+            file_id = comp[2]
+        else:
+            raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+        if len(dataset_id) != 6:
+            raise ValueError(f"expected a 6-digit dataverse dataset id: {dataset_id}")
+        if file_id and len(file_id) != 6:
+            raise ValueError(f"expected a 6-digit dataverse file id: {file_id}")
+
+        return {
+            "type": id_type,
+            "authority": authority,
+            "shoulder": shoulder,
+            "dataset_id": dataset_id,
+            "file_id": file_id,
+        }
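+
+    # Example round-trips (these mirror the test cases at the bottom of this
+    # file):
+    #
+    #   parse_dataverse_persistentid("doi:10.7910/DVN/6HPRIG")
+    #   -> {"type": "doi", "authority": "10.7910", "shoulder": "DVN",
+    #       "dataset_id": "6HPRIG", "file_id": None}
+    #
+    #   parse_dataverse_persistentid("hdl:20.500.12690/RIN/IDDOAH/BTNH25")
+    #   -> {"type": "hdl", "authority": "20.500.12690", "shoulder": "RIN",
+    #       "dataset_id": "IDDOAH", "file_id": "BTNH25"}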
+
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request["base_url"]
+
+        # TODO: could also do HTML platform detection or something?
+
+        components = urllib.parse.urlparse(url)
+        # platform_domain = components.netloc.split(':')[0].lower()
+        params = urllib.parse.parse_qs(components.query)
+        id_param = params.get("persistentId")
+        if not id_param:
+            return False
+        platform_id = id_param[0]
+
+        try:
+            self.parse_dataverse_persistentid(platform_id)
+        except ValueError:
+            return False
+
+        return True
+
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
+        """
+        Fetch platform-specific metadata for this request (eg, via API calls)
+
+        HTTP GET https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/J8SJZB
+        """
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request["base_url"]
+
+        # 1. extract domain, PID, and version from URL
+        components = urllib.parse.urlparse(url)
+        platform_domain = components.netloc.split(":")[0].lower()
+        params = urllib.parse.parse_qs(components.query)
+        id_param = params.get("persistentId")
+        if not (id_param and id_param[0]):
+            raise PlatformScopeError("Expected a Dataverse persistentId in URL")
+        platform_id = id_param[0]
+        version_param = params.get("version")
+        dataset_version = None
+        if version_param:
+            dataset_version = version_param[0]
+
+        try:
+            parsed_id = self.parse_dataverse_persistentid(platform_id)
+        except ValueError:
+            raise PlatformScopeError("not actually in scope")
+
+        if parsed_id["file_id"]:
+            # TODO: maybe we could support this?
+            raise PlatformScopeError(
+                "only entire dataverse datasets can be archived with this tool"
+            )
+
+        # 1b. if we didn't get a version number from URL, fetch it from API
+        if not dataset_version:
+            resp = self.session.get(
+                f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}",
+                timeout=60.0,
+            )
+            resp.raise_for_status()
+            obj = resp.json()
+            if "latestVersion" not in obj["data"]:
+                raise PlatformScopeError("could not find latest version for dataverse record")
+            obj_latest = obj["data"]["latestVersion"]
+            dataset_version = (
+                f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+            )
+
+        # 2. API fetch
+        resp = self.session.get(
+            f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}",
+            timeout=60.0,
+        )
+        resp.raise_for_status()
+        obj = resp.json()
+
+        obj_latest = obj["data"]["latestVersion"]
+        assert (
+            dataset_version
+            == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+        )
+        assert platform_id == obj_latest["datasetPersistentId"]
+
+        manifest = []
+        for row in obj_latest["files"]:
+            df = row["dataFile"]
+            df_persistent_id = df["persistentId"]
+            platform_url = f"https://{platform_domain}/api/access/datafile/:persistentId/?persistentId={df_persistent_id}"
+            if df.get("originalFileName"):
+                platform_url += "&format=original"
+
+            extra = dict()
+            # TODO: always save the version field?
+            if row.get("version") != 1:
+                extra["version"] = row["version"]
+            if "description" in df:
+                extra["description"] = df["description"]
+            manifest.append(
+                FilesetManifestFile(
+                    path=df.get("originalFileName") or df["filename"],
+                    size=df.get("originalFileSize") or df["filesize"],
+                    md5=df["md5"],
+                    # NOTE: don't get: sha1, sha256
+                    mimetype=df["contentType"],
+                    platform_url=platform_url,
+                    extra=extra or None,
+                )
+            )
+
+        platform_sub_id = platform_id.split("/")[-1]
+        archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}"
+        archiveorg_item_meta = dict(
+            # TODO: collection=platform_domain,
+            collection="datasets",
+            date=obj_latest["releaseTime"].split("T")[0],
+            source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
+        )
+        if platform_id.startswith("doi:10."):
+            archiveorg_item_meta["doi"] = platform_id.replace("doi:", "")
+        for block in obj_latest["metadataBlocks"]["citation"]["fields"]:
+            if block["typeName"] == "title":
+                archiveorg_item_meta["title"] = block["value"]
+            elif block["typeName"] == "depositor":
+                archiveorg_item_meta["creator"] = block["value"]
+            elif block["typeName"] == "dsDescription":
+                archiveorg_item_meta["description"] = block["value"][0]["dsDescriptionValue"][
+                    "value"
+                ]
+
+        archiveorg_item_meta["description"] = archiveorg_item_meta.get("description", "")
+        if obj_latest.get("termsOfUse"):
+            archiveorg_item_meta["description"] += "\n<br>\n" + obj_latest["termsOfUse"]
+
+        return FilesetPlatformItem(
+            platform_name=self.platform_name,
+            platform_status="success",
+            manifest=manifest,
+            platform_domain=platform_domain,
+            platform_id=platform_id,
+            archiveorg_item_name=archiveorg_item_name,
+            archiveorg_item_meta=archiveorg_item_meta,
+            web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
+            # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+            extra=dict(version=dataset_version),
+        )
+ "doi:10.5072/FK2/J8SJZB": { + "type": "doi", + "authority": "10.5072", + "shoulder": "FK2", + "dataset_id": "J8SJZB", + "file_id": None, + }, + "doi:10.5072/FK2/J8SJZB/LL6WXZ": { + "type": "doi", + "authority": "10.5072", + "shoulder": "FK2", + "dataset_id": "J8SJZB", + "file_id": "LL6WXZ", + }, + "hdl:20.500.12690/RIN/IDDOAH/BTNH25": { + "type": "hdl", + "authority": "20.500.12690", + "shoulder": "RIN", + "dataset_id": "IDDOAH", + "file_id": "BTNH25", + }, + "doi:10.7910/DVN/6HPRIG": { + "type": "doi", + "authority": "10.7910", + "shoulder": "DVN", + "dataset_id": "6HPRIG", + "file_id": None, + }, + } + + invalid = [ + # "doi:10.5072/FK2/J8SJZB/LL6WXZ", + "doi:10.25625/abcd", + "other:10.25625/LL6WXZ", + "10.25625/LL6WXZ", + "doi:10.5072/FK2/J8SJZB/LL6WXZv123", + ] + + for pid, val in valid.items(): + assert DataverseHelper.parse_dataverse_persistentid(pid) == val + + for pid in invalid: + try: + DataverseHelper.parse_dataverse_persistentid(pid) + assert False, "should not get here" + except ValueError: + pass + + +class FigshareHelper(FilesetPlatformHelper): + def __init__(self): + super().__init__() + self.platform_name = "figshare" + self.session = requests_retry_session() + + @staticmethod + def parse_figshare_url_path(path: str) -> Tuple[str, Optional[str]]: + """ + Tries to parse a figshare URL into ID number and (optional) version number. + + Returns a two-element tuple; version number will be None if not found + + Raises a ValueError if not a figshare URL + """ + # eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1 + # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4 + + comp = path.split("/") + if len(comp) < 4 or comp[1] != "articles": + raise ValueError(f"not a figshare URL: {path}") + + comp = comp[2:] + if comp[0] in [ + "dataset", + # TODO: should the following be considered "out of scope"? + "journal_contribution", + "presentation", + "poster", + "thesis", + ]: + comp = comp[1:] + + if len(comp) == 3 and comp[1].isdigit() and comp[2].isdigit(): + return (comp[1], comp[2]) + elif len(comp) == 2 and comp[1].isdigit(): + return (comp[1], None) + else: + raise ValueError(f"couldn't find figshare identiier: {path}") + + def match_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> bool: + + if resource and resource.terminal_url: + url = resource.terminal_url + else: + url = request["base_url"] + + components = urllib.parse.urlparse(url) + platform_domain = components.netloc.split(":")[0].lower() + + # only work with full, versioned figshare.com URLs + if "figshare.com" not in platform_domain: + return False + + try: + parsed = self.parse_figshare_url_path(components.path) + except ValueError: + return False + + # has file component + if parsed[0] and parsed[1]: + return True + + return False + + def process_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> FilesetPlatformItem: + """ + Fetch platform-specific metadata for this request (eg, via API calls) + """ + + if resource and resource.terminal_url: + url = resource.terminal_url + else: + url = request["base_url"] + + # 1. 
+
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request["base_url"]
+
+        components = urllib.parse.urlparse(url)
+        platform_domain = components.netloc.split(":")[0].lower()
+
+        # only work with full, versioned figshare.com URLs
+        if "figshare.com" not in platform_domain:
+            return False
+
+        try:
+            parsed = self.parse_figshare_url_path(components.path)
+        except ValueError:
+            return False
+
+        # has both an article ID and a version component
+        if parsed[0] and parsed[1]:
+            return True
+
+        return False
+
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
+        """
+        Fetch platform-specific metadata for this request (eg, via API calls)
+        """
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request["base_url"]
+
+        # 1. extract domain, PID, and version from URL
+        components = urllib.parse.urlparse(url)
+        platform_domain = components.netloc.split(":")[0].lower()
+
+        (platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
+        assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+        assert (
+            dataset_version and dataset_version.isdigit()
+        ), f"expected numeric: {dataset_version}"
+
+        # 1b. if we didn't get a version number from URL, fetch it from API
+        # TODO: implement this code path
+
+        # 2. API fetch
+        resp = self.session.get(
+            f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}",
+            timeout=60.0,
+        )
+        resp.raise_for_status()
+        obj = resp.json()
+
+        # figshare_type = obj['defined_type_name']
+
+        if not obj["is_public"]:
+            raise PlatformRestrictedError(f"record not public: {platform_id} {dataset_version}")
+        if obj["is_embargoed"]:
+            raise PlatformRestrictedError(
+                f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})'
+            )
+
+        manifest = []
+        for row in obj["files"]:
+            manifest.append(
+                FilesetManifestFile(
+                    path=row["name"],
+                    size=row["size"],
+                    md5=row["computed_md5"],
+                    # NOTE: don't get: sha1, sha256, mimetype
+                    platform_url=row["download_url"],
+                    # extra=dict(),
+                )
+            )
+            if row.get("is_link_only"):
+                raise PlatformScopeError(
+                    f"figshare.org file is just a link (not a file): {row['name']} at {row['download_url']}"
+                )
+
+        authors = []
+        for author in obj["authors"]:
+            authors.append(author["full_name"])
+        archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
+        archiveorg_item_meta = dict(
+            # TODO: collection=platform_domain,
+            collection="datasets",
+            creator=authors,
+            doi=obj["doi"],
+            title=obj["title"],
+            date=obj["published_date"],
+            source=obj["url_public_html"],
+            description=obj["description"],
+            license=obj["license"]["url"],
+            version=obj["version"],
+        )
+
+        return FilesetPlatformItem(
+            platform_name=self.platform_name,
+            platform_status="success",
+            manifest=manifest,
+            platform_domain=platform_domain,
+            platform_id=platform_id,
+            archiveorg_item_name=archiveorg_item_name,
+            archiveorg_item_meta=archiveorg_item_meta,
+            web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+            # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+            extra=dict(version=dataset_version),
+        )
+
+
+def test_parse_figshare_url_path() -> None:
+
+    valid = {
+        "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": (
+            "8987858",
+            "1",
+        ),
+        "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": (
+            "8987858",
+            None,
+        ),
+        "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
+        "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": (
+            "12127176",
+            "4",
+        ),
+        "/articles/journal_contribution/Improved_Time_Resolved_Measurements_of_Inorganic_Ions_in_Particulate_Matter_by_PILS_IC_Integrated_with_a_Sample_Pre_Concentration_System/1407386/3": (
+            "1407386",
+            "3",
+        ),
+        "/articles/poster/Effect_of_nanoclay_loading_on_the_thermal_decomposition_of_nanoclay_polyurethane_elastomers_obtained_by_bulk_polymerization/1094056/1": (
+            "1094056",
+            "1",
+        ),
+    }
+
+    invalid = [
+        "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species",
+    ]
+
+    for path, val in valid.items():
+        assert FigshareHelper.parse_figshare_url_path(path) == val
+
+    for path in invalid:
+        try:
+            FigshareHelper.parse_figshare_url_path(path)
+            assert False, "should not get here"
+        except ValueError:
+            pass
+
+
+class ZenodoHelper(FilesetPlatformHelper):
+    def __init__(self):
+        super().__init__()
+        self.platform_name = "zenodo"
+        self.session = requests_retry_session()
+
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request["base_url"]
+
+        components = urllib.parse.urlparse(url)
+        platform_domain = components.netloc.split(":")[0].lower()
+        if platform_domain == "zenodo.org" and "/record/" in components.path:
+            return True
+        return False
+
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
+        """
+        Fetch platform-specific metadata for this request (eg, via API calls)
+        """
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request["base_url"]
+
+        # TODO: also look in base_url and resource-non-terminal for ident? to
+        # check for work-level redirects
+
+        # 1. extract identifier from URL
+        # eg: https://zenodo.org/record/5230255
+        components = urllib.parse.urlparse(url)
+        platform_domain = components.netloc.split(":")[0].lower()
+        if len(components.path.split("/")) < 3:
+            raise PlatformScopeError("Expected a complete, versioned zenodo URL")
+
+        platform_id = components.path.split("/")[2]
+        assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+
+        if "zenodo.org" not in platform_domain:
+            raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
+
+        # 2. API fetch
+        resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}", timeout=60.0)
+        if resp.status_code == 410:
+            raise PlatformRestrictedError("record deleted")
+        resp.raise_for_status()
+        obj = resp.json()
+
+        assert obj["id"] == int(platform_id)
+        work_id = obj["conceptrecid"]
+        if work_id == obj["id"]:
+            raise PlatformScopeError(
+                f"got a work-level zenodo record, not a versioned record: {work_id}"
+            )
+
+        # zenodo_type = obj['metadata']['resource_type']['type']
+
+        if obj["metadata"]["access_right"] != "open":
+            raise PlatformRestrictedError(
+                f"not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}"
+            )
+
+        manifest = []
+        for row in obj["files"]:
+            mf = FilesetManifestFile(
+                path=row["key"],
+                size=row["size"],
+                platform_url=row["links"]["self"],
+                # extra=dict(),
+            )
+            checksum = row["checksum"]
+            # eg: md5:35ffcab905f8224556dba76648cb7dad
+            if checksum.startswith("md5:"):
+                mf.md5 = checksum[4:]
+            elif checksum.startswith("sha1:"):
+                mf.sha1 = checksum[5:]
+            manifest.append(mf)
+
+        authors = []
+        for author in obj["metadata"]["creators"]:
+            authors.append(author["name"])
+        archiveorg_item_name = f"{platform_domain}-{platform_id}"
+        archiveorg_item_meta = dict(
+            # TODO: collection=platform_domain,
+            collection="datasets",
+            creator=authors,
+            doi=obj["doi"],
+            title=obj["metadata"]["title"],
+            date=obj["metadata"]["publication_date"],
+            source=obj["links"]["html"],
+            description=obj["metadata"]["description"],
+            license=obj["metadata"]["license"]["id"],
+            version=obj["revision"],
+            # obj['metadata']['version'] is, eg, git version tag
+        )
+
+        return FilesetPlatformItem(
+            platform_name=self.platform_name,
+            platform_status="success",
+            manifest=manifest,
+            platform_domain=platform_domain,
+            platform_id=platform_id,
+            archiveorg_item_name=archiveorg_item_name,
+            archiveorg_item_meta=archiveorg_item_meta,
+            # web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+            # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+            extra=dict(version=obj["revision"]),
+        )
+
+
+class ArchiveOrgHelper(FilesetPlatformHelper):
+
+    FORMAT_TO_MIMETYPE = {
+        "BZIP": "application/x-bzip",
+        "BZIP2": "application/x-bzip2",
+        "ZIP": "application/zip",
+        "GZIP": "application/gzip",
+        "RAR": "application/vnd.rar",
+        "TAR": "application/x-tar",
+        "7z": "application/x-7z-compressed",
+        "HTML": "text/html",
+        "Text": "text/plain",
+        "PDF": "application/pdf",
+        "CSV": "text/csv",
+        "XML": "application/xml",
+        "JSON": "application/json",
+        #'application/msword (.doc)', # .doc
+        #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+        #'application/vnd.ms-excel', # .xls
+        #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+        "MP3": "audio/mpeg",  # .mp3
+        "MP4": "video/mp4",  # .mp4
+        "MPEG": "video/mpeg",  # .mpeg
+        "JPEG": "image/jpeg",
+        "GIF": "image/gif",
+        "PNG": "image/png",
+        "TIFF": "image/tiff",
+        "Unknown": None,
+    }
+
+    def __init__(self):
+        super().__init__()
+        self.platform_name = "archiveorg"
+        self.session = internetarchive.get_session()
+
+    @staticmethod
+    def want_item_file(f: internetarchive.File, item_name: str) -> bool:
+        """
+        Filters IA API files
+        """
+        if f.source != "original":
+            return False
+        for suffix in [
+            "_meta.sqlite",
+            "_archive.torrent",
+            "_itemimage.jpg",
+            "_meta.xml",
+            "_thumb.png",
+            "_files.xml",
+        ]:
+            if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+                return False
+        if f.name.startswith("_"):
+            return False
+        if item_name.startswith("academictorrents_"):
+            for suffix in [
+                "_academictorrents.torrent",
+                "_academictorrents_torrent.txt",
+                ".bib",
+            ]:
+                if f.name == item_name + suffix:
+                    return False
+        return True
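+
+    # For example, with the hypothetical item_name "some-dataset": derivative
+    # files (source != "original") and metadata sidecars like
+    # "some-dataset_meta.xml" or "some-dataset_files.xml" are skipped, while
+    # an uploaded data file such as "data.csv" passes the filter.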
+
+    def match_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> bool:
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request["base_url"]
+        patterns = [
+            "://archive.org/details/",
+            "://archive.org/download/",
+        ]
+        for p in patterns:
+            if p in url:
+                return True
+        return False
+
+    def process_request(
+        self,
+        request: dict,
+        resource: Optional[ResourceResult],
+        html_biblio: Optional[BiblioMetadata],
+    ) -> FilesetPlatformItem:
+        """
+        Fetch platform-specific metadata for this request (eg, via API calls)
+        """
+
+        base_url_split = request["base_url"].split("/")
+        # print(base_url_split, file=sys.stderr)
+        assert len(base_url_split) in [5, 6]
+        assert base_url_split[0] in ["http:", "https:"]
+        assert base_url_split[2] == "archive.org"
+        assert base_url_split[3] in ["details", "download"]
+        item_name = base_url_split[4]
+        if len(base_url_split) == 6 and base_url_split[5]:
+            raise PlatformScopeError(
+                "got an archive.org file path, not download/details page; individual files not handled yet"
+            )
+
+        # print(f"  archiveorg processing item={item_name}", file=sys.stderr)
+        item = self.session.get_item(item_name)
+        item_name = item.identifier
+        item_collection = item.metadata["collection"]
+        if isinstance(item_collection, list):
+            item_collection = item_collection[0]
+        assert item.metadata["mediatype"] not in ["collection", "web"]
+        item_files = item.get_files(on_the_fly=False)
+        item_files = [f for f in item_files if self.want_item_file(f, item_name)]
+        manifest = []
+        for f in item_files:
+            assert f.name and f.sha1 and f.md5
+            mf = FilesetManifestFile(
+                path=f.name,
+                size=int(f.size),
+                sha1=f.sha1,
+                md5=f.md5,
+                mimetype=self.FORMAT_TO_MIMETYPE[f.format],
+                platform_url=f"https://archive.org/download/{item_name}/{f.name}",
+            )
+            manifest.append(mf)
+
+        return FilesetPlatformItem(
+            platform_name=self.platform_name,
+            platform_status="success",
+            manifest=manifest,
+            platform_domain="archive.org",
+            platform_id=item_name,
+            archiveorg_item_name=item_name,
+            archiveorg_item_meta=dict(collection=item_collection),
+        )
+
+    def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+        """
+        Don't use default strategy picker; we are always doing an 'existing' in this case.
+        """
+        assert item.manifest is not None
+        if len(item.manifest) == 1:
+            # NOTE: code flow does not support ArchiveorgFilesetBundle for the
+            # case of, eg, a single zipfile in an archive.org item
+            return IngestStrategy.ArchiveorgFile
+        elif len(item.manifest) > 1:
+            return IngestStrategy.ArchiveorgFileset
+        else:
+            raise NotImplementedError("empty dataset")