#!/usr/bin/env python3

import hashlib
import json
import mimetypes
import os
import subprocess
import sys
import urllib.parse
from typing import Any, Dict, List, Optional, Tuple

import fatcat_openapi_client
import magic
from fatcat_openapi_client import (
    ApiClient,
    Editgroup,
    FilesetEntity,
    FilesetFile,
    ReleaseAbstract,
    ReleaseContrib,
    ReleaseEntity,
    ReleaseExtIds,
)

from .common import clean
from .crossref import lookup_license_slug


def single_file(prefix: str, path: str) -> FilesetFile:

    # os.path.join (rather than string concatenation) handles roots from
    # os.walk that lack a trailing separator
    full = os.path.join(prefix, path)
    size_bytes = os.stat(full).st_size
    hashes = [
        hashlib.md5(),
        hashlib.sha1(),
        hashlib.sha256(),
    ]
    with open(full, "rb") as fp:
        while True:
            data = fp.read(2 ** 20)
            if not data:
                break
            for h in hashes:
                h.update(data)
    mime = magic.Magic(mime=True).from_file(full)
    if mime == "application/octet-stream":
        # magic apparently isn't that great; try using filename as well
        guess = mimetypes.guess_type(full)[0]
        if guess:
            mime = guess
    fsf = FilesetFile(
        path=path,
        size=size_bytes,
        md5=hashes[0].hexdigest(),
        sha1=hashes[1].hexdigest(),
        sha256=hashes[2].hexdigest(),
        extra=dict(mimetype=mime),
    )
    return fsf


def make_manifest(base_dir: str) -> List[FilesetFile]:
    # note: stores only the file basename as the manifest path, so this
    # assumes a flat base_dir without meaningful subdirectory structure
    manifest = []
    for root, dirs, files in os.walk(base_dir):
        for f in files:
            manifest.append(single_file(root, f))
    return manifest


def cdl_dash_release(
    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
) -> ReleaseEntity:

    if not extra:
        extra = dict()

    assert meta["identifier"]["type"] == "DOI"
    doi = meta["identifier"]["value"].lower()
    assert doi.startswith("10.")

    ark_id = None
    for extid in meta.get("alternativeIdentifiers", []):
        if extid["value"].startswith("ark:"):
            ark_id = extid["value"]
    assert ark_id

    license_slug = lookup_license_slug(meta["rights"]["uri"])

    abstracts = []
    for desc in meta["descriptions"]:
        if desc["type"] == "abstract":
            abstracts.append(
                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
            )
            # print(abstracts)

    contribs = []
    for creator in meta["creator"]:
        contribs.append(
            ReleaseContrib(
                given_name=creator["given"],
                surname=creator["family"],
                # sorry everybody
                raw_name="{} {}".format(creator["given"], creator["family"]),
                raw_affiliation=creator.get("affiliation"),
                role="author",  # presumably, for these datasets?
            )
        )

    r = ReleaseEntity(
        ext_ids=ReleaseExtIds(
            doi=doi,
            ark=ark_id,
        ),
        title=clean(meta["title"], force_xml=True),
        publisher=clean(meta["publisher"]),
        release_year=int(meta["publicationYear"]),
        release_type="dataset",
        license_slug=license_slug,
        contribs=contribs,
        abstracts=abstracts or None,
        extra=extra,
    )
    return r


def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:

    if dat_path.endswith("/"):
        dat_path = dat_path[:-1]
    dat_discovery = dat_path
    extra = dict()
    assert len(dat_discovery) == 64

    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
        meta_dict = json.loads(fp.read())

    release = cdl_dash_release(meta_dict)
    # the ARK identifier is stored in ext_ids, not in release.extra
    ark_id = release.ext_ids.ark

    dash_version = None
    # really crude XML parse-out
    with open(dat_path + "/stash-wrapper.xml", "r") as fp:
        for line in fp:
            line = line.strip()
            if line.startswith("<st:version_number>"):
                # len("<st:version_number>") == 19; grab digits up to the
                # closing tag
                dash_version = int(line[19:].split("<")[0])
    assert dash_version is not None
    extra["cdl_dash"] = dict(version=dash_version)
    release.extra["cdl_dash"] = dict(version=dash_version)

    manifest = make_manifest(dat_path + "/files/")

    bundle_url = dict(
        url="https://merritt.cdlib.org/u/{}/{}".format(
            urllib.parse.quote(ark_id, safe=""), dash_version
        ),
        rel="repo-bundle",
    )
    repo_url = dict(
        url="https://merritt.cdlib.org/d/{}/{}/".format(
            urllib.parse.quote(ark_id, safe=""), dash_version
        ),
        rel="repo",
    )
    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
    fs = FilesetEntity(
        urls=[bundle_url, repo_url, dat_url],
        release_ids=None,
        manifest=manifest,
        extra=extra,
    )
    return (release, fs)


def auto_cdl_dash_dat(
    api: ApiClient,
    dat_path: str,
    release_id: Optional[str] = None,
    editgroup_id: Optional[str] = None,
) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:

    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")

    (release, fileset) = make_release_fileset(dat_path)

    if not editgroup_id:
        eg = api.create_editgroup(
            Editgroup(
                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
            )
        )
        editgroup_id = eg.editgroup_id

    if not release_id and release.ext_ids.doi:
        # check for an existing release with this DOI before creating one
        try:
            r = api.lookup_release(doi=release.ext_ids.doi)
            release_id = r.ident
        except fatcat_openapi_client.rest.ApiException:
            pass
    if not release_id:
        # use editgroup_id, not eg.editgroup_id: eg is unbound when the
        # caller supplied an editgroup_id
        edit = api.create_release(editgroup_id, release)
        release_id = edit.ident

    release = api.get_release(release_id, expand="filesets")
    if len(release.filesets):
        print("A fileset already exists for release {}".format(release.ident))
        return (None, None, None)

    fileset.release_ids = [release.ident]
    edit = api.create_fileset(editgroup_id, fileset)
    fileset = api.get_fileset(edit.ident)
    return (editgroup_id, release, fileset)


if __name__ == "__main__":
    # pass this a discovery key that has been cloned to the local directory
    print(make_release_fileset(sys.argv[1]))
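
# A minimal usage sketch (not part of the module's behavior): how
# auto_cdl_dash_dat might be driven programmatically, assuming a generated
# fatcat_openapi_client configured against the public API host. The host URL,
# token handling, and the "<64-char-discovery-key>" placeholder are
# illustrative assumptions; creating editgroups requires an authenticated
# client in practice.
#
#     import fatcat_openapi_client
#
#     conf = fatcat_openapi_client.Configuration()
#     conf.host = "https://api.fatcat.wiki/v0"
#     api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
#
#     editgroup_id, release, fileset = auto_cdl_dash_dat(
#         api, "<64-char-discovery-key>"
#     )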