summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/cdl_dash_dat.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/cdl_dash_dat.py')
-rwxr-xr-x python/fatcat_tools/importers/cdl_dash_dat.py 219
1 files changed, 0 insertions, 219 deletions
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index 1a4114a0..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
- ApiClient,
- Editgroup,
- FilesetEntity,
- FilesetFile,
- ReleaseAbstract,
- ReleaseContrib,
- ReleaseEntity,
- ReleaseExtIds,
-)
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
def single_file(prefix: str, path: str) -> FilesetFile:
    """Build a ``FilesetFile`` manifest entry for one file on local disk.

    Args:
        prefix: Directory the file lives under. It is joined with ``path`` to
            locate the file; a trailing separator is optional.
        path: Location of the file relative to ``prefix``. Stored unchanged
            as the ``path`` of the returned entry.

    Returns:
        A ``FilesetFile`` with the size in bytes, hex MD5/SHA-1/SHA-256
        digests, and the detected mimetype under ``extra["mimetype"]``.

    Raises:
        OSError: If the file cannot be stat'ed or opened for reading.
    """
    # Join instead of concatenating: os.walk() reports sub-directory roots
    # without a trailing separator, and "dir" + "name" would then point at a
    # file that does not exist.
    full = os.path.join(prefix, path)
    size_bytes = os.stat(full).st_size

    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    sha256 = hashlib.sha256()
    with open(full, "rb") as fp:
        # Feed all three digests from a single pass over the file, 1 MiB at
        # a time, so large files are never held in memory at once.
        for chunk in iter(lambda: fp.read(2 ** 20), b""):
            md5.update(chunk)
            sha1.update(chunk)
            sha256.update(chunk)

    mime = magic.Magic(mime=True).from_file(full)
    if mime == "application/octet-stream":
        # magic only produced the generic type; try guessing from the
        # filename and keep the generic type if that yields nothing either.
        mime = mimetypes.guess_type(full)[0] or mime

    return FilesetFile(
        path=path,
        size=size_bytes,
        md5=md5.hexdigest(),
        sha1=sha1.hexdigest(),
        sha256=sha256.hexdigest(),
        extra=dict(mimetype=mime),
    )
-
-
def make_manifest(base_dir: str) -> List[FilesetFile]:
    """Describe every file found under ``base_dir``, recursing into sub-directories.

    Args:
        base_dir: Root directory of the file set.

    Returns:
        One ``FilesetFile`` per file reported by ``os.walk``, in sorted
        traversal order. Each entry's ``path`` is relative to ``base_dir``,
        so files in sub-directories keep their directory component.
    """
    # single_file() takes the prefix and the relative path separately; make
    # sure the prefix ends with a separator so the two always combine into a
    # valid path. os.path.join(x, "") appends one only when it is missing.
    prefix = os.path.join(base_dir, "")

    manifest = []
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place steers os.walk's descent order, which together
        # with the sorted file names makes the manifest reproducible.
        dirs.sort()
        for name in sorted(files):
            rel_path = os.path.relpath(os.path.join(root, name), base_dir)
            manifest.append(single_file(prefix, rel_path))
    return manifest
-
-
def cdl_dash_release(
    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
) -> ReleaseEntity:
    """Translate a CDL/DASH metadata record into a dataset ``ReleaseEntity``.

    Args:
        meta: Parsed metadata dict. The keys read here are ``identifier``,
            ``alternativeIdentifiers`` (optional), ``rights``,
            ``descriptions``, ``creator``, ``title``, ``publisher`` and
            ``publicationYear``.
        extra: Optional dict attached to the release as ``extra``. A fresh
            empty dict is used when this is ``None`` or empty.

    Returns:
        A ``ReleaseEntity`` with ``release_type="dataset"``, the lower-cased
        DOI and the ARK in ``ext_ids``, and one ``author`` contrib per
        creator.

    Raises:
        AssertionError: If the identifier is not a DOI starting with ``10.``,
            or if no ``ark:`` alternative identifier is present.
        KeyError: If a required key is missing from ``meta``.
    """
    extra = extra or dict()

    identifier = meta["identifier"]
    assert identifier["type"] == "DOI"
    doi = identifier["value"].lower()
    assert doi.startswith("10.")

    # When several alternative identifiers are ARKs, the last one wins.
    ark_values = [
        alt["value"]
        for alt in meta.get("alternativeIdentifiers", [])
        if alt["value"].startswith("ark:")
    ]
    ark_id = ark_values[-1] if ark_values else None
    assert ark_id

    license_slug = lookup_license_slug(meta["rights"]["uri"])

    # Only descriptions explicitly typed as "abstract" are carried over.
    abstracts = [
        ReleaseAbstract(mimetype="text/html", content=clean(description["value"]))
        for description in meta["descriptions"]
        if description["type"] == "abstract"
    ]

    contribs = [
        ReleaseContrib(
            given_name=person["given"],
            surname=person["family"],
            # raw_name is always rendered as "given family"
            raw_name="{} {}".format(person["given"], person["family"]),
            raw_affiliation=person.get("affiliation"),
            role="author",  # presumably, for these datasets?
        )
        for person in meta["creator"]
    ]

    ext_ids = ReleaseExtIds(
        doi=doi,
        ark=ark_id,
    )
    return ReleaseEntity(
        ext_ids=ext_ids,
        title=clean(meta["title"], force_xml=True),
        publisher=clean(meta["publisher"]),
        release_year=int(meta["publicationYear"]),
        release_type="dataset",
        license_slug=license_slug,
        contribs=contribs,
        # an empty abstract list is sent as None rather than []
        abstracts=abstracts or None,
        extra=extra,
    )
-
-
def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
    """Build the release and fileset entities for a locally cloned dat archive.

    The directory is expected to contain ``cdl_dash_metadata.json``,
    ``stash-wrapper.xml`` and a ``files/`` sub-directory.

    Args:
        dat_path: Path to the cloned archive. The final path component must
            be the 64-character dat discovery key; one trailing ``/`` is
            tolerated.

    Returns:
        A ``(release, fileset)`` tuple. Neither entity has been submitted to
        the API, and ``fileset.release_ids`` is left as ``None``.

    Raises:
        AssertionError: If the directory name is not 64 characters long, or
            if no ``<st:version_number>`` line is found in the wrapper XML.
        OSError: If one of the expected files cannot be read.
    """
    if dat_path.endswith("/"):
        dat_path = dat_path[:-1]
    # Use only the directory name as the discovery key, so the archive may
    # live anywhere on disk rather than only in the current directory.
    dat_discovery = os.path.basename(dat_path)
    assert len(dat_discovery) == 64

    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
        meta_dict = json.load(fp)

    release = cdl_dash_release(meta_dict)
    # cdl_dash_release() records the ARK under ext_ids; it never writes an
    # "ark_id" key into release.extra, so reading it from there raised
    # KeyError.
    ark_id = release.ext_ids.ark

    version_tag = "<st:version_number>"
    dash_version = None
    # really crude XML parse-out: take the integer that follows the opening
    # tag on its line; if several lines match, the last one wins
    with open(dat_path + "/stash-wrapper.xml", "r") as fp:
        for line in fp:
            line = line.strip()
            if line.startswith(version_tag):
                dash_version = int(line[len(version_tag):].split("<")[0])
    assert dash_version is not None

    # The release and the fileset each get their own copy of the version
    # info so that later mutation of one does not leak into the other.
    extra = dict(cdl_dash=dict(version=dash_version))
    release.extra["cdl_dash"] = dict(version=dash_version)

    manifest = make_manifest(dat_path + "/files/")

    # The ARK contains "/" and ":", so it is fully percent-encoded before
    # being embedded as a single URL path segment.
    quoted_ark = urllib.parse.quote(ark_id, safe="")
    bundle_url = dict(
        url="https://merritt.cdlib.org/u/{}/{}".format(quoted_ark, dash_version),
        rel="repo-bundle",
    )
    repo_url = dict(
        url="https://merritt.cdlib.org/d/{}/{}/".format(quoted_ark, dash_version),
        rel="repo",
    )
    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
    fs = FilesetEntity(
        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
    )
    return (release, fs)
-
-
def auto_cdl_dash_dat(
    api: ApiClient,
    dat_path: str,
    release_id: Optional[str] = None,
    editgroup_id: Optional[str] = None,
) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
    """Import one cloned CDL/DASH dat archive as a release plus fileset.

    Args:
        api: Client used for all catalog lookups and edits.
        dat_path: Path to the cloned archive, as accepted by
            ``make_release_fileset``.
        release_id: Existing release to attach the fileset to. When omitted,
            the release is looked up by DOI and created if the lookup
            returns 404.
        editgroup_id: Existing editgroup to add edits to. When omitted, a new
            editgroup is created.

    Returns:
        ``(editgroup_id, release, fileset)`` on success, or
        ``(None, None, None)`` when the release already has a fileset.

    Raises:
        subprocess.CalledProcessError: If ``git describe`` fails.
        fatcat_openapi_client.rest.ApiException: For any API error other than
            a 404 on the DOI lookup.
    """
    # Recorded in the editgroup so the edit can be traced to a code revision.
    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")

    (release, fileset) = make_release_fileset(dat_path)

    if not editgroup_id:
        eg = api.create_editgroup(
            Editgroup(
                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
            )
        )
        editgroup_id = eg.editgroup_id

    if not release_id and release.ext_ids.doi:
        try:
            existing = api.lookup_release(doi=release.ext_ids.doi)
        except fatcat_openapi_client.rest.ApiException as err:
            # 404 means the DOI is not in the catalog yet, so a release is
            # created below. Any other status is a real failure; continuing
            # could create a duplicate release.
            if err.status != 404:
                raise
        else:
            release_id = existing.ident
    if not release_id:
        # Use editgroup_id rather than eg: eg is only bound when the
        # editgroup was created above, not when the caller supplied one.
        edit = api.create_release(editgroup_id, release)
        release_id = edit.ident

    # Re-fetch the release with its filesets expanded, so an existing
    # fileset is detected and not duplicated.
    release = api.get_release(release_id, expand="filesets")
    if release.filesets:
        print("A fileset already exists for release {}".format(release.ident))
        return (None, None, None)

    fileset.release_ids = [release.ident]
    edit = api.create_fileset(editgroup_id, fileset)
    fileset = api.get_fileset(edit.ident)
    return (editgroup_id, release, fileset)
-
-
if __name__ == "__main__":
    # Script entry point: the single argument is a dat discovery key whose
    # archive has already been cloned into the local directory. The
    # resulting (release, fileset) tuple is printed; nothing is submitted.
    release_and_fileset = make_release_fileset(sys.argv[1])
    print(release_and_fileset)