author     Bryan Newbold <bnewbold@robocracy.org>  2021-11-10 13:08:23 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>  2021-11-10 13:08:25 -0800
commit     ab4e1355bf93e3755985f1b5cd2589a78601d253 (patch)
tree       f50ee1492587fead94410e229963b18f88f203a9  /python/fatcat_tools/importers/cdl_dash_dat.py
parent     c133f3077aa975aa4706a8e5ca894fc1b71fbc67 (diff)
remove cdl_dash_dat and wayback_static importers
Cleaning out dead code. These importers were used to create demonstration fileset and webcapture entities early in development. They have been replaced by the fileset and webcapture ingest importers.
Diffstat (limited to 'python/fatcat_tools/importers/cdl_dash_dat.py')
-rwxr-xr-x  python/fatcat_tools/importers/cdl_dash_dat.py  221
1 file changed, 0 insertions(+), 221 deletions(-)
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index ec557e15..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    FilesetEntity,
-    FilesetFile,
-    ReleaseAbstract,
-    ReleaseContrib,
-    ReleaseEntity,
-    ReleaseExtIds,
-)
-
-from fatcat_tools.normal import clean_doi
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
-    full = prefix + path
-    size_bytes = os.stat(full).st_size
-
-    hashes = [
-        hashlib.md5(),
-        hashlib.sha1(),
-        hashlib.sha256(),
-    ]
-    with open(full, "rb") as fp:
-        while True:
-            data = fp.read(2 ** 20)
-            if not data:
-                break
-            for h in hashes:
-                h.update(data)
-    mime = magic.Magic(mime=True).from_file(full)
-    if mime == "application/octet-stream":
-        # magic apparently isn't that great; try using filename as well
-        guess = mimetypes.guess_type(full)[0]
-        if guess:
-            mime = guess
-
-    fsf = FilesetFile(
-        path=path,
-        size=size_bytes,
-        md5=hashes[0].hexdigest(),
-        sha1=hashes[1].hexdigest(),
-        sha256=hashes[2].hexdigest(),
-        extra=dict(mimetype=mime),
-    )
-    return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
-    manifest = []
-    for root, dirs, files in os.walk(base_dir):
-        for f in files:
-            manifest.append(single_file(root, f))
-    return manifest
-
-
-def cdl_dash_release(
-    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
-    if not extra:
-        extra = dict()
-
-    assert meta["identifier"]["type"] == "DOI"
-    doi = clean_doi(meta["identifier"]["value"].lower())
-    assert doi and doi.startswith("10.")
-
-    ark_id = None
-    for extid in meta.get("alternativeIdentifiers", []):
-        if extid["value"].startswith("ark:"):
-            ark_id = extid["value"]
-    assert ark_id
-
-    license_slug = lookup_license_slug(meta["rights"]["uri"])
-
-    abstracts = []
-    for desc in meta["descriptions"]:
-        if desc["type"] == "abstract":
-            abstracts.append(
-                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
-            )
-            # print(abstracts)
-
-    contribs = []
-    for creator in meta["creator"]:
-        contribs.append(
-            ReleaseContrib(
-                given_name=creator["given"],
-                surname=creator["family"],
-                # sorry everybody
-                raw_name="{} {}".format(creator["given"], creator["family"]),
-                raw_affiliation=creator.get("affiliation"),
-                role="author",  # presumably, for these datasets?
-            )
-        )
-
-    r = ReleaseEntity(
-        ext_ids=ReleaseExtIds(
-            doi=doi,
-            ark=ark_id,
-        ),
-        title=clean(meta["title"], force_xml=True),
-        publisher=clean(meta["publisher"]),
-        release_year=int(meta["publicationYear"]),
-        release_type="dataset",
-        license_slug=license_slug,
-        contribs=contribs,
-        abstracts=abstracts or None,
-        extra=extra,
-    )
-    return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
-    if dat_path.endswith("/"):
-        dat_path = dat_path[:-1]
-    dat_discovery = dat_path
-    extra = dict()
-    assert len(dat_discovery) == 64
-
-    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
-        meta_dict = json.loads(fp.read())
-
-    release = cdl_dash_release(meta_dict)
-    ark_id = release.ext_ids.ark
-
-    dash_version = None
-    # really crude XML parse-out
-    with open(dat_path + "/stash-wrapper.xml", "r") as fp:
-        for line in fp:
-            line = line.strip()
-            if line.startswith("<st:version_number>"):
-                dash_version = int(line[19:].split("<")[0])
-    assert dash_version is not None
-    extra["cdl_dash"] = dict(version=dash_version)
-    release.extra["cdl_dash"] = dict(version=dash_version)
-
-    manifest = make_manifest(dat_path + "/files/")
-
-    bundle_url = dict(
-        url="https://merritt.cdlib.org/u/{}/{}".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo-bundle",
-    )
-    repo_url = dict(
-        url="https://merritt.cdlib.org/d/{}/{}/".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo",
-    )
-    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
-    fs = FilesetEntity(
-        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
-    )
-    return (release, fs)
-
-
-def auto_cdl_dash_dat(
-    api: ApiClient,
-    dat_path: str,
-    release_id: Optional[str] = None,
-    editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    (release, fileset) = make_release_fileset(dat_path)
-
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-
-    if not release_id and release.ext_ids.doi:
-        try:
-            r = api.lookup_release(doi=release.ext_ids.doi)
-            release_id = r.ident
-        except fatcat_openapi_client.rest.ApiException:
-            pass
-    if not release_id:
-        edit = api.create_release(editgroup_id, release)
-        release_id = edit.ident
-
-    release = api.get_release(release_id, expand="filesets")
-    if len(release.filesets):
-        print("A fileset already exists for release {}".format(release.ident))
-        return (None, None, None)
-
-    fileset.release_ids = [release.ident]
-    edit = api.create_fileset(editgroup_id, fileset)
-    fileset = api.get_fileset(edit.ident)
-    return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
-    # pass this a discovery key that has been cloned to the local directory
-    print(make_release_fileset(sys.argv[1]))