From 1f6ef062c2aa8cd2b666a14d1906ccc26369b8ed Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 19 Mar 2019 23:00:50 -0700
Subject: importer for CDL/DASH dat pilot dweb datasets

---
 python/fatcat_import.py                       |  31 +++-
 python/fatcat_tools/importers/__init__.py     |   1 +
 python/fatcat_tools/importers/cdl_dash_dat.py | 242 ++++++++++++++++++++++++++
 3 files changed, 273 insertions(+), 1 deletion(-)
 create mode 100755 python/fatcat_tools/importers/cdl_dash_dat.py

diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index ce5063de..aea8c757 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -65,9 +65,23 @@ def run_wayback_static(args):
         return
     print("release_id: {}".format(release_id))
     print("editgroup_id: {}".format(editgroup_id))
-    print("edit id: {}".format(wc.ident))
+    print("webcapture id: {}".format(wc.ident))
     print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
 
+def run_cdl_dash_dat(args):
+    """Import one CDL/DASH dat dataset as a fileset and print the result."""
+    api = args.api
+
+    # create it
+    (editgroup_id, release, fs) = auto_cdl_dash_dat(api, args.dat_path,
+        release_id=args.release_id, editgroup_id=args.editgroup_id)
+    if not fs:
+        return
+    print("release_id: {}".format(release.ident))
+    print("editgroup_id: {}".format(editgroup_id))
+    print("fileset id: {}".format(fs.ident))
+    print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--debug',
@@ -174,6 +188,21 @@ def main():
         type=str,
         help="use existing editgroup (instead of creating a new one)")
 
+    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat')
+    sub_cdl_dash_dat.set_defaults(
+        func=run_cdl_dash_dat,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+    sub_cdl_dash_dat.add_argument('dat_path',
+        type=str,
+        help="local path dat to import (must be the dat discovery key)")
+    sub_cdl_dash_dat.add_argument('--release-id',
+        type=str,
+        help="release entity identifier")
+    sub_cdl_dash_dat.add_argument('--editgroup-id',
+        type=str,
+        help="use existing editgroup (instead of creating a new one)")
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index fe3db59d..2112785b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -19,5 +19,6 @@ from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .wayback_static import auto_wayback_static
+from .cdl_dash_dat import auto_cdl_dash_dat
 #from .kafka_source import KafkaSource
 #from .file_source import FileSource
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
new file mode 100755
index 00000000..3da51cce
--- /dev/null
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""
+One-off importer for CDL/DASH datasets from the Dat dweb pilot project.
+
+Builds release and fileset entities from a locally cloned dat directory.
+"""
+
+import os
+import sys
+import json
+import magic
+import urllib.parse
+import hashlib
+import mimetypes
+import subprocess
+
+import fatcat_client
+from fatcat_client import *
+from .common import clean
+
+# TODO:
+LICENSE_MAP = {
+    "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
+}
+
+def single_file(prefix, path):
+    """
+    Hashes a single file and returns its fileset manifest entry.
+
+    The file is read from `prefix + path` (so prefix needs its trailing
+    slash); only `path` is recorded in the manifest.
+    """
+
+    full = prefix + path
+    size_bytes = os.stat(full).st_size
+
+    hashes = [
+        hashlib.md5(),
+        hashlib.sha1(),
+        hashlib.sha256(),
+    ]
+    with open(full, 'rb') as fp:
+        while True:
+            data = fp.read(2**20)
+            if not data:
+                break
+            for h in hashes:
+                h.update(data)
+    mime = magic.Magic(mime=True).from_file(full)
+    if mime == 'application/octet-stream':
+        # magic apparently isn't that great; try using filename as well
+        guess = mimetypes.guess_type(full)[0]
+        if guess:
+            mime = guess
+
+    fsm = FilesetEntityManifest(
+        path=path,
+        size=size_bytes,
+        md5=hashes[0].hexdigest(),
+        sha1=hashes[1].hexdigest(),
+        sha256=hashes[2].hexdigest(),
+        extra=dict(mimetype=mime))
+    return fsm
+
+def make_manifest(base_dir):
+    """
+    Walks base_dir recursively and returns a list of manifest entries, with
+    file paths recorded relative to base_dir.
+    """
+    # single_file() concatenates prefix and path, so force a trailing slash
+    base_dir = os.path.join(base_dir, '')
+    manifest = []
+    for root, dirs, files in os.walk(base_dir):
+        for f in files:
+            # os.walk() roots have no trailing slash; keep any sub-directory
+            # as part of the manifest path
+            rel_path = os.path.relpath(os.path.join(root, f), base_dir)
+            manifest.append(single_file(base_dir, rel_path))
+    return manifest
+
+
+def cdl_dash_release(meta, extra=None):
+    """
+    Builds an (un-inserted) ReleaseEntity from a parsed
+    cdl_dash_metadata.json dict.
+
+    The dataset ARK identifier is stored under extra['ark_id'].
+    """
+
+    if not extra:
+        extra = dict()
+
+    assert meta['identifier']['type'] == 'DOI'
+    doi = meta['identifier']['value'].lower()
+    assert doi.startswith('10.')
+
+    ark_id = None
+    for extid in meta.get('alternativeIdentifiers', []):
+        if extid['value'].startswith('ark:'):
+            ark_id = extid['value']
+    assert ark_id
+    extra['ark_id'] = ark_id
+
+    # license URLs missing from LICENSE_MAP result in no slug (None)
+    license_slug = LICENSE_MAP.get(meta['rights']['uri'])
+
+    abstracts = []
+    for desc in meta['descriptions']:
+        if desc['type'] == "abstract":
+            abstracts.append(ReleaseEntityAbstracts(
+                mimetype="text/html",
+                content=clean(desc['value'])))
+    if not abstracts:
+        abstracts = None
+
+    contribs = []
+    for creator in meta['creator']:
+        contribs.append(ReleaseContrib(
+            # TODO: given_name=creator['given'],
+            # TODO: surname=creator['family'],
+            # sorry everybody
+            raw_name="{} {}".format(creator['given'], creator['family']),
+            raw_affiliation=creator.get('affiliation'),
+            role="author", # presumably, for these datasets?
+        ))
+
+    r = ReleaseEntity(
+        doi=doi,
+        title=clean(meta['title'], force_xml=True),
+        publisher=clean(meta['publisher']),
+        release_year=int(meta['publicationYear']),
+        release_type="dataset",
+        license_slug=license_slug,
+        contribs=contribs,
+        abstracts=abstracts,
+        extra=extra,
+    )
+    return r
+
+def make_release_fileset(dat_path):
+    """
+    Reads metadata from a locally cloned dat directory and returns a
+    (release, fileset) tuple of un-inserted entities.
+
+    The directory must be named after the 64-character dat discovery key.
+    """
+
+    if dat_path.endswith('/'):
+        dat_path = dat_path[:-1]
+    # only the directory name is the key; allows paths with parent directories
+    dat_discovery = os.path.basename(dat_path)
+    extra = dict()
+    assert len(dat_discovery) == 64
+
+    with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp:
+        meta_dict = json.loads(fp.read())
+
+    release = cdl_dash_release(meta_dict)
+    ark_id = release.extra['ark_id']
+
+    dash_version = None
+    # really crude XML parse-out
+    with open(dat_path + "/stash-wrapper.xml", 'r') as fp:
+        for line in fp:
+            line = line.strip()
+            if line.startswith("<st:version_number>"):
+                # the opening tag is 19 characters long
+                dash_version = int(line[19:].split('<')[0])
+    assert dash_version is not None
+    extra['cdl_dash'] = dict(version=dash_version)
+    release.extra['cdl_dash'] = dict(version=dash_version)
+
+    manifest = make_manifest(dat_path + "/files/")
+
+    bundle_url = dict(
+        url="https://merritt.cdlib.org/u/{}/{}".format(
+            urllib.parse.quote(ark_id, safe=''),
+            dash_version),
+        rel="repo-bundle")
+    repo_url = dict(
+        url="https://merritt.cdlib.org/d/{}/{}/".format(
+            urllib.parse.quote(ark_id, safe=''),
+            dash_version),
+        rel="repo")
+    dat_url = dict(
+        url="dat://{}/files/".format(dat_discovery),
+        rel="dweb")
+    fs = FilesetEntity(
+        urls=[bundle_url, repo_url, dat_url],
+        release_ids=None,
+        manifest=manifest,
+        extra=extra)
+    return (release, fs)
+
+def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):
+    """
+    Imports the dataset at dat_path as a fileset, matching or creating the
+    release it belongs to.
+
+    Returns (editgroup_id, release, fileset), or (None, None, None) if the
+    release already has a fileset.
+    """
+
+    git_rev = subprocess.check_output(
+        ["git", "describe", "--always"]).strip().decode('utf-8')
+
+    (release, fileset) = make_release_fileset(dat_path)
+
+    if not editgroup_id:
+        eg = api.create_editgroup(Editgroup(
+            description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
+            extra=dict(
+                git_rev=git_rev,
+                agent="fatcat_tools.auto_cdl_dash_dat")))
+        editgroup_id = eg.editgroup_id
+
+    if not release_id and release.doi:
+        try:
+            r = api.lookup_release(doi=release.doi)
+            release_id = r.ident
+        except fatcat_client.rest.ApiException as ae:
+            # a 404 only means there is no release for this DOI yet
+            if ae.status != 404:
+                raise
+    if not release_id:
+        edit = api.create_release(release, editgroup_id=editgroup_id)
+        release_id = edit.ident
+
+    release = api.get_release(release_id, expand="filesets")
+    # filesets may be unset as well as empty
+    if release.filesets:
+        print("A fileset already exists for release {}".format(release.ident))
+        return (None, None, None)
+
+    fileset.release_ids = [release.ident]
+    edit = api.create_fileset(fileset, editgroup_id=editgroup_id)
+    fileset = api.get_fileset(edit.ident)
+    return (editgroup_id, release, fileset)
+
+if __name__=='__main__':
+    # pass this a discovery key that has been cloned to the local directory
+    print(make_release_fileset(sys.argv[1]))
-- 
cgit v1.2.3