| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-19 23:00:50 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-19 23:00:50 -0700 | 
| commit | 1f6ef062c2aa8cd2b666a14d1906ccc26369b8ed (patch) | |
| tree | ab30bea42258f6b1c066efe5eff90b7110988954 | |
| parent | 3781f2ad3ced1930fa7771138e08f7947d864f68 (diff) | |
| download | fatcat-1f6ef062c2aa8cd2b666a14d1906ccc26369b8ed.tar.gz fatcat-1f6ef062c2aa8cd2b666a14d1906ccc26369b8ed.zip | |
importer for CDL/DASH dat pilot dweb datasets
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | python/fatcat_import.py | 30 |
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 |
| -rwxr-xr-x | python/fatcat_tools/importers/cdl_dash_dat.py | 199 |

3 files changed, 229 insertions, 1 deletion
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index ce5063de..aea8c757 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -65,9 +65,22 @@ def run_wayback_static(args):
         return
     print("release_id: {}".format(release_id))
     print("editgroup_id: {}".format(editgroup_id))
-    print("edit id: {}".format(wc.ident))
+    print("webcapture id: {}".format(wc.ident))
     print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
 
+def run_cdl_dash_dat(args):
+    api = args.api
+
+    # create it
+    (editgroup_id, release, fs) = auto_cdl_dash_dat(api, args.dat_path,
+        release_id=args.release_id, editgroup_id=args.editgroup_id)
+    if not fs:
+        return
+    print("release_id: {}".format(release.ident))
+    print("editgroup_id: {}".format(editgroup_id))
+    print("fileset id: {}".format(fs.ident))
+    print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--debug',
@@ -174,6 +187,21 @@ def main():
         type=str,
         help="use existing editgroup (instead of creating a new one)")
 
+    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat')
+    sub_cdl_dash_dat.set_defaults(
+        func=run_cdl_dash_dat,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+    sub_cdl_dash_dat.add_argument('dat_path',
+        type=str,
+        help="local path dat to import (must be the dat discovery key)")
+    sub_cdl_dash_dat.add_argument('--release-id',
+        type=str,
+        help="release entity identifier")
+    sub_cdl_dash_dat.add_argument('--editgroup-id',
+        type=str,
+        help="use existing editgroup (instead of creating a new one)")
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index fe3db59d..2112785b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -19,5 +19,6 @@ from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .wayback_static import auto_wayback_static
+from .cdl_dash_dat import auto_cdl_dash_dat
 #from .kafka_source import KafkaSource
 #from .file_source import FileSource
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
new file mode 100755
index 00000000..3da51cce
--- /dev/null
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import magic
+import urllib
+import hashlib
+import mimetypes
+import subprocess
+
+import fatcat_client
+from fatcat_client import *
+from .common import clean
+
+# TODO:
+LICENSE_MAP = {
+    "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
+}
+
+def single_file(prefix, path):
+
+    full = prefix + path
+    size_bytes = os.stat(full).st_size
+
+    hashes = [
+        hashlib.md5(),
+        hashlib.sha1(),
+        hashlib.sha256(),
+    ]
+    with open(full, 'rb') as fp:
+        while True:
+            data = fp.read(2**20)
+            if not data:
+                break
+            for h in hashes:
+                h.update(data)
+    mime = magic.Magic(mime=True).from_file(full)
+    if mime == 'application/octet-stream':
+        # magic apparently isn't that great; try using filename as well
+        guess = mimetypes.guess_type(full)[0]
+        if guess:
+            mime = guess
+
+    fsm = FilesetEntityManifest(
+        path=path,
+        size=size_bytes,
+        md5=hashes[0].hexdigest(),
+        sha1=hashes[1].hexdigest(),
+        sha256=hashes[2].hexdigest(),
+        extra=dict(mimetype=mime))
+    return fsm
+
+def make_manifest(base_dir):
+    manifest = []
+    for root, dirs, files in os.walk(base_dir):
+        for f in files:
+            manifest.append(single_file(root, f))
+    return manifest
+
+
+def cdl_dash_release(meta, extra=None):
+
+    if not extra:
+        extra = dict()
+
+    assert meta['identifier']['type'] == 'DOI'
+    doi = meta['identifier']['value'].lower()
+    assert doi.startswith('10.')
+
+    ark_id = None
+    for extid in meta.get('alternativeIdentifiers', []):
+        if extid['value'].startswith('ark:'):
+            ark_id = extid['value']
+    assert ark_id
+    extra['ark_id'] = ark_id
+
+    license_slug = LICENSE_MAP.get(meta['rights']['uri'])
+    if meta['rights']['uri'] == 'https://creativecommons.org/licenses/by/4.0/':
+        license_slug = 'CC-BY'
+
+    abstracts = []
+    for desc in meta['descriptions']:
+        if desc['type'] == "abstract":
+            abstracts.append(ReleaseEntityAbstracts(
+                mimetype="text/html",
+                content=clean(desc['value'])))
+            #print(abstracts)
+    if not abstracts:
+        abstracts = None
+
+    contribs = []
+    for creator in meta['creator']:
+        contribs.append(ReleaseContrib(
+            # TODO: given_name=creator['given'],
+            # TODO: surname=creator['family'],
+            # sorry everybody
+            raw_name="{} {}".format(creator['given'], creator['family']),
+            raw_affiliation=creator.get('affiliation'),
+            role="author", # presumably, for these datasets?
+        ))
+
+    r = ReleaseEntity(
+        doi=doi,
+        title=clean(meta['title'], force_xml=True),
+        publisher=clean(meta['publisher']),
+        release_year=int(meta['publicationYear']),
+        release_type="dataset",
+        contribs=contribs,
+        abstracts=abstracts,
+        extra=extra,
+    )
+    return r
+
+def make_release_fileset(dat_path):
+
+    if dat_path.endswith('/'):
+        dat_path = dat_path[:-1]
+    dat_discovery = dat_path
+    extra = dict()
+    assert len(dat_discovery) == 64
+
+    with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp:
+        meta_dict = json.loads(fp.read())
+
+    release = cdl_dash_release(meta_dict)
+    ark_id = release.extra['ark_id']
+
+    dash_version = None
+    # really crude XML parse-out
+    with open(dat_path + "/stash-wrapper.xml", 'r') as fp:
+        for line in fp:
+            line = line.strip()
+            if line.startswith("<st:version_number>"):
+                dash_version = int(line[19:].split('<')[0])
+    assert dash_version is not None
+    extra['cdl_dash'] = dict(version=dash_version)
+    release.extra['cdl_dash'] = dict(version=dash_version)
+
+    manifest = make_manifest(dat_path + "/files/")
+
+    bundle_url = dict(
+        url="https://merritt.cdlib.org/u/{}/{}".format(
+            urllib.parse.quote(ark_id, safe=''),
+            dash_version),
+        rel="repo-bundle")
+    repo_url = dict(
+        url="https://merritt.cdlib.org/d/{}/{}/".format(
+            urllib.parse.quote(ark_id, safe=''),
+            dash_version),
+        rel="repo")
+    dat_url = dict(
+        url="dat://{}/files/".format(dat_discovery),
+        rel="dweb")
+    fs = FilesetEntity(
+        urls=[bundle_url, repo_url, dat_url],
+        release_ids=None,
+        manifest=manifest,
+        extra=extra)
+    return (release, fs)
+
+def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):
+
+    git_rev = subprocess.check_output(
+        ["git", "describe", "--always"]).strip().decode('utf-8')
+
+    (release, fileset) = make_release_fileset(dat_path)
+
+    if not editgroup_id:
+        eg = api.create_editgroup(Editgroup(
+            description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
+            extra=dict(
+                git_rev=git_rev,
+                agent="fatcat_tools.auto_cdl_dash_dat")))
+        editgroup_id = eg.editgroup_id
+
+    if not release_id and release.doi:
+        try:
+            r = api.lookup_release(doi=release.doi)
+            release_id = r.ident
+        except fatcat_client.rest.ApiException:
+            pass
+    if not release_id:
+        edit = api.create_release(release, editgroup_id=editgroup_id)
+        release_id = edit.ident
+
+    release = api.get_release(release_id, expand="filesets")
+    if len(release.filesets):
+        print("A fileset already exists for release {}".format(release.ident))
+        return (None, None, None)
+
+    fileset.release_ids = [release.ident]
+    edit = api.create_fileset(fileset, editgroup_id=editgroup_id)
+    fileset = api.get_fileset(edit.ident)
+    return (editgroup_id, release, fileset)
+
+if __name__=='__main__':
+    # pass this a discovery key that has been cloned to the local directory
+    print(make_release_fileset(sys.argv[1]))
```
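For reference, here is a minimal sketch (not part of the commit) of driving the new importer directly from Python, mirroring the `run_cdl_dash_dat()` wrapper added above. The `fatcat_client.DefaultApi(fatcat_client.ApiClient())` construction is an assumption about the generated client bindings; in the real CLI an authenticated client is built inside `fatcat_import.py` using the token variable named in `auth_var` (`FATCAT_API_AUTH_TOKEN`).

```python
# Hypothetical usage sketch, not part of this commit.
# Assumes the Dat archive has already been cloned into a local directory whose
# name is its 64-character discovery key (make_release_fileset() asserts this).
import fatcat_client
from fatcat_tools.importers import auto_cdl_dash_dat

# Assumption: default client construction; the CLI instead wires in an
# authenticated client via the FATCAT_API_AUTH_TOKEN environment variable.
api = fatcat_client.DefaultApi(fatcat_client.ApiClient())

(editgroup_id, release, fileset) = auto_cdl_dash_dat(api, "<dat-discovery-key>")
if fileset:
    print("editgroup_id: {}".format(editgroup_id))
    print("fileset id: {}".format(fileset.ident))
    print("link: https://fatcat.wiki/fileset/{}".format(fileset.ident))
```

The same flow is exposed on the command line as `./fatcat_import.py cdl-dash-dat <dat_path>`, with `--release-id` and `--editgroup-id` as optional overrides per the argparse help text above.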
