summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-03-19 23:00:50 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-03-19 23:00:50 -0700
commit: 1f6ef062c2aa8cd2b666a14d1906ccc26369b8ed (patch)
tree: ab30bea42258f6b1c066efe5eff90b7110988954 /python/fatcat_tools/importers
parent: 3781f2ad3ced1930fa7771138e08f7947d864f68 (diff)
download: fatcat-1f6ef062c2aa8cd2b666a14d1906ccc26369b8ed.tar.gz
fatcat-1f6ef062c2aa8cd2b666a14d1906ccc26369b8ed.zip
importer for CDL/DASH dat pilot dweb datasets
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/__init__.py 1
-rwxr-xr-x python/fatcat_tools/importers/cdl_dash_dat.py 199
2 files changed, 200 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index fe3db59d..2112785b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -19,5 +19,6 @@ from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
from .orcid import OrcidImporter
from .wayback_static import auto_wayback_static
+from .cdl_dash_dat import auto_cdl_dash_dat
#from .kafka_source import KafkaSource
#from .file_source import FileSource
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
new file mode 100755
index 00000000..3da51cce
--- /dev/null
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import magic
+import urllib
+import hashlib
+import mimetypes
+import subprocess
+
+import fatcat_client
+from fatcat_client import *
+from .common import clean
+
+# Maps license URIs (looked up from the 'rights' -> 'uri' field of the
+# metadata in cdl_dash_release()) to short license slugs.
+# TODO: (left unspecified by the original author; presumably more license
+# URIs are meant to be added here -- confirm)
+LICENSE_MAP = {
+ "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
+}
+
+def single_file(prefix, path):
+    """
+    Builds a fileset manifest entry for a single file on local disk.
+
+    prefix: directory containing the file (with or without a trailing slash)
+    path: path of the file relative to `prefix`; recorded verbatim as the
+        `path` of the manifest entry
+
+    Returns a FilesetEntityManifest carrying the file size in bytes, the
+    md5/sha1/sha256 hex digests, and the detected mimetype (in `extra`).
+    """
+
+    # os.path.join() rather than bare string concatenation, so that a prefix
+    # without a trailing separator (eg, the sub-directory roots yielded by
+    # os.walk()) still results in a valid path
+    full = os.path.join(prefix, path)
+    size_bytes = os.stat(full).st_size
+
+    md5 = hashlib.md5()
+    sha1 = hashlib.sha1()
+    sha256 = hashlib.sha256()
+    with open(full, 'rb') as fp:
+        # read in 1 MiB chunks so large files never need to fit in memory
+        for chunk in iter(lambda: fp.read(2**20), b''):
+            for h in (md5, sha1, sha256):
+                h.update(chunk)
+
+    mime = magic.Magic(mime=True).from_file(full)
+    if mime == 'application/octet-stream':
+        # magic apparently isn't that great; try using filename as well
+        guess = mimetypes.guess_type(full)[0]
+        if guess:
+            mime = guess
+
+    return FilesetEntityManifest(
+        path=path,
+        size=size_bytes,
+        md5=md5.hexdigest(),
+        sha1=sha1.hexdigest(),
+        sha256=sha256.hexdigest(),
+        extra=dict(mimetype=mime))
+
+def make_manifest(base_dir):
+    """
+    Recursively walks `base_dir` and returns a list with one manifest entry
+    (as built by single_file()) for every file found.
+
+    Entry paths are relative to `base_dir`, so files inside sub-directories
+    keep their directory component instead of being recorded (and opened) by
+    bare filename.
+    """
+    # normalize to exactly one trailing separator, so the prefix can be
+    # combined with a relative path either by joining or by concatenation
+    prefix = os.path.join(base_dir, '')
+    manifest = []
+    for root, dirs, files in os.walk(base_dir):
+        # os.walk() yields entries in arbitrary order; sort (dirs in place, so
+        # the traversal itself is affected) to get a reproducible manifest
+        dirs.sort()
+        for f in sorted(files):
+            rel_path = os.path.relpath(os.path.join(root, f), base_dir)
+            manifest.append(single_file(prefix, rel_path))
+    return manifest
+
+
+def cdl_dash_release(meta, extra=None):
+    """
+    Converts a CDL/DASH metadata record into a ReleaseEntity with
+    release_type "dataset".
+
+    meta: metadata dict; the keys read here are 'identifier',
+        'alternativeIdentifiers' (optional), 'rights', 'descriptions',
+        'creator', 'title', 'publisher', and 'publicationYear'
+    extra: optional dict of additional fields for the release `extra`; it is
+        copied, so the caller's dict is never modified
+
+    The ARK identifier is stored under extra['ark_id'].
+
+    Raises AssertionError if the identifier is not a DOI starting with "10.",
+    or if no "ark:" alternative identifier is present; KeyError if one of the
+    required metadata keys is missing.
+    """
+
+    # copy, so that a dict passed in by the caller is not mutated below
+    extra = dict(extra) if extra else dict()
+
+    assert meta['identifier']['type'] == 'DOI'
+    doi = meta['identifier']['value'].lower()
+    assert doi.startswith('10.')
+
+    # if several "ark:" identifiers are listed, the last one wins
+    ark_id = None
+    for extid in meta.get('alternativeIdentifiers', []):
+        if extid['value'].startswith('ark:'):
+            ark_id = extid['value']
+    assert ark_id
+    extra['ark_id'] = ark_id
+
+    # license URIs not present in LICENSE_MAP result in None (no license)
+    license_slug = LICENSE_MAP.get(meta['rights']['uri'])
+
+    abstracts = [
+        ReleaseEntityAbstracts(
+            mimetype="text/html",
+            content=clean(desc['value']))
+        for desc in meta['descriptions']
+        if desc['type'] == "abstract"
+    ]
+
+    contribs = [
+        ReleaseContrib(
+            # TODO: given_name=creator['given'],
+            # TODO: surname=creator['family'],
+            # sorry everybody
+            raw_name="{} {}".format(creator['given'], creator['family']),
+            raw_affiliation=creator.get('affiliation'),
+            role="author", # presumably, for these datasets?
+        )
+        for creator in meta['creator']
+    ]
+
+    return ReleaseEntity(
+        doi=doi,
+        title=clean(meta['title'], force_xml=True),
+        publisher=clean(meta['publisher']),
+        release_year=int(meta['publicationYear']),
+        release_type="dataset",
+        # previously computed but never attached to the release
+        license_slug=license_slug,
+        contribs=contribs,
+        # an empty list is passed as None, as before
+        abstracts=abstracts or None,
+        extra=extra,
+    )
+
+def make_release_fileset(dat_path):
+    """
+    Builds a (ReleaseEntity, FilesetEntity) pair from a local clone of a
+    CDL/DASH dat archive.
+
+    dat_path: path of a directory whose name is the 64 character dat
+        discovery key, and which contains "cdl_dash_metadata.json",
+        "stash-wrapper.xml", and a "files/" sub-directory. Leading directories
+        and trailing slashes are accepted.
+
+    Nothing is submitted to the API here, and `release_ids` of the fileset is
+    left as None for the caller to fill in.
+
+    Raises AssertionError if the directory name is not 64 characters long, or
+    if no <st:version_number> line is found in "stash-wrapper.xml".
+    """
+    # function-scope import: a bare `import urllib` does not guarantee that
+    # the `urllib.parse` submodule has been loaded
+    import urllib.parse
+
+    dat_path = dat_path.rstrip('/')
+    # the discovery key is the directory name itself, regardless of any
+    # leading directories in the path
+    dat_discovery = os.path.basename(dat_path)
+    assert len(dat_discovery) == 64
+
+    with open(os.path.join(dat_path, "cdl_dash_metadata.json"), 'r') as fp:
+        meta_dict = json.load(fp)
+
+    release = cdl_dash_release(meta_dict)
+    ark_id = release.extra['ark_id']
+
+    dash_version = None
+    version_tag = "<st:version_number>"
+    # really crude XML parse-out; only matches when the tag starts a line,
+    # and the last matching line wins
+    with open(os.path.join(dat_path, "stash-wrapper.xml"), 'r') as fp:
+        for line in fp:
+            line = line.strip()
+            if line.startswith(version_tag):
+                dash_version = int(line[len(version_tag):].split('<')[0])
+    assert dash_version is not None
+    # separate dict objects, so release and fileset `extra` are not aliased
+    extra = dict(cdl_dash=dict(version=dash_version))
+    release.extra['cdl_dash'] = dict(version=dash_version)
+
+    manifest = make_manifest(dat_path + "/files/")
+
+    # safe='' so that the slashes within the ARK are percent-encoded too
+    quoted_ark = urllib.parse.quote(ark_id, safe='')
+    bundle_url = dict(
+        url="https://merritt.cdlib.org/u/{}/{}".format(
+            quoted_ark,
+            dash_version),
+        rel="repo-bundle")
+    repo_url = dict(
+        url="https://merritt.cdlib.org/d/{}/{}/".format(
+            quoted_ark,
+            dash_version),
+        rel="repo")
+    dat_url = dict(
+        url="dat://{}/files/".format(dat_discovery),
+        rel="dweb")
+    fs = FilesetEntity(
+        urls=[bundle_url, repo_url, dat_url],
+        release_ids=None,
+        manifest=manifest,
+        extra=extra)
+    return (release, fs)
+
+def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):
+    """
+    Imports a single CDL/DASH dat archive (see make_release_fileset()) using
+    the API client `api`.
+
+    api: API client; the methods called here are create_editgroup,
+        lookup_release, create_release, get_release, create_fileset, and
+        get_fileset
+    dat_path: passed through to make_release_fileset()
+    release_id: existing release to attach the fileset to; if not given, the
+        release is looked up by DOI, and created if the lookup finds nothing
+    editgroup_id: existing editgroup to add edits to; if not given, a new
+        editgroup is created
+
+    Returns (editgroup_id, release, fileset), or (None, None, None) if the
+    release already has a fileset (in which case no fileset is created).
+    """
+
+    # runs in the current working directory, and raises if `git describe`
+    # fails there
+    git_rev = subprocess.check_output(
+        ["git", "describe", "--always"]).strip().decode('utf-8')
+
+    (release, fileset) = make_release_fileset(dat_path)
+
+    if not editgroup_id:
+        eg = api.create_editgroup(Editgroup(
+            description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
+            extra=dict(
+                git_rev=git_rev,
+                agent="fatcat_tools.auto_cdl_dash_dat")))
+        editgroup_id = eg.editgroup_id
+
+    if not release_id and release.doi:
+        try:
+            r = api.lookup_release(doi=release.doi)
+            release_id = r.ident
+        except fatcat_client.rest.ApiException as ae:
+            # only "not found" means that the release needs to be created;
+            # any other API failure must not be silently ignored, or a
+            # duplicate release could be created below
+            if ae.status != 404:
+                raise
+    if not release_id:
+        edit = api.create_release(release, editgroup_id=editgroup_id)
+        release_id = edit.ident
+
+    release = api.get_release(release_id, expand="filesets")
+    # truthiness check: also tolerates `filesets` being None
+    if release.filesets:
+        print("A fileset already exists for release {}".format(release.ident))
+        return (None, None, None)
+
+    fileset.release_ids = [release.ident]
+    edit = api.create_fileset(fileset, editgroup_id=editgroup_id)
+    fileset = api.get_fileset(edit.ident)
+    return (editgroup_id, release, fileset)
+
+if __name__=='__main__':
+    # pass this a discovery key that has been cloned to the local directory
+    if len(sys.argv) < 2:
+        # explicit usage message instead of a bare IndexError traceback
+        print("usage: {} <dat-discovery-key>".format(sys.argv[0]),
+            file=sys.stderr)
+        sys.exit(1)
+    print(make_release_fileset(sys.argv[1]))