| author | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
|---|---|---|
| committer | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
| commit | 6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch) | |
| tree | 1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers/cdl_dash_dat.py | |
| parent | 7e3f91f1a49ea85707cae31125021ba761f5373d (diff) | |
| parent | 6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff) | |
| download | fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz | fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip |
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations
Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes.
The Datacite-specific stuff could use review here.
Remove unused/deprecated/dead code:
- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, was only used for initial bulk imports (and maybe should not have been used)
Refactors:
- moved a number of large datastructures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all of them, just the ones that were either generic or very large (and were making the code hard to read)
- shuffled around relative imports and some function names ("clean_str" vs. "clean"); a sketch of both refactors follows this list
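A hedged sketch of the shape of those two refactors. Only `biblio_lookup_tables.py` and the `clean_str` name come from this change; `EXAMPLE_TYPE_MAP`, `release_type_for`, and the stand-in helper body are made up for illustration:

```python
from typing import Optional

# --- biblio_lookup_tables.py (sketch) ---------------------------------------
# Large static mappings now live in a dedicated module instead of being
# defined inline in each importer. The table below is a made-up example.
EXAMPLE_TYPE_MAP = {
    "dataset": "dataset",
    "journal-article": "article-journal",
}


# --- an importer module (sketch) --------------------------------------------
# The real importer would use relative imports, e.g.
# `from .biblio_lookup_tables import EXAMPLE_TYPE_MAP` and
# `from .common import clean_str`; a stand-in helper is defined here so the
# sketch is self-contained and runnable.
def clean_str(raw: Optional[str]) -> Optional[str]:
    # trim whitespace and collapse empty strings to None
    if raw is None:
        return None
    cleaned = raw.strip()
    return cleaned or None


def release_type_for(raw_type: str) -> str:
    # map a raw source type onto a release type, with a generic fallback
    cleaned = clean_str(raw_type) or ""
    return EXAMPLE_TYPE_MAP.get(cleaned, "article")


print(release_type_for(" journal-article "))  # article-journal
```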
Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs; that was doing more harm than good (some DOIs really do contain double slashes!); see the sketch after this list
- remove some excess metadata from datacite 'extra' fields
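To illustrate the double-slash point: collapsing "//" silently changes a registered identifier, so the importers now leave the slashes alone. These helpers and the DOI below are placeholders for illustration, not the importers' actual code:

```python
def normalize_doi_old(doi: str) -> str:
    # old behavior (sketch): lowercase and collapse double slashes, which
    # corrupts DOIs whose registered form really does contain "//"
    return doi.lower().replace("//", "/")


def normalize_doi_new(doi: str) -> str:
    # new behavior (sketch): lowercase only; keep slashes exactly as given
    return doi.lower()


doi = "10.1234/abc//xyz"  # placeholder DOI with a double slash in its suffix
print(normalize_doi_old(doi))  # 10.1234/abc/xyz  -- identifier silently changed
print(normalize_doi_new(doi))  # 10.1234/abc//xyz -- preserved as registered
```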
Diffstat (limited to 'python/fatcat_tools/importers/cdl_dash_dat.py')
-rwxr-xr-x | python/fatcat_tools/importers/cdl_dash_dat.py | 219 |
1 file changed, 0 insertions(+), 219 deletions(-)
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index 1a4114a0..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    FilesetEntity,
-    FilesetFile,
-    ReleaseAbstract,
-    ReleaseContrib,
-    ReleaseEntity,
-    ReleaseExtIds,
-)
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
-    full = prefix + path
-    size_bytes = os.stat(full).st_size
-
-    hashes = [
-        hashlib.md5(),
-        hashlib.sha1(),
-        hashlib.sha256(),
-    ]
-    with open(full, "rb") as fp:
-        while True:
-            data = fp.read(2 ** 20)
-            if not data:
-                break
-            for h in hashes:
-                h.update(data)
-    mime = magic.Magic(mime=True).from_file(full)
-    if mime == "application/octet-stream":
-        # magic apparently isn't that great; try using filename as well
-        guess = mimetypes.guess_type(full)[0]
-        if guess:
-            mime = guess
-
-    fsf = FilesetFile(
-        path=path,
-        size=size_bytes,
-        md5=hashes[0].hexdigest(),
-        sha1=hashes[1].hexdigest(),
-        sha256=hashes[2].hexdigest(),
-        extra=dict(mimetype=mime),
-    )
-    return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
-    manifest = []
-    for root, dirs, files in os.walk(base_dir):
-        for f in files:
-            manifest.append(single_file(root, f))
-    return manifest
-
-
-def cdl_dash_release(
-    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
-    if not extra:
-        extra = dict()
-
-    assert meta["identifier"]["type"] == "DOI"
-    doi = meta["identifier"]["value"].lower()
-    assert doi.startswith("10.")
-
-    ark_id = None
-    for extid in meta.get("alternativeIdentifiers", []):
-        if extid["value"].startswith("ark:"):
-            ark_id = extid["value"]
-    assert ark_id
-
-    license_slug = lookup_license_slug(meta["rights"]["uri"])
-
-    abstracts = []
-    for desc in meta["descriptions"]:
-        if desc["type"] == "abstract":
-            abstracts.append(
-                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
-            )
-    # print(abstracts)
-
-    contribs = []
-    for creator in meta["creator"]:
-        contribs.append(
-            ReleaseContrib(
-                given_name=creator["given"],
-                surname=creator["family"],
-                # sorry everybody
-                raw_name="{} {}".format(creator["given"], creator["family"]),
-                raw_affiliation=creator.get("affiliation"),
-                role="author",  # presumably, for these datasets?
-            )
-        )
-
-    r = ReleaseEntity(
-        ext_ids=ReleaseExtIds(
-            doi=doi,
-            ark=ark_id,
-        ),
-        title=clean(meta["title"], force_xml=True),
-        publisher=clean(meta["publisher"]),
-        release_year=int(meta["publicationYear"]),
-        release_type="dataset",
-        license_slug=license_slug,
-        contribs=contribs,
-        abstracts=abstracts or None,
-        extra=extra,
-    )
-    return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
-    if dat_path.endswith("/"):
-        dat_path = dat_path[:-1]
-    dat_discovery = dat_path
-    extra = dict()
-    assert len(dat_discovery) == 64
-
-    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
-        meta_dict = json.loads(fp.read())
-
-    release = cdl_dash_release(meta_dict)
-    ark_id = release.extra["ark_id"]
-
-    dash_version = None
-    # really crude XML parse-out
-    with open(dat_path + "/stash-wrapper.xml", "r") as fp:
-        for line in fp:
-            line = line.strip()
-            if line.startswith("<st:version_number>"):
-                dash_version = int(line[19:].split("<")[0])
-    assert dash_version is not None
-    extra["cdl_dash"] = dict(version=dash_version)
-    release.extra["cdl_dash"] = dict(version=dash_version)
-
-    manifest = make_manifest(dat_path + "/files/")
-
-    bundle_url = dict(
-        url="https://merritt.cdlib.org/u/{}/{}".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo-bundle",
-    )
-    repo_url = dict(
-        url="https://merritt.cdlib.org/d/{}/{}/".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo",
-    )
-    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
-    fs = FilesetEntity(
-        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
-    )
-    return (release, fs)
-
-
-def auto_cdl_dash_dat(
-    api: ApiClient,
-    dat_path: str,
-    release_id: Optional[str] = None,
-    editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    (release, fileset) = make_release_fileset(dat_path)
-
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-
-    if not release_id and release.ext_ids.doi:
-        try:
-            r = api.lookup_release(doi=release.ext_ids.doi)
-            release_id = r.ident
-        except fatcat_openapi_client.rest.ApiException:
-            pass
-    if not release_id:
-        edit = api.create_release(eg.editgroup_id, release)
-        release_id = edit.ident
-
-    release = api.get_release(release_id, expand="filesets")
-    if len(release.filesets):
-        print("A fileset already exists for release {}".format(release.ident))
-        return (None, None, None)
-
-    fileset.release_ids = [release.ident]
-    edit = api.create_fileset(eg.editgroup_id, fileset)
-    fileset = api.get_fileset(edit.ident)
-    return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
-    # pass this a discovery key that has been cloned to the local directory
-    print(make_release_fileset(sys.argv[1]))