 python/fatcat_import.py                         |  86 --------
 python/fatcat_tools/importers/__init__.py       |   2 --
 python/fatcat_tools/importers/cdl_dash_dat.py   | 221 ----------------------
 python/fatcat_tools/importers/wayback_static.py | 287 ----------------------
 4 files changed, 0 insertions(+), 596 deletions(-)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 39ef200a..33679868 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -42,8 +42,6 @@ from fatcat_tools.importers import (
     SavePaperNowWebImporter,
     ShadowLibraryImporter,
     SqlitePusher,
-    auto_cdl_dash_dat,
-    auto_wayback_static,
 )
 
 # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
@@ -315,53 +313,6 @@ def run_shadow_lib(args: argparse.Namespace) -> None:
     JsonLinePusher(fmi, args.json_file).run()
 
 
-def run_wayback_static(args: argparse.Namespace) -> None:
-    api = args.api
-
-    # find the release
-    if args.release_id:
-        release_id = args.release_id
-    elif args.extid:
-        idtype = args.extid.split(":")[0]
-        extid = ":".join(args.extid.split(":")[1:])
-        if idtype == "doi":
-            release_id = api.lookup_release(doi=extid).ident
-        elif idtype == "pmid":
-            release_id = api.lookup_release(pmid=extid).ident
-        elif idtype == "wikidata":
-            release_id = api.lookup_release(wikidata_qid=extid).ident
-        else:
-            raise NotImplementedError("extid type: {}".format(idtype))
-    else:
-        raise Exception("need either release_id or extid argument")
-
-    # create it
-    (editgroup_id, wc) = auto_wayback_static(
-        api, release_id, args.wayback_url, editgroup_id=args.editgroup_id
-    )
-    if not wc:
-        return
-    print("release_id: {}".format(release_id))
-    print("editgroup_id: {}".format(editgroup_id))
-    print("webcapture id: {}".format(wc.ident))
-    print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
-
-
-def run_cdl_dash_dat(args: argparse.Namespace) -> None:
-    api = args.api
-
-    # create it
-    (editgroup_id, release, fs) = auto_cdl_dash_dat(
-        api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id
-    )
-    if not (fs and release):
-        return
-    print("release_id: {}".format(release.ident))
-    print("editgroup_id: {}".format(editgroup_id))
-    print("fileset id: {}".format(fs.ident))
-    print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
-
-
 def run_datacite(args: argparse.Namespace) -> None:
     dci = DataciteImporter(
         args.api,
@@ -899,43 +850,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
 
-    sub_wayback_static = subparsers.add_parser(
-        "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback"
-    )
-    sub_wayback_static.set_defaults(
-        func=run_wayback_static,
-        auth_var="FATCAT_API_AUTH_TOKEN",
-    )
-    sub_wayback_static.add_argument(
-        "wayback_url", type=str, help="URL of wayback capture to extract from"
-    )
-    sub_wayback_static.add_argument(
-        "--extid", type=str, help="external identifier for release lookup"
-    )
-    sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier")
-    sub_wayback_static.add_argument(
-        "--editgroup-id",
-        type=str,
-        help="use existing editgroup (instead of creating a new one)",
-    )
-
-    sub_cdl_dash_dat = subparsers.add_parser(
-        "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project"
-    )
-    sub_cdl_dash_dat.set_defaults(
-        func=run_cdl_dash_dat,
-        auth_var="FATCAT_API_AUTH_TOKEN",
-    )
-    sub_cdl_dash_dat.add_argument(
-        "dat_path", type=str, help="local path dat to import (must be the dat discovery key)"
-    )
-    sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier")
-    sub_cdl_dash_dat.add_argument(
-        "--editgroup-id",
-        type=str,
-        help="use existing editgroup (instead of creating a new one)",
-    )
-
     sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
     sub_datacite.add_argument(
         "json_file",
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 06ecfd58..223ae526 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -13,7 +13,6 @@ To run an import you combine two classes; one each of:
 
 from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
 from .arxiv import ArxivRawImporter
-from .cdl_dash_dat import auto_cdl_dash_dat
 from .chocula import ChoculaImporter
 from .common import (
     LANG_MAP_MARC,
@@ -55,4 +54,3 @@ from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .pubmed import PubmedImporter
 from .shadow import ShadowLibraryImporter
-from .wayback_static import auto_wayback_static
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index ec557e15..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    FilesetEntity,
-    FilesetFile,
-    ReleaseAbstract,
-    ReleaseContrib,
-    ReleaseEntity,
-    ReleaseExtIds,
-)
-
-from fatcat_tools.normal import clean_doi
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
-    full = prefix + path
-    size_bytes = os.stat(full).st_size
-
-    hashes = [
-        hashlib.md5(),
-        hashlib.sha1(),
-        hashlib.sha256(),
-    ]
-    with open(full, "rb") as fp:
-        while True:
-            data = fp.read(2 ** 20)
-            if not data:
-                break
-            for h in hashes:
-                h.update(data)
-    mime = magic.Magic(mime=True).from_file(full)
-    if mime == "application/octet-stream":
-        # magic apparently isn't that great; try using filename as well
-        guess = mimetypes.guess_type(full)[0]
-        if guess:
-            mime = guess
-
-    fsf = FilesetFile(
-        path=path,
-        size=size_bytes,
-        md5=hashes[0].hexdigest(),
-        sha1=hashes[1].hexdigest(),
-        sha256=hashes[2].hexdigest(),
-        extra=dict(mimetype=mime),
-    )
-    return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
-    manifest = []
-    for root, dirs, files in os.walk(base_dir):
-        for f in files:
-            manifest.append(single_file(root, f))
-    return manifest
-
-
-def cdl_dash_release(
-    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
-    if not extra:
-        extra = dict()
-
-    assert meta["identifier"]["type"] == "DOI"
-    doi = clean_doi(meta["identifier"]["value"].lower())
-    assert doi and doi.startswith("10.")
-
-    ark_id = None
-    for extid in meta.get("alternativeIdentifiers", []):
-        if extid["value"].startswith("ark:"):
-            ark_id = extid["value"]
-    assert ark_id
-
-    license_slug = lookup_license_slug(meta["rights"]["uri"])
-
-    abstracts = []
-    for desc in meta["descriptions"]:
-        if desc["type"] == "abstract":
-            abstracts.append(
-                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
-            )
-            # print(abstracts)
-
-    contribs = []
-    for creator in meta["creator"]:
-        contribs.append(
-            ReleaseContrib(
-                given_name=creator["given"],
-                surname=creator["family"],
-                # sorry everybody
-                raw_name="{} {}".format(creator["given"], creator["family"]),
-                raw_affiliation=creator.get("affiliation"),
-                role="author",  # presumably, for these datasets?
-            )
-        )
-
-    r = ReleaseEntity(
-        ext_ids=ReleaseExtIds(
-            doi=doi,
-            ark=ark_id,
-        ),
-        title=clean(meta["title"], force_xml=True),
-        publisher=clean(meta["publisher"]),
-        release_year=int(meta["publicationYear"]),
-        release_type="dataset",
-        license_slug=license_slug,
-        contribs=contribs,
-        abstracts=abstracts or None,
-        extra=extra,
-    )
-    return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
-    if dat_path.endswith("/"):
-        dat_path = dat_path[:-1]
-    dat_discovery = dat_path
-    extra = dict()
-    assert len(dat_discovery) == 64
-
-    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
-        meta_dict = json.loads(fp.read())
-
-    release = cdl_dash_release(meta_dict)
-    ark_id = release.extra["ark_id"]
-
-    dash_version = None
-    # really crude XML parse-out
-    with open(dat_path + "/stash-wrapper.xml", "r") as fp:
-        for line in fp:
-            line = line.strip()
-            if line.startswith("<st:version_number>"):
-                dash_version = int(line[19:].split("<")[0])
-    assert dash_version is not None
-    extra["cdl_dash"] = dict(version=dash_version)
-    release.extra["cdl_dash"] = dict(version=dash_version)
-
-    manifest = make_manifest(dat_path + "/files/")
-
-    bundle_url = dict(
-        url="https://merritt.cdlib.org/u/{}/{}".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo-bundle",
-    )
-    repo_url = dict(
-        url="https://merritt.cdlib.org/d/{}/{}/".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo",
-    )
-    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
-    fs = FilesetEntity(
-        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
-    )
-    return (release, fs)
-
-
-def auto_cdl_dash_dat(
-    api: ApiClient,
-    dat_path: str,
-    release_id: Optional[str] = None,
-    editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    (release, fileset) = make_release_fileset(dat_path)
-
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-
-    if not release_id and release.ext_ids.doi:
-        try:
-            r = api.lookup_release(doi=release.ext_ids.doi)
-            release_id = r.ident
-        except fatcat_openapi_client.rest.ApiException:
-            pass
-    if not release_id:
-        edit = api.create_release(eg.editgroup_id, release)
-        release_id = edit.ident
-
-    release = api.get_release(release_id, expand="filesets")
-    if len(release.filesets):
-        print("A fileset already exists for release {}".format(release.ident))
-        return (None, None, None)
-
-    fileset.release_ids = [release.ident]
-    edit = api.create_fileset(eg.editgroup_id, fileset)
-    fileset = api.get_fileset(edit.ident)
-    return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
-    # pass this a discovery key that has been cloned to the local directory
-    print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
deleted file mode 100755
index 5caed2c7..00000000
--- a/python/fatcat_tools/importers/wayback_static.py
+++ /dev/null
@@ -1,287 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Helpers to create Web Capture entities from extracted wayback content.
-
-Works as a stand-alone script (for debugging) or as library routines.
-"""
-
-import argparse
-import datetime
-import hashlib
-import json
-import subprocess
-import sys
-from typing import Any, Dict, List, Optional, Tuple
-
-import requests
-from bs4 import BeautifulSoup
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    EntityEdit,
-    WebcaptureCdxLine,
-    WebcaptureEntity,
-    WebcaptureUrl,
-)
-
-from .common import b32_hex
-
-CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
-GWB_URL_BASE = "https://web.archive.org/web"
-REQ_SESSION = requests.Session()
-
-
-def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]:
-    """Takes a wayback machine URL, and returns a tuple:
-
-    (timestamp, datetime, original_url)
-    """
-    chunks = url.split("/")
-    assert len(chunks) >= 6
-    assert chunks[2] == "web.archive.org"
-    assert chunks[3] == "web"
-    return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
-
-
-def test_parse_wbm_url() -> None:
-    u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
-    assert parse_wbm_url(u) == (
-        "20010712114837",
-        datetime.datetime(2001, 7, 12, 11, 48, 37),
-        "http://www.dlib.org/dlib/june01/reich/06reich.html",
-    )
-
-
-def parse_wbm_timestamp(timestamp: str) -> datetime.datetime:
-    """
-    Takes a complete WBM timestamp string (like "20020327115625") and returns a
-    python datetime object (UTC)
-    """
-    # strip any "im_" or "id_" suffix
-    if timestamp.endswith("_"):
-        timestamp = timestamp[:-3]
-    # inflexible; require the full second-precision timestamp
-    assert len(timestamp) == 14
-    return datetime.datetime(
-        year=int(timestamp[0:4]),
-        month=int(timestamp[4:6]),
-        day=int(timestamp[6:8]),
-        hour=int(timestamp[8:10]),
-        minute=int(timestamp[10:12]),
-        second=int(timestamp[12:14]),
-    )
-
-
-def test_parse_wbm_timestamp() -> None:
-    assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
-
-
-def fetch_wbm(url: str) -> bytes:
-    resp = REQ_SESSION.get(url)
-    resp.raise_for_status()
-    assert resp.content
-    return resp.content
-
-
-def lookup_cdx(
-    embed_url: str, verify_hashes: bool = True, cdx_output: Any = None
-) -> Optional[WebcaptureCdxLine]:
-    sys.stderr.write(embed_url + "\n")
-    assert embed_url.startswith("/web/")
-    embed_url_segments = embed_url.split("/")
-    timestamp = embed_url_segments[2]
-    if timestamp.endswith("_"):
-        timestamp = timestamp[:-3]
-    url = "/".join(embed_url_segments[3:])
-    # print((timestamp, url))
-    params: Dict = dict(
-        url=url,
-        closest=timestamp,
-        sort="closest",
-        resolveRevisits="true",
-        matchType="exact",
-        limit=1,
-    )
-    resp = REQ_SESSION.get(
-        CDX_API_BASE,
-        params=params,
-    )
-    resp.raise_for_status()
-    # print(resp.url)
-    if resp.content:
-        hit = resp.content.decode("utf-8").split("\n")[0]
-        if cdx_output:
-            cdx_output.write(hit + "\n")
-        cdx_chunks = hit.split(" ")
-        cdx = [x if (x and x != "-") else None for x in cdx_chunks]
-        webcapture_cdx = WebcaptureCdxLine(
-            surt=cdx[0],
-            timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
-            url=cdx[2],
-            mimetype=cdx[3],
-            status_code=int(cdx[4] or ""),
-            sha1=b32_hex(cdx[5] or ""),
-            sha256=None,
-        )
-        if verify_hashes:
-            resp = REQ_SESSION.get(
-                GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url)  # raw timestamp
-            )
-            resp.raise_for_status()
-            assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
-            webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
-            webcapture_cdx.size = len(resp.content)
-        return webcapture_cdx
-    else:
-        return None
-
-
-def wayback_url_to_relative(url: str) -> Optional[str]:
-    """
-    Wayback URLs can be relative or absolute in rewritten documents. This
-    function converts any form of rewritten URL to a relative (to
-    web.archive.org) one, or returns None if it isn't a rewritten URL at all.
-    """
-    if url.startswith("https://web.archive.org/"):
-        url = url[23:]
-    elif url.startswith("http://web.archive.org/"):
-        url = url[22:]
-
-    if url.startswith("/web/"):
-        return url
-    else:
-        return None
-
-
-def extract_embeds(soup: BeautifulSoup) -> List[str]:
-
-    embeds = set()
-
-    # <link href="">
-    for tag in soup.find_all("link", href=True):
-        if tag["rel"] not in ("stylesheet",):
-            continue
-        url = wayback_url_to_relative(tag["href"])
-        if url:
-            embeds.add(url)
-    # <img src="">
-    for tag in soup.find_all("img", src=True):
-        url = wayback_url_to_relative(tag["src"])
-        if url:
-            embeds.add(url)
-
-    # <script src="">
-    for tag in soup.find_all("script", src=True):
-        url = wayback_url_to_relative(tag["src"])
-        if url:
-            embeds.add(url)
-
-    return list(embeds)
-
-
-def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity:
-    """
-    Given a complete wayback machine capture URL, like:
-
-        http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html
-
-    Will return a new ("bare") fatcat webcapture entity python object, with all
-    the CDX entries filled in.
-    """
-
-    wbm_html = fetch_wbm(wayback_url)
-    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    # with open(rewritten_path, 'r') as fp:
-    #    soup = BeautifulSoup(fp, "lxml")
-    soup = BeautifulSoup(wbm_html, "lxml")
-    embeds = extract_embeds(soup)
-    cdx_obj = lookup_cdx(
-        "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
-    )
-    cdx_list = [cdx_obj]
-    for url in embeds:
-        cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
-        cdx_list.append(cdx_obj)
-    archive_urls = [
-        WebcaptureUrl(
-            rel="wayback",
-            url="https://web.archive.org/web/",
-        )
-    ]
-    wc = WebcaptureEntity(
-        cdx=cdx_list,
-        timestamp=timestamp.isoformat() + "Z",
-        original_url=original_url,
-        archive_urls=archive_urls,
-        release_ids=None,
-    )
-    return wc
-
-
-def auto_wayback_static(
-    api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None
-) -> Tuple[Optional[str], Optional[EntityEdit]]:
-    """
-    Returns a tuple: (editgroup_id, edit). If failed, both are None
-    """
-
-    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    release = api.get_release(release_id, expand="webcaptures")
-
-    # check for existing webcapture with same parameters
-    for wc in release.webcaptures:
-        if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
-            # skipping: already existed
-            print(
-                "release {} already had webcapture {} {}".format(
-                    release_id, raw_timestamp, original_url
-                )
-            )
-            return (None, None)
-
-    wc = static_wayback_webcapture(wayback_url)
-    assert len(wc.cdx) >= 1
-    wc.release_ids = [release_id]
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of static web content from wayback machine",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-    edit = api.create_webcapture(eg.editgroup_id, wc)
-    return (editgroup_id, edit)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--verbose", action="store_true", help="verbose output")
-    parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
-    parser.add_argument(
-        "--json-output",
-        type=argparse.FileType("w"),
-        default=sys.stdout,
-        help="where to write out webcapture entity (as JSON)",
-    )
-    parser.add_argument(
-        "--cdx-output",
-        type=argparse.FileType("w"),
-        default=None,
-        help="(optional) file to write out CDX stub",
-    )
-
-    args = parser.parse_args()
-
-    # entity-to-JSON code; duplicate of entity_to_dict()
-    api_client = ApiClient()
-    wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output)
-    wc_dict = api_client.sanitize_for_serialization(wc)
-    print(json.dumps(wc_dict))
-
-
-if __name__ == "__main__":
-    main()

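Likewise for the deleted wayback_static.py: it resolved each capture against the
public CDX API and verified the reported SHA-1 against the raw ("id_") wayback
payload. The network-dependent sketch below reuses the same endpoint and query
parameters as the removed lookup_cdx(), with its imported b32_hex helper replaced
by an inline base32 decode; the function name lookup_and_verify is invented here
and it skips the "-" placeholder handling the original had.

    import base64
    import hashlib
    from typing import Optional

    import requests

    CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
    GWB_URL_BASE = "https://web.archive.org/web"


    def lookup_and_verify(timestamp: str, url: str) -> Optional[dict]:
        """Find the CDX row closest to `timestamp` for `url` and check its SHA-1."""
        resp = requests.get(
            CDX_API_BASE,
            params={
                "url": url,
                "closest": timestamp,
                "sort": "closest",
                "resolveRevisits": "true",
                "matchType": "exact",
                "limit": 1,
            },
        )
        resp.raise_for_status()
        if not resp.content:
            return None
        surt, ts, original, mimetype, status, b32_sha1, *_ = resp.text.split("\n")[0].split(" ")
        # CDX reports the SHA-1 digest as base32; convert to hex for comparison
        sha1_hex = base64.b32decode(b32_sha1).hex()
        # the "id_" modifier requests the unmodified capture body (no wayback rewriting)
        raw = requests.get("{}/{}id_/{}".format(GWB_URL_BASE, ts, original))
        raw.raise_for_status()
        assert hashlib.sha1(raw.content).hexdigest() == sha1_hex
        return {
            "surt": surt,
            "timestamp": ts,
            "url": original,
            "mimetype": mimetype,
            "status": status,
            "sha1": sha1_hex,
            "sha256": hashlib.sha256(raw.content).hexdigest(),
            "size": len(raw.content),
        }

For example, lookup_and_verify("20010712114837", "http://www.dlib.org/dlib/june01/reich/06reich.html")
corresponds to the capture exercised by the removed test_parse_wbm_url().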