-rwxr-xr-x  python/fatcat_import.py                          |  86
-rw-r--r--  python/fatcat_tools/importers/__init__.py        |   2
-rwxr-xr-x  python/fatcat_tools/importers/cdl_dash_dat.py    | 221
-rwxr-xr-x  python/fatcat_tools/importers/wayback_static.py  | 287
4 files changed, 0 insertions, 596 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 39ef200a..33679868 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -42,8 +42,6 @@ from fatcat_tools.importers import (
     SavePaperNowWebImporter,
     ShadowLibraryImporter,
     SqlitePusher,
-    auto_cdl_dash_dat,
-    auto_wayback_static,
 )
 
 # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
@@ -315,53 +313,6 @@ def run_shadow_lib(args: argparse.Namespace) -> None:
     JsonLinePusher(fmi, args.json_file).run()
 
 
-def run_wayback_static(args: argparse.Namespace) -> None:
-    api = args.api
-
-    # find the release
-    if args.release_id:
-        release_id = args.release_id
-    elif args.extid:
-        idtype = args.extid.split(":")[0]
-        extid = ":".join(args.extid.split(":")[1:])
-        if idtype == "doi":
-            release_id = api.lookup_release(doi=extid).ident
-        elif idtype == "pmid":
-            release_id = api.lookup_release(pmid=extid).ident
-        elif idtype == "wikidata":
-            release_id = api.lookup_release(wikidata_qid=extid).ident
-        else:
-            raise NotImplementedError("extid type: {}".format(idtype))
-    else:
-        raise Exception("need either release_id or extid argument")
-
-    # create it
-    (editgroup_id, wc) = auto_wayback_static(
-        api, release_id, args.wayback_url, editgroup_id=args.editgroup_id
-    )
-    if not wc:
-        return
-    print("release_id: {}".format(release_id))
-    print("editgroup_id: {}".format(editgroup_id))
-    print("webcapture id: {}".format(wc.ident))
-    print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
-
-
-def run_cdl_dash_dat(args: argparse.Namespace) -> None:
-    api = args.api
-
-    # create it
-    (editgroup_id, release, fs) = auto_cdl_dash_dat(
-        api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id
-    )
-    if not (fs and release):
-        return
-    print("release_id: {}".format(release.ident))
-    print("editgroup_id: {}".format(editgroup_id))
-    print("fileset id: {}".format(fs.ident))
-    print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
-
-
 def run_datacite(args: argparse.Namespace) -> None:
     dci = DataciteImporter(
         args.api,
@@ -899,43 +850,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
 
-    sub_wayback_static = subparsers.add_parser(
-        "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback"
-    )
-    sub_wayback_static.set_defaults(
-        func=run_wayback_static,
-        auth_var="FATCAT_API_AUTH_TOKEN",
-    )
-    sub_wayback_static.add_argument(
-        "wayback_url", type=str, help="URL of wayback capture to extract from"
-    )
-    sub_wayback_static.add_argument(
-        "--extid", type=str, help="external identifier for release lookup"
-    )
-    sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier")
-    sub_wayback_static.add_argument(
-        "--editgroup-id",
-        type=str,
-        help="use existing editgroup (instead of creating a new one)",
-    )
-
-    sub_cdl_dash_dat = subparsers.add_parser(
-        "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project"
-    )
-    sub_cdl_dash_dat.set_defaults(
-        func=run_cdl_dash_dat,
-        auth_var="FATCAT_API_AUTH_TOKEN",
-    )
-    sub_cdl_dash_dat.add_argument(
-        "dat_path", type=str, help="local path dat to import (must be the dat discovery key)"
-    )
-    sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier")
-    sub_cdl_dash_dat.add_argument(
-        "--editgroup-id",
-        type=str,
-        help="use existing editgroup (instead of creating a new one)",
-    )
-
     sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
     sub_datacite.add_argument(
         "json_file",
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 06ecfd58..223ae526 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -13,7 +13,6 @@ To run an import you combine two classes; one each of:
 
 from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
 from .arxiv import ArxivRawImporter
-from .cdl_dash_dat import auto_cdl_dash_dat
 from .chocula import ChoculaImporter
 from .common import (
     LANG_MAP_MARC,
@@ -55,4 +54,3 @@ from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .pubmed import PubmedImporter
 from .shadow import ShadowLibraryImporter
-from .wayback_static import auto_wayback_static
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index ec557e15..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    FilesetEntity,
-    FilesetFile,
-    ReleaseAbstract,
-    ReleaseContrib,
-    ReleaseEntity,
-    ReleaseExtIds,
-)
-
-from fatcat_tools.normal import clean_doi
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
-    full = prefix + path
-    size_bytes = os.stat(full).st_size
-
-    hashes = [
-        hashlib.md5(),
-        hashlib.sha1(),
-        hashlib.sha256(),
-    ]
-    with open(full, "rb") as fp:
-        while True:
-            data = fp.read(2 ** 20)
-            if not data:
-                break
-            for h in hashes:
-                h.update(data)
-    mime = magic.Magic(mime=True).from_file(full)
-    if mime == "application/octet-stream":
-        # magic apparently isn't that great; try using filename as well
-        guess = mimetypes.guess_type(full)[0]
-        if guess:
-            mime = guess
-
-    fsf = FilesetFile(
-        path=path,
-        size=size_bytes,
-        md5=hashes[0].hexdigest(),
-        sha1=hashes[1].hexdigest(),
-        sha256=hashes[2].hexdigest(),
-        extra=dict(mimetype=mime),
-    )
-    return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
-    manifest = []
-    for root, dirs, files in os.walk(base_dir):
-        for f in files:
-            manifest.append(single_file(root, f))
-    return manifest
-
-
-def cdl_dash_release(
-    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
-    if not extra:
-        extra = dict()
-
-    assert meta["identifier"]["type"] == "DOI"
-    doi = clean_doi(meta["identifier"]["value"].lower())
-    assert doi and doi.startswith("10.")
-
-    ark_id = None
-    for extid in meta.get("alternativeIdentifiers", []):
-        if extid["value"].startswith("ark:"):
-            ark_id = extid["value"]
-    assert ark_id
-
-    license_slug = lookup_license_slug(meta["rights"]["uri"])
-
-    abstracts = []
-    for desc in meta["descriptions"]:
-        if desc["type"] == "abstract":
-            abstracts.append(
-                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
-            )
-    # print(abstracts)
-
-    contribs = []
-    for creator in meta["creator"]:
-        contribs.append(
-            ReleaseContrib(
-                given_name=creator["given"],
-                surname=creator["family"],
-                # sorry everybody
-                raw_name="{} {}".format(creator["given"], creator["family"]),
-                raw_affiliation=creator.get("affiliation"),
-                role="author",  # presumably, for these datasets?
-            )
-        )
-
-    r = ReleaseEntity(
-        ext_ids=ReleaseExtIds(
-            doi=doi,
-            ark=ark_id,
-        ),
-        title=clean(meta["title"], force_xml=True),
-        publisher=clean(meta["publisher"]),
-        release_year=int(meta["publicationYear"]),
-        release_type="dataset",
-        license_slug=license_slug,
-        contribs=contribs,
-        abstracts=abstracts or None,
-        extra=extra,
-    )
-    return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
-    if dat_path.endswith("/"):
-        dat_path = dat_path[:-1]
-    dat_discovery = dat_path
-    extra = dict()
-    assert len(dat_discovery) == 64
-
-    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
-        meta_dict = json.loads(fp.read())
-
-    release = cdl_dash_release(meta_dict)
-    ark_id = release.extra["ark_id"]
-
-    dash_version = None
-    # really crude XML parse-out
-    with open(dat_path + "/stash-wrapper.xml", "r") as fp:
-        for line in fp:
-            line = line.strip()
-            if line.startswith("<st:version_number>"):
-                dash_version = int(line[19:].split("<")[0])
-    assert dash_version is not None
-    extra["cdl_dash"] = dict(version=dash_version)
-    release.extra["cdl_dash"] = dict(version=dash_version)
-
-    manifest = make_manifest(dat_path + "/files/")
-
-    bundle_url = dict(
-        url="https://merritt.cdlib.org/u/{}/{}".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo-bundle",
-    )
-    repo_url = dict(
-        url="https://merritt.cdlib.org/d/{}/{}/".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo",
-    )
-    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
-    fs = FilesetEntity(
-        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
-    )
-    return (release, fs)
-
-
-def auto_cdl_dash_dat(
-    api: ApiClient,
-    dat_path: str,
-    release_id: Optional[str] = None,
-    editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    (release, fileset) = make_release_fileset(dat_path)
-
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-
-    if not release_id and release.ext_ids.doi:
-        try:
-            r = api.lookup_release(doi=release.ext_ids.doi)
-            release_id = r.ident
-        except fatcat_openapi_client.rest.ApiException:
-            pass
-    if not release_id:
-        edit = api.create_release(eg.editgroup_id, release)
-        release_id = edit.ident
-
-    release = api.get_release(release_id, expand="filesets")
-    if len(release.filesets):
-        print("A fileset already exists for release {}".format(release.ident))
-        return (None, None, None)
-
-    fileset.release_ids = [release.ident]
-    edit = api.create_fileset(eg.editgroup_id, fileset)
-    fileset = api.get_fileset(edit.ident)
-    return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
-    # pass this a discovery key that has been cloned to the local directory
-    print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
deleted file mode 100755
index 5caed2c7..00000000
--- a/python/fatcat_tools/importers/wayback_static.py
+++ /dev/null
@@ -1,287 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Helpers to create Web Capture entities from extracted wayback content.
-
-Works as a stand-alone script (for debugging) or as library routines.
-"""
-
-import argparse
-import datetime
-import hashlib
-import json
-import subprocess
-import sys
-from typing import Any, Dict, List, Optional, Tuple
-
-import requests
-from bs4 import BeautifulSoup
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    EntityEdit,
-    WebcaptureCdxLine,
-    WebcaptureEntity,
-    WebcaptureUrl,
-)
-
-from .common import b32_hex
-
-CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
-GWB_URL_BASE = "https://web.archive.org/web"
-REQ_SESSION = requests.Session()
-
-
-def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]:
-    """Takes a wayback machine URL, and returns a tuple:
-
-    (timestamp, datetime, original_url)
-    """
-    chunks = url.split("/")
-    assert len(chunks) >= 6
-    assert chunks[2] == "web.archive.org"
-    assert chunks[3] == "web"
-    return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
-
-
-def test_parse_wbm_url() -> None:
-    u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
-    assert parse_wbm_url(u) == (
-        "20010712114837",
-        datetime.datetime(2001, 7, 12, 11, 48, 37),
-        "http://www.dlib.org/dlib/june01/reich/06reich.html",
-    )
-
-
-def parse_wbm_timestamp(timestamp: str) -> datetime.datetime:
-    """
-    Takes a complete WBM timestamp string (like "20020327115625") and returns a
-    python datetime object (UTC)
-    """
-    # strip any "im_" or "id_" suffix
-    if timestamp.endswith("_"):
-        timestamp = timestamp[:-3]
-    # inflexible; require the full second-precision timestamp
-    assert len(timestamp) == 14
-    return datetime.datetime(
-        year=int(timestamp[0:4]),
-        month=int(timestamp[4:6]),
-        day=int(timestamp[6:8]),
-        hour=int(timestamp[8:10]),
-        minute=int(timestamp[10:12]),
-        second=int(timestamp[12:14]),
-    )
-
-
-def test_parse_wbm_timestamp() -> None:
-    assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
-
-
-def fetch_wbm(url: str) -> bytes:
-    resp = REQ_SESSION.get(url)
-    resp.raise_for_status()
-    assert resp.content
-    return resp.content
-
-
-def lookup_cdx(
-    embed_url: str, verify_hashes: bool = True, cdx_output: Any = None
-) -> Optional[WebcaptureCdxLine]:
-    sys.stderr.write(embed_url + "\n")
-    assert embed_url.startswith("/web/")
-    embed_url_segments = embed_url.split("/")
-    timestamp = embed_url_segments[2]
-    if timestamp.endswith("_"):
-        timestamp = timestamp[:-3]
-    url = "/".join(embed_url_segments[3:])
-    # print((timestamp, url))
-    params: Dict = dict(
-        url=url,
-        closest=timestamp,
-        sort="closest",
-        resolveRevisits="true",
-        matchType="exact",
-        limit=1,
-    )
-    resp = REQ_SESSION.get(
-        CDX_API_BASE,
-        params=params,
-    )
-    resp.raise_for_status()
-    # print(resp.url)
-    if resp.content:
-        hit = resp.content.decode("utf-8").split("\n")[0]
-        if cdx_output:
-            cdx_output.write(hit + "\n")
-        cdx_chunks = hit.split(" ")
-        cdx = [x if (x and x != "-") else None for x in cdx_chunks]
-        webcapture_cdx = WebcaptureCdxLine(
-            surt=cdx[0],
-            timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
-            url=cdx[2],
-            mimetype=cdx[3],
-            status_code=int(cdx[4] or ""),
-            sha1=b32_hex(cdx[5] or ""),
-            sha256=None,
-        )
-        if verify_hashes:
-            resp = REQ_SESSION.get(
-                GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url)  # raw timestamp
-            )
-            resp.raise_for_status()
-            assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
-            webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
-            webcapture_cdx.size = len(resp.content)
-        return webcapture_cdx
-    else:
-        return None
-
-
-def wayback_url_to_relative(url: str) -> Optional[str]:
-    """
-    Wayback URLs can be relative or absolute in rewritten documents. This
-    function converts any form of rewritten URL to a relative (to
-    web.archive.org) one, or returns None if it isn't a rewritten URL at all.
-    """
-    if url.startswith("https://web.archive.org/"):
-        url = url[23:]
-    elif url.startswith("http://web.archive.org/"):
-        url = url[22:]
-
-    if url.startswith("/web/"):
-        return url
-    else:
-        return None
-
-
-def extract_embeds(soup: BeautifulSoup) -> List[str]:
-
-    embeds = set()
-
-    # <link href="">
-    for tag in soup.find_all("link", href=True):
-        if tag["rel"] not in ("stylesheet",):
-            continue
-        url = wayback_url_to_relative(tag["href"])
-        if url:
-            embeds.add(url)
-    # <img src="">
-    for tag in soup.find_all("img", src=True):
-        url = wayback_url_to_relative(tag["src"])
-        if url:
-            embeds.add(url)
-
-    # <script src="">
-    for tag in soup.find_all("script", src=True):
-        url = wayback_url_to_relative(tag["src"])
-        if url:
-            embeds.add(url)
-
-    return list(embeds)
-
-
-def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity:
-    """
-    Given a complete wayback machine capture URL, like:
-
-    http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html
-
-    Will return a new ("bare") fatcat webcapture entity python object, with all
-    the CDX entries filled in.
-    """
-
-    wbm_html = fetch_wbm(wayback_url)
-    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    # with open(rewritten_path, 'r') as fp:
-    #    soup = BeautifulSoup(fp, "lxml")
-    soup = BeautifulSoup(wbm_html, "lxml")
-    embeds = extract_embeds(soup)
-    cdx_obj = lookup_cdx(
-        "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
-    )
-    cdx_list = [cdx_obj]
-    for url in embeds:
-        cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
-        cdx_list.append(cdx_obj)
-    archive_urls = [
-        WebcaptureUrl(
-            rel="wayback",
-            url="https://web.archive.org/web/",
-        )
-    ]
-    wc = WebcaptureEntity(
-        cdx=cdx_list,
-        timestamp=timestamp.isoformat() + "Z",
-        original_url=original_url,
-        archive_urls=archive_urls,
-        release_ids=None,
-    )
-    return wc
-
-
-def auto_wayback_static(
-    api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None
-) -> Tuple[Optional[str], Optional[EntityEdit]]:
-    """
-    Returns a tuple: (editgroup_id, edit). If failed, both are None
-    """
-
-    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    release = api.get_release(release_id, expand="webcaptures")
-
-    # check for existing webcapture with same parameters
-    for wc in release.webcaptures:
-        if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
-            # skipping: already existed
-            print(
-                "release {} already had webcapture {} {}".format(
-                    release_id, raw_timestamp, original_url
-                )
-            )
-            return (None, None)
-
-    wc = static_wayback_webcapture(wayback_url)
-    assert len(wc.cdx) >= 1
-    wc.release_ids = [release_id]
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of static web content from wayback machine",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-    edit = api.create_webcapture(eg.editgroup_id, wc)
-    return (editgroup_id, edit)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--verbose", action="store_true", help="verbose output")
-    parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
-    parser.add_argument(
-        "--json-output",
-        type=argparse.FileType("w"),
-        default=sys.stdout,
-        help="where to write out webcapture entity (as JSON)",
-    )
-    parser.add_argument(
-        "--cdx-output",
-        type=argparse.FileType("w"),
-        default=None,
-        help="(optional) file to write out CDX stub",
-    )
-
-    args = parser.parse_args()
-
-    # entity-to-JSON code; duplicate of entity_to_dict()
-    api_client = ApiClient()
-    wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output)
-    wc_dict = api_client.sanitize_for_serialization(wc)
-    print(json.dumps(wc_dict))
-
-
-if __name__ == "__main__":
-    main()
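Note (not part of this commit): the one-pass multi-hash pattern used by the deleted single_file() helper (MD5, SHA-1, and SHA-256 fed from a single read of each file) is a reusable recipe on its own. A minimal stand-alone sketch follows; the function name and chunk size are illustrative, not anything defined in the fatcat codebase:

import hashlib

def file_hashes(path: str, chunk_size: int = 2 ** 20) -> dict:
    # feed all three digests from a single pass over the file, 1 MiB at a time
    hashes = {"md5": hashlib.md5(), "sha1": hashlib.sha1(), "sha256": hashlib.sha256()}
    with open(path, "rb") as fp:
        while True:
            data = fp.read(chunk_size)
            if not data:
                break
            for h in hashes.values():
                h.update(data)
    return {name: h.hexdigest() for name, h in hashes.items()}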