summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/__init__.py2
-rwxr-xr-xpython/fatcat_tools/importers/cdl_dash_dat.py221
-rwxr-xr-xpython/fatcat_tools/importers/wayback_static.py287
3 files changed, 0 insertions, 510 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 06ecfd58..223ae526 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -13,7 +13,6 @@ To run an import you combine two classes; one each of:
from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
from .arxiv import ArxivRawImporter
-from .cdl_dash_dat import auto_cdl_dash_dat
from .chocula import ChoculaImporter
from .common import (
LANG_MAP_MARC,
@@ -55,4 +54,3 @@ from .matched import MatchedImporter
from .orcid import OrcidImporter
from .pubmed import PubmedImporter
from .shadow import ShadowLibraryImporter
-from .wayback_static import auto_wayback_static
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index ec557e15..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
- ApiClient,
- Editgroup,
- FilesetEntity,
- FilesetFile,
- ReleaseAbstract,
- ReleaseContrib,
- ReleaseEntity,
- ReleaseExtIds,
-)
-
-from fatcat_tools.normal import clean_doi
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
- full = prefix + path
- size_bytes = os.stat(full).st_size
-
- hashes = [
- hashlib.md5(),
- hashlib.sha1(),
- hashlib.sha256(),
- ]
- with open(full, "rb") as fp:
- while True:
- data = fp.read(2 ** 20)
- if not data:
- break
- for h in hashes:
- h.update(data)
- mime = magic.Magic(mime=True).from_file(full)
- if mime == "application/octet-stream":
- # magic apparently isn't that great; try using filename as well
- guess = mimetypes.guess_type(full)[0]
- if guess:
- mime = guess
-
- fsf = FilesetFile(
- path=path,
- size=size_bytes,
- md5=hashes[0].hexdigest(),
- sha1=hashes[1].hexdigest(),
- sha256=hashes[2].hexdigest(),
- extra=dict(mimetype=mime),
- )
- return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
- manifest = []
- for root, dirs, files in os.walk(base_dir):
- for f in files:
- manifest.append(single_file(root, f))
- return manifest
-
-
-def cdl_dash_release(
- meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
- if not extra:
- extra = dict()
-
- assert meta["identifier"]["type"] == "DOI"
- doi = clean_doi(meta["identifier"]["value"].lower())
- assert doi and doi.startswith("10.")
-
- ark_id = None
- for extid in meta.get("alternativeIdentifiers", []):
- if extid["value"].startswith("ark:"):
- ark_id = extid["value"]
- assert ark_id
-
- license_slug = lookup_license_slug(meta["rights"]["uri"])
-
- abstracts = []
- for desc in meta["descriptions"]:
- if desc["type"] == "abstract":
- abstracts.append(
- ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
- )
- # print(abstracts)
-
- contribs = []
- for creator in meta["creator"]:
- contribs.append(
- ReleaseContrib(
- given_name=creator["given"],
- surname=creator["family"],
- # sorry everybody
- raw_name="{} {}".format(creator["given"], creator["family"]),
- raw_affiliation=creator.get("affiliation"),
- role="author", # presumably, for these datasets?
- )
- )
-
- r = ReleaseEntity(
- ext_ids=ReleaseExtIds(
- doi=doi,
- ark=ark_id,
- ),
- title=clean(meta["title"], force_xml=True),
- publisher=clean(meta["publisher"]),
- release_year=int(meta["publicationYear"]),
- release_type="dataset",
- license_slug=license_slug,
- contribs=contribs,
- abstracts=abstracts or None,
- extra=extra,
- )
- return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
- if dat_path.endswith("/"):
- dat_path = dat_path[:-1]
- dat_discovery = dat_path
- extra = dict()
- assert len(dat_discovery) == 64
-
- with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
- meta_dict = json.loads(fp.read())
-
- release = cdl_dash_release(meta_dict)
- ark_id = release.extra["ark_id"]
-
- dash_version = None
- # really crude XML parse-out
- with open(dat_path + "/stash-wrapper.xml", "r") as fp:
- for line in fp:
- line = line.strip()
- if line.startswith("<st:version_number>"):
- dash_version = int(line[19:].split("<")[0])
- assert dash_version is not None
- extra["cdl_dash"] = dict(version=dash_version)
- release.extra["cdl_dash"] = dict(version=dash_version)
-
- manifest = make_manifest(dat_path + "/files/")
-
- bundle_url = dict(
- url="https://merritt.cdlib.org/u/{}/{}".format(
- urllib.parse.quote(ark_id, safe=""), dash_version
- ),
- rel="repo-bundle",
- )
- repo_url = dict(
- url="https://merritt.cdlib.org/d/{}/{}/".format(
- urllib.parse.quote(ark_id, safe=""), dash_version
- ),
- rel="repo",
- )
- dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
- fs = FilesetEntity(
- urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
- )
- return (release, fs)
-
-
-def auto_cdl_dash_dat(
- api: ApiClient,
- dat_path: str,
- release_id: Optional[str] = None,
- editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
- git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
- (release, fileset) = make_release_fileset(dat_path)
-
- if not editgroup_id:
- eg = api.create_editgroup(
- Editgroup(
- description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
- extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
- )
- )
- editgroup_id = eg.editgroup_id
-
- if not release_id and release.ext_ids.doi:
- try:
- r = api.lookup_release(doi=release.ext_ids.doi)
- release_id = r.ident
- except fatcat_openapi_client.rest.ApiException:
- pass
- if not release_id:
- edit = api.create_release(eg.editgroup_id, release)
- release_id = edit.ident
-
- release = api.get_release(release_id, expand="filesets")
- if len(release.filesets):
- print("A fileset already exists for release {}".format(release.ident))
- return (None, None, None)
-
- fileset.release_ids = [release.ident]
- edit = api.create_fileset(eg.editgroup_id, fileset)
- fileset = api.get_fileset(edit.ident)
- return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
- # pass this a discovery key that has been cloned to the local directory
- print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
deleted file mode 100755
index 5caed2c7..00000000
--- a/python/fatcat_tools/importers/wayback_static.py
+++ /dev/null
@@ -1,287 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Helpers to create Web Capture entities from extracted wayback content.
-
-Works as a stand-alone script (for debugging) or as library routines.
-"""
-
-import argparse
-import datetime
-import hashlib
-import json
-import subprocess
-import sys
-from typing import Any, Dict, List, Optional, Tuple
-
-import requests
-from bs4 import BeautifulSoup
-from fatcat_openapi_client import (
- ApiClient,
- Editgroup,
- EntityEdit,
- WebcaptureCdxLine,
- WebcaptureEntity,
- WebcaptureUrl,
-)
-
-from .common import b32_hex
-
-CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
-GWB_URL_BASE = "https://web.archive.org/web"
-REQ_SESSION = requests.Session()
-
-
-def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]:
- """Takes a wayback machine URL, and returns a tuple:
-
- (timestamp, datetime, original_url)
- """
- chunks = url.split("/")
- assert len(chunks) >= 6
- assert chunks[2] == "web.archive.org"
- assert chunks[3] == "web"
- return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
-
-
-def test_parse_wbm_url() -> None:
- u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
- assert parse_wbm_url(u) == (
- "20010712114837",
- datetime.datetime(2001, 7, 12, 11, 48, 37),
- "http://www.dlib.org/dlib/june01/reich/06reich.html",
- )
-
-
-def parse_wbm_timestamp(timestamp: str) -> datetime.datetime:
- """
- Takes a complete WBM timestamp string (like "20020327115625") and returns a
- python datetime object (UTC)
- """
- # strip any "im_" or "id_" suffix
- if timestamp.endswith("_"):
- timestamp = timestamp[:-3]
- # inflexible; require the full second-precision timestamp
- assert len(timestamp) == 14
- return datetime.datetime(
- year=int(timestamp[0:4]),
- month=int(timestamp[4:6]),
- day=int(timestamp[6:8]),
- hour=int(timestamp[8:10]),
- minute=int(timestamp[10:12]),
- second=int(timestamp[12:14]),
- )
-
-
-def test_parse_wbm_timestamp() -> None:
- assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
-
-
-def fetch_wbm(url: str) -> bytes:
- resp = REQ_SESSION.get(url)
- resp.raise_for_status()
- assert resp.content
- return resp.content
-
-
-def lookup_cdx(
- embed_url: str, verify_hashes: bool = True, cdx_output: Any = None
-) -> Optional[WebcaptureCdxLine]:
- sys.stderr.write(embed_url + "\n")
- assert embed_url.startswith("/web/")
- embed_url_segments = embed_url.split("/")
- timestamp = embed_url_segments[2]
- if timestamp.endswith("_"):
- timestamp = timestamp[:-3]
- url = "/".join(embed_url_segments[3:])
- # print((timestamp, url))
- params: Dict = dict(
- url=url,
- closest=timestamp,
- sort="closest",
- resolveRevisits="true",
- matchType="exact",
- limit=1,
- )
- resp = REQ_SESSION.get(
- CDX_API_BASE,
- params=params,
- )
- resp.raise_for_status()
- # print(resp.url)
- if resp.content:
- hit = resp.content.decode("utf-8").split("\n")[0]
- if cdx_output:
- cdx_output.write(hit + "\n")
- cdx_chunks = hit.split(" ")
- cdx = [x if (x and x != "-") else None for x in cdx_chunks]
- webcapture_cdx = WebcaptureCdxLine(
- surt=cdx[0],
- timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
- url=cdx[2],
- mimetype=cdx[3],
- status_code=int(cdx[4] or ""),
- sha1=b32_hex(cdx[5] or ""),
- sha256=None,
- )
- if verify_hashes:
- resp = REQ_SESSION.get(
- GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp
- )
- resp.raise_for_status()
- assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
- webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
- webcapture_cdx.size = len(resp.content)
- return webcapture_cdx
- else:
- return None
-
-
-def wayback_url_to_relative(url: str) -> Optional[str]:
- """
- Wayback URLs can be relative or absolute in rewritten documents. This
- function converts any form of rewritten URL to a relative (to
- web.archive.org) one, or returns None if it isn't a rewritten URL at all.
- """
- if url.startswith("https://web.archive.org/"):
- url = url[23:]
- elif url.startswith("http://web.archive.org/"):
- url = url[22:]
-
- if url.startswith("/web/"):
- return url
- else:
- return None
-
-
-def extract_embeds(soup: BeautifulSoup) -> List[str]:
-
- embeds = set()
-
- # <link href="">
- for tag in soup.find_all("link", href=True):
- if tag["rel"] not in ("stylesheet",):
- continue
- url = wayback_url_to_relative(tag["href"])
- if url:
- embeds.add(url)
- # <img src="">
- for tag in soup.find_all("img", src=True):
- url = wayback_url_to_relative(tag["src"])
- if url:
- embeds.add(url)
-
- # <script src="">
- for tag in soup.find_all("script", src=True):
- url = wayback_url_to_relative(tag["src"])
- if url:
- embeds.add(url)
-
- return list(embeds)
-
-
-def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity:
- """
- Given a complete wayback machine capture URL, like:
-
- http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html
-
- Will return a new ("bare") fatcat webcapture entity python object, with all
- the CDX entries filled in.
- """
-
- wbm_html = fetch_wbm(wayback_url)
- raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- # with open(rewritten_path, 'r') as fp:
- # soup = BeautifulSoup(fp, "lxml")
- soup = BeautifulSoup(wbm_html, "lxml")
- embeds = extract_embeds(soup)
- cdx_obj = lookup_cdx(
- "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
- )
- cdx_list = [cdx_obj]
- for url in embeds:
- cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
- cdx_list.append(cdx_obj)
- archive_urls = [
- WebcaptureUrl(
- rel="wayback",
- url="https://web.archive.org/web/",
- )
- ]
- wc = WebcaptureEntity(
- cdx=cdx_list,
- timestamp=timestamp.isoformat() + "Z",
- original_url=original_url,
- archive_urls=archive_urls,
- release_ids=None,
- )
- return wc
-
-
-def auto_wayback_static(
- api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None
-) -> Tuple[Optional[str], Optional[EntityEdit]]:
- """
- Returns a tuple: (editgroup_id, edit). If failed, both are None
- """
-
- raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
- release = api.get_release(release_id, expand="webcaptures")
-
- # check for existing webcapture with same parameters
- for wc in release.webcaptures:
- if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
- # skipping: already existed
- print(
- "release {} already had webcapture {} {}".format(
- release_id, raw_timestamp, original_url
- )
- )
- return (None, None)
-
- wc = static_wayback_webcapture(wayback_url)
- assert len(wc.cdx) >= 1
- wc.release_ids = [release_id]
- if not editgroup_id:
- eg = api.create_editgroup(
- Editgroup(
- description="One-off import of static web content from wayback machine",
- extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
- )
- )
- editgroup_id = eg.editgroup_id
- edit = api.create_webcapture(eg.editgroup_id, wc)
- return (editgroup_id, edit)
-
-
-def main() -> None:
- parser = argparse.ArgumentParser()
- parser.add_argument("--verbose", action="store_true", help="verbose output")
- parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
- parser.add_argument(
- "--json-output",
- type=argparse.FileType("w"),
- default=sys.stdout,
- help="where to write out webcapture entity (as JSON)",
- )
- parser.add_argument(
- "--cdx-output",
- type=argparse.FileType("w"),
- default=None,
- help="(optional) file to write out CDX stub",
- )
-
- args = parser.parse_args()
-
- # entity-to-JSON code; duplicate of entity_to_dict()
- api_client = ApiClient()
- wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output)
- wc_dict = api_client.sanitize_for_serialization(wc)
- print(json.dumps(wc_dict))
-
-
-if __name__ == "__main__":
- main()