Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/__init__.py       |   2
-rwxr-xr-x | python/fatcat_tools/importers/cdl_dash_dat.py   | 221
-rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 287
3 files changed, 0 insertions, 510 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 06ecfd58..223ae526 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -13,7 +13,6 @@ To run an import you combine two classes; one each of:
 from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
 from .arxiv import ArxivRawImporter
-from .cdl_dash_dat import auto_cdl_dash_dat
 from .chocula import ChoculaImporter
 from .common import (
     LANG_MAP_MARC,
@@ -55,4 +54,3 @@ from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .pubmed import PubmedImporter
 from .shadow import ShadowLibraryImporter
-from .wayback_static import auto_wayback_static
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index ec557e15..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    FilesetEntity,
-    FilesetFile,
-    ReleaseAbstract,
-    ReleaseContrib,
-    ReleaseEntity,
-    ReleaseExtIds,
-)
-
-from fatcat_tools.normal import clean_doi
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
-    full = prefix + path
-    size_bytes = os.stat(full).st_size
-
-    hashes = [
-        hashlib.md5(),
-        hashlib.sha1(),
-        hashlib.sha256(),
-    ]
-    with open(full, "rb") as fp:
-        while True:
-            data = fp.read(2 ** 20)
-            if not data:
-                break
-            for h in hashes:
-                h.update(data)
-    mime = magic.Magic(mime=True).from_file(full)
-    if mime == "application/octet-stream":
-        # magic apparently isn't that great; try using filename as well
-        guess = mimetypes.guess_type(full)[0]
-        if guess:
-            mime = guess
-
-    fsf = FilesetFile(
-        path=path,
-        size=size_bytes,
-        md5=hashes[0].hexdigest(),
-        sha1=hashes[1].hexdigest(),
-        sha256=hashes[2].hexdigest(),
-        extra=dict(mimetype=mime),
-    )
-    return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
-    manifest = []
-    for root, dirs, files in os.walk(base_dir):
-        for f in files:
-            manifest.append(single_file(root, f))
-    return manifest
-
-
-def cdl_dash_release(
-    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
-    if not extra:
-        extra = dict()
-
-    assert meta["identifier"]["type"] == "DOI"
-    doi = clean_doi(meta["identifier"]["value"].lower())
-    assert doi and doi.startswith("10.")
-
-    ark_id = None
-    for extid in meta.get("alternativeIdentifiers", []):
-        if extid["value"].startswith("ark:"):
-            ark_id = extid["value"]
-    assert ark_id
-
-    license_slug = lookup_license_slug(meta["rights"]["uri"])
-
-    abstracts = []
-    for desc in meta["descriptions"]:
-        if desc["type"] == "abstract":
-            abstracts.append(
-                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
-            )
-    # print(abstracts)
-
-    contribs = []
-    for creator in meta["creator"]:
-        contribs.append(
-            ReleaseContrib(
-                given_name=creator["given"],
-                surname=creator["family"],
-                # sorry everybody
-                raw_name="{} {}".format(creator["given"], creator["family"]),
-                raw_affiliation=creator.get("affiliation"),
-                role="author",  # presumably, for these datasets?
-            )
-        )
-
-    r = ReleaseEntity(
-        ext_ids=ReleaseExtIds(
-            doi=doi,
-            ark=ark_id,
-        ),
-        title=clean(meta["title"], force_xml=True),
-        publisher=clean(meta["publisher"]),
-        release_year=int(meta["publicationYear"]),
-        release_type="dataset",
-        license_slug=license_slug,
-        contribs=contribs,
-        abstracts=abstracts or None,
-        extra=extra,
-    )
-    return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
-    if dat_path.endswith("/"):
-        dat_path = dat_path[:-1]
-    dat_discovery = dat_path
-    extra = dict()
-    assert len(dat_discovery) == 64
-
-    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
-        meta_dict = json.loads(fp.read())
-
-    release = cdl_dash_release(meta_dict)
-    ark_id = release.extra["ark_id"]
-
-    dash_version = None
-    # really crude XML parse-out
-    with open(dat_path + "/stash-wrapper.xml", "r") as fp:
-        for line in fp:
-            line = line.strip()
-            if line.startswith("<st:version_number>"):
-                dash_version = int(line[19:].split("<")[0])
-    assert dash_version is not None
-    extra["cdl_dash"] = dict(version=dash_version)
-    release.extra["cdl_dash"] = dict(version=dash_version)
-
-    manifest = make_manifest(dat_path + "/files/")
-
-    bundle_url = dict(
-        url="https://merritt.cdlib.org/u/{}/{}".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo-bundle",
-    )
-    repo_url = dict(
-        url="https://merritt.cdlib.org/d/{}/{}/".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo",
-    )
-    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
-    fs = FilesetEntity(
-        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
-    )
-    return (release, fs)
-
-
-def auto_cdl_dash_dat(
-    api: ApiClient,
-    dat_path: str,
-    release_id: Optional[str] = None,
-    editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    (release, fileset) = make_release_fileset(dat_path)
-
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-
-    if not release_id and release.ext_ids.doi:
-        try:
-            r = api.lookup_release(doi=release.ext_ids.doi)
-            release_id = r.ident
-        except fatcat_openapi_client.rest.ApiException:
-            pass
-    if not release_id:
-        edit = api.create_release(eg.editgroup_id, release)
-        release_id = edit.ident
-
-    release = api.get_release(release_id, expand="filesets")
-    if len(release.filesets):
-        print("A fileset already exists for release {}".format(release.ident))
-        return (None, None, None)
-
-    fileset.release_ids = [release.ident]
-    edit = api.create_fileset(eg.editgroup_id, fileset)
-    fileset = api.get_fileset(edit.ident)
-    return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
-    # pass this a discovery key that has been cloned to the local directory
-    print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
deleted file mode 100755
index 5caed2c7..00000000
--- a/python/fatcat_tools/importers/wayback_static.py
+++ /dev/null
@@ -1,287 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Helpers to create Web Capture entities from extracted wayback content.
-
-Works as a stand-alone script (for debugging) or as library routines.
-"""
-
-import argparse
-import datetime
-import hashlib
-import json
-import subprocess
-import sys
-from typing import Any, Dict, List, Optional, Tuple
-
-import requests
-from bs4 import BeautifulSoup
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    EntityEdit,
-    WebcaptureCdxLine,
-    WebcaptureEntity,
-    WebcaptureUrl,
-)
-
-from .common import b32_hex
-
-CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
-GWB_URL_BASE = "https://web.archive.org/web"
-REQ_SESSION = requests.Session()
-
-
-def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]:
-    """Takes a wayback machine URL, and returns a tuple:
-
-    (timestamp, datetime, original_url)
-    """
-    chunks = url.split("/")
-    assert len(chunks) >= 6
-    assert chunks[2] == "web.archive.org"
-    assert chunks[3] == "web"
-    return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
-
-
-def test_parse_wbm_url() -> None:
-    u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
-    assert parse_wbm_url(u) == (
-        "20010712114837",
-        datetime.datetime(2001, 7, 12, 11, 48, 37),
-        "http://www.dlib.org/dlib/june01/reich/06reich.html",
-    )
-
-
-def parse_wbm_timestamp(timestamp: str) -> datetime.datetime:
-    """
-    Takes a complete WBM timestamp string (like "20020327115625") and returns a
-    python datetime object (UTC)
-    """
-    # strip any "im_" or "id_" suffix
-    if timestamp.endswith("_"):
-        timestamp = timestamp[:-3]
-    # inflexible; require the full second-precision timestamp
-    assert len(timestamp) == 14
-    return datetime.datetime(
-        year=int(timestamp[0:4]),
-        month=int(timestamp[4:6]),
-        day=int(timestamp[6:8]),
-        hour=int(timestamp[8:10]),
-        minute=int(timestamp[10:12]),
-        second=int(timestamp[12:14]),
-    )
-
-
-def test_parse_wbm_timestamp() -> None:
-    assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
-
-
-def fetch_wbm(url: str) -> bytes:
-    resp = REQ_SESSION.get(url)
-    resp.raise_for_status()
-    assert resp.content
-    return resp.content
-
-
-def lookup_cdx(
-    embed_url: str, verify_hashes: bool = True, cdx_output: Any = None
-) -> Optional[WebcaptureCdxLine]:
-    sys.stderr.write(embed_url + "\n")
-    assert embed_url.startswith("/web/")
-    embed_url_segments = embed_url.split("/")
-    timestamp = embed_url_segments[2]
-    if timestamp.endswith("_"):
-        timestamp = timestamp[:-3]
-    url = "/".join(embed_url_segments[3:])
-    # print((timestamp, url))
-    params: Dict = dict(
-        url=url,
-        closest=timestamp,
-        sort="closest",
-        resolveRevisits="true",
-        matchType="exact",
-        limit=1,
-    )
-    resp = REQ_SESSION.get(
-        CDX_API_BASE,
-        params=params,
-    )
-    resp.raise_for_status()
-    # print(resp.url)
-    if resp.content:
-        hit = resp.content.decode("utf-8").split("\n")[0]
-        if cdx_output:
-            cdx_output.write(hit + "\n")
-        cdx_chunks = hit.split(" ")
-        cdx = [x if (x and x != "-") else None for x in cdx_chunks]
-        webcapture_cdx = WebcaptureCdxLine(
-            surt=cdx[0],
-            timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
-            url=cdx[2],
-            mimetype=cdx[3],
-            status_code=int(cdx[4] or ""),
-            sha1=b32_hex(cdx[5] or ""),
-            sha256=None,
-        )
-        if verify_hashes:
-            resp = REQ_SESSION.get(
-                GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url)  # raw timestamp
-            )
-            resp.raise_for_status()
-            assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
-            webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
-            webcapture_cdx.size = len(resp.content)
-        return webcapture_cdx
-    else:
-        return None
-
-
-def wayback_url_to_relative(url: str) -> Optional[str]:
-    """
-    Wayback URLs can be relative or absolute in rewritten documents. This
-    function converts any form of rewritten URL to a relative (to
-    web.archive.org) one, or returns None if it isn't a rewritten URL at all.
-    """
-    if url.startswith("https://web.archive.org/"):
-        url = url[23:]
-    elif url.startswith("http://web.archive.org/"):
-        url = url[22:]
-
-    if url.startswith("/web/"):
-        return url
-    else:
-        return None
-
-
-def extract_embeds(soup: BeautifulSoup) -> List[str]:
-
-    embeds = set()
-
-    # <link href="">
-    for tag in soup.find_all("link", href=True):
-        if tag["rel"] not in ("stylesheet",):
-            continue
-        url = wayback_url_to_relative(tag["href"])
-        if url:
-            embeds.add(url)
-    # <img src="">
-    for tag in soup.find_all("img", src=True):
-        url = wayback_url_to_relative(tag["src"])
-        if url:
-            embeds.add(url)
-
-    # <script src="">
-    for tag in soup.find_all("script", src=True):
-        url = wayback_url_to_relative(tag["src"])
-        if url:
-            embeds.add(url)
-
-    return list(embeds)
-
-
-def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity:
-    """
-    Given a complete wayback machine capture URL, like:
-
-    http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html
-
-    Will return a new ("bare") fatcat webcapture entity python object, with all
-    the CDX entries filled in.
-    """
-
-    wbm_html = fetch_wbm(wayback_url)
-    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    # with open(rewritten_path, 'r') as fp:
-    #    soup = BeautifulSoup(fp, "lxml")
-    soup = BeautifulSoup(wbm_html, "lxml")
-    embeds = extract_embeds(soup)
-    cdx_obj = lookup_cdx(
-        "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
-    )
-    cdx_list = [cdx_obj]
-    for url in embeds:
-        cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
-        cdx_list.append(cdx_obj)
-    archive_urls = [
-        WebcaptureUrl(
-            rel="wayback",
-            url="https://web.archive.org/web/",
-        )
-    ]
-    wc = WebcaptureEntity(
-        cdx=cdx_list,
-        timestamp=timestamp.isoformat() + "Z",
-        original_url=original_url,
-        archive_urls=archive_urls,
-        release_ids=None,
-    )
-    return wc
-
-
-def auto_wayback_static(
-    api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None
-) -> Tuple[Optional[str], Optional[EntityEdit]]:
-    """
-    Returns a tuple: (editgroup_id, edit). If failed, both are None
-    """
-
-    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    release = api.get_release(release_id, expand="webcaptures")
-
-    # check for existing webcapture with same parameters
-    for wc in release.webcaptures:
-        if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
-            # skipping: already existed
-            print(
-                "release {} already had webcapture {} {}".format(
-                    release_id, raw_timestamp, original_url
-                )
-            )
-            return (None, None)
-
-    wc = static_wayback_webcapture(wayback_url)
-    assert len(wc.cdx) >= 1
-    wc.release_ids = [release_id]
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of static web content from wayback machine",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-    edit = api.create_webcapture(eg.editgroup_id, wc)
-    return (editgroup_id, edit)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--verbose", action="store_true", help="verbose output")
-    parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
-    parser.add_argument(
-        "--json-output",
-        type=argparse.FileType("w"),
-        default=sys.stdout,
-        help="where to write out webcapture entity (as JSON)",
-    )
-    parser.add_argument(
-        "--cdx-output",
-        type=argparse.FileType("w"),
-        default=None,
-        help="(optional) file to write out CDX stub",
-    )
-
-    args = parser.parse_args()
-
-    # entity-to-JSON code; duplicate of entity_to_dict()
-    api_client = ApiClient()
-    wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output)
-    wc_dict = api_client.sanitize_for_serialization(wc)
-    print(json.dumps(wc_dict))
-
-
-if __name__ == "__main__":
-    main()