summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:08:23 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:08:25 -0800
commit ab4e1355bf93e3755985f1b5cd2589a78601d253 (patch)
tree f50ee1492587fead94410e229963b18f88f203a9 /python/fatcat_tools/importers
parent c133f3077aa975aa4706a8e5ca894fc1b71fbc67 (diff)
download fatcat-ab4e1355bf93e3755985f1b5cd2589a78601d253.tar.gz
fatcat-ab4e1355bf93e3755985f1b5cd2589a78601d253.zip
remove cdl_dash_dat and wayback_static importers
Cleaning out dead code. These importers were used to create demonstration fileset and webcapture entities early in development. They have been replaced by the fileset and webcapture ingest importers.
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/__init__.py 2
-rwxr-xr-x python/fatcat_tools/importers/cdl_dash_dat.py 221
-rwxr-xr-x python/fatcat_tools/importers/wayback_static.py 287
3 files changed, 0 insertions, 510 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 06ecfd58..223ae526 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -13,7 +13,6 @@ To run an import you combine two classes; one each of:
from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
from .arxiv import ArxivRawImporter
-from .cdl_dash_dat import auto_cdl_dash_dat
from .chocula import ChoculaImporter
from .common import (
LANG_MAP_MARC,
@@ -55,4 +54,3 @@ from .matched import MatchedImporter
from .orcid import OrcidImporter
from .pubmed import PubmedImporter
from .shadow import ShadowLibraryImporter
-from .wayback_static import auto_wayback_static
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index ec557e15..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
- ApiClient,
- Editgroup,
- FilesetEntity,
- FilesetFile,
- ReleaseAbstract,
- ReleaseContrib,
- ReleaseEntity,
- ReleaseExtIds,
-)
-
-from fatcat_tools.normal import clean_doi
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
- full = prefix + path
- size_bytes = os.stat(full).st_size
-
- hashes = [
- hashlib.md5(),
- hashlib.sha1(),
- hashlib.sha256(),
- ]
- with open(full, "rb") as fp:
- while True:
- data = fp.read(2 ** 20)
- if not data:
- break
- for h in hashes:
- h.update(data)
- mime = magic.Magic(mime=True).from_file(full)
- if mime == "application/octet-stream":
- # magic apparently isn't that great; try using filename as well
- guess = mimetypes.guess_type(full)[0]
- if guess:
- mime = guess
-
- fsf = FilesetFile(
- path=path,
- size=size_bytes,
- md5=hashes[0].hexdigest(),
- sha1=hashes[1].hexdigest(),
- sha256=hashes[2].hexdigest(),
- extra=dict(mimetype=mime),
- )
- return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
- manifest = []
- for root, dirs, files in os.walk(base_dir):
- for f in files:
- manifest.append(single_file(root, f))
- return manifest
-
-
-def cdl_dash_release(
- meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
- if not extra:
- extra = dict()
-
- assert meta["identifier"]["type"] == "DOI"
- doi = clean_doi(meta["identifier"]["value"].lower())
- assert doi and doi.startswith("10.")
-
- ark_id = None
- for extid in meta.get("alternativeIdentifiers", []):
- if extid["value"].startswith("ark:"):
- ark_id = extid["value"]
- assert ark_id
-
- license_slug = lookup_license_slug(meta["rights"]["uri"])
-
- abstracts = []
- for desc in meta["descriptions"]:
- if desc["type"] == "abstract":
- abstracts.append(
- ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
- )
- # print(abstracts)
-
- contribs = []
- for creator in meta["creator"]:
- contribs.append(
- ReleaseContrib(
- given_name=creator["given"],
- surname=creator["family"],
- # sorry everybody
- raw_name="{} {}".format(creator["given"], creator["family"]),
- raw_affiliation=creator.get("affiliation"),
- role="author", # presumably, for these datasets?
- )
- )
-
- r = ReleaseEntity(
- ext_ids=ReleaseExtIds(
- doi=doi,
- ark=ark_id,
- ),
- title=clean(meta["title"], force_xml=True),
- publisher=clean(meta["publisher"]),
- release_year=int(meta["publicationYear"]),
- release_type="dataset",
- license_slug=license_slug,
- contribs=contribs,
- abstracts=abstracts or None,
- extra=extra,
- )
- return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
- if dat_path.endswith("/"):
- dat_path = dat_path[:-1]
- dat_discovery = dat_path
- extra = dict()
- assert len(dat_discovery) == 64
-
- with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
- meta_dict = json.loads(fp.read())
-
- release = cdl_dash_release(meta_dict)
- ark_id = release.extra["ark_id"]
-
- dash_version = None
- # really crude XML parse-out
- with open(dat_path + "/stash-wrapper.xml", "r") as fp:
- for line in fp:
- line = line.strip()
- if line.startswith("<st:version_number>"):
- dash_version = int(line[19:].split("<")[0])
- assert dash_version is not None
- extra["cdl_dash"] = dict(version=dash_version)
- release.extra["cdl_dash"] = dict(version=dash_version)
-
- manifest = make_manifest(dat_path + "/files/")
-
- bundle_url = dict(
- url="https://merritt.cdlib.org/u/{}/{}".format(
- urllib.parse.quote(ark_id, safe=""), dash_version
- ),
- rel="repo-bundle",
- )
- repo_url = dict(
- url="https://merritt.cdlib.org/d/{}/{}/".format(
- urllib.parse.quote(ark_id, safe=""), dash_version
- ),
- rel="repo",
- )
- dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
- fs = FilesetEntity(
- urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
- )
- return (release, fs)
-
-
-def auto_cdl_dash_dat(
- api: ApiClient,
- dat_path: str,
- release_id: Optional[str] = None,
- editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
- git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
- (release, fileset) = make_release_fileset(dat_path)
-
- if not editgroup_id:
- eg = api.create_editgroup(
- Editgroup(
- description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
- extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
- )
- )
- editgroup_id = eg.editgroup_id
-
- if not release_id and release.ext_ids.doi:
- try:
- r = api.lookup_release(doi=release.ext_ids.doi)
- release_id = r.ident
- except fatcat_openapi_client.rest.ApiException:
- pass
- if not release_id:
- edit = api.create_release(eg.editgroup_id, release)
- release_id = edit.ident
-
- release = api.get_release(release_id, expand="filesets")
- if len(release.filesets):
- print("A fileset already exists for release {}".format(release.ident))
- return (None, None, None)
-
- fileset.release_ids = [release.ident]
- edit = api.create_fileset(eg.editgroup_id, fileset)
- fileset = api.get_fileset(edit.ident)
- return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
- # pass this a discovery key that has been cloned to the local directory
- print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
deleted file mode 100755
index 5caed2c7..00000000
--- a/python/fatcat_tools/importers/wayback_static.py
+++ /dev/null
@@ -1,287 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Helpers to create Web Capture entities from extracted wayback content.
-
-Works as a stand-alone script (for debugging) or as library routines.
-"""
-
-import argparse
-import datetime
-import hashlib
-import json
-import subprocess
-import sys
-from typing import Any, Dict, List, Optional, Tuple
-
-import requests
-from bs4 import BeautifulSoup
-from fatcat_openapi_client import (
- ApiClient,
- Editgroup,
- EntityEdit,
- WebcaptureCdxLine,
- WebcaptureEntity,
- WebcaptureUrl,
-)
-
-from .common import b32_hex
-
-CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
-GWB_URL_BASE = "https://web.archive.org/web"
-REQ_SESSION = requests.Session()
-
-
-def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]:
- """Takes a wayback machine URL, and returns a tuple:
-
- (timestamp, datetime, original_url)
- """
- chunks = url.split("/")
- assert len(chunks) >= 6
- assert chunks[2] == "web.archive.org"
- assert chunks[3] == "web"
- return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
-
-
-def test_parse_wbm_url() -> None:
- u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
- assert parse_wbm_url(u) == (
- "20010712114837",
- datetime.datetime(2001, 7, 12, 11, 48, 37),
- "http://www.dlib.org/dlib/june01/reich/06reich.html",
- )
-
-
-def parse_wbm_timestamp(timestamp: str) -> datetime.datetime:
- """
- Takes a complete WBM timestamp string (like "20020327115625") and returns a
- python datetime object (UTC)
- """
- # strip any "im_" or "id_" suffix
- if timestamp.endswith("_"):
- timestamp = timestamp[:-3]
- # inflexible; require the full second-precision timestamp
- assert len(timestamp) == 14
- return datetime.datetime(
- year=int(timestamp[0:4]),
- month=int(timestamp[4:6]),
- day=int(timestamp[6:8]),
- hour=int(timestamp[8:10]),
- minute=int(timestamp[10:12]),
- second=int(timestamp[12:14]),
- )
-
-
-def test_parse_wbm_timestamp() -> None:
- assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
-
-
-def fetch_wbm(url: str) -> bytes:
- resp = REQ_SESSION.get(url)
- resp.raise_for_status()
- assert resp.content
- return resp.content
-
-
-def lookup_cdx(
- embed_url: str, verify_hashes: bool = True, cdx_output: Any = None
-) -> Optional[WebcaptureCdxLine]:
- sys.stderr.write(embed_url + "\n")
- assert embed_url.startswith("/web/")
- embed_url_segments = embed_url.split("/")
- timestamp = embed_url_segments[2]
- if timestamp.endswith("_"):
- timestamp = timestamp[:-3]
- url = "/".join(embed_url_segments[3:])
- # print((timestamp, url))
- params: Dict = dict(
- url=url,
- closest=timestamp,
- sort="closest",
- resolveRevisits="true",
- matchType="exact",
- limit=1,
- )
- resp = REQ_SESSION.get(
- CDX_API_BASE,
- params=params,
- )
- resp.raise_for_status()
- # print(resp.url)
- if resp.content:
- hit = resp.content.decode("utf-8").split("\n")[0]
- if cdx_output:
- cdx_output.write(hit + "\n")
- cdx_chunks = hit.split(" ")
- cdx = [x if (x and x != "-") else None for x in cdx_chunks]
- webcapture_cdx = WebcaptureCdxLine(
- surt=cdx[0],
- timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
- url=cdx[2],
- mimetype=cdx[3],
- status_code=int(cdx[4] or ""),
- sha1=b32_hex(cdx[5] or ""),
- sha256=None,
- )
- if verify_hashes:
- resp = REQ_SESSION.get(
- GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp
- )
- resp.raise_for_status()
- assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
- webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
- webcapture_cdx.size = len(resp.content)
- return webcapture_cdx
- else:
- return None
-
-
-def wayback_url_to_relative(url: str) -> Optional[str]:
- """
- Wayback URLs can be relative or absolute in rewritten documents. This
- function converts any form of rewritten URL to a relative (to
- web.archive.org) one, or returns None if it isn't a rewritten URL at all.
- """
- if url.startswith("https://web.archive.org/"):
- url = url[23:]
- elif url.startswith("http://web.archive.org/"):
- url = url[22:]
-
- if url.startswith("/web/"):
- return url
- else:
- return None
-
-
-def extract_embeds(soup: BeautifulSoup) -> List[str]:
-
- embeds = set()
-
- # <link href="">
- for tag in soup.find_all("link", href=True):
- if tag["rel"] not in ("stylesheet",):
- continue
- url = wayback_url_to_relative(tag["href"])
- if url:
- embeds.add(url)
- # <img src="">
- for tag in soup.find_all("img", src=True):
- url = wayback_url_to_relative(tag["src"])
- if url:
- embeds.add(url)
-
- # <script src="">
- for tag in soup.find_all("script", src=True):
- url = wayback_url_to_relative(tag["src"])
- if url:
- embeds.add(url)
-
- return list(embeds)
-
-
-def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity:
- """
- Given a complete wayback machine capture URL, like:
-
- http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html
-
- Will return a new ("bare") fatcat webcapture entity python object, with all
- the CDX entries filled in.
- """
-
- wbm_html = fetch_wbm(wayback_url)
- raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- # with open(rewritten_path, 'r') as fp:
- # soup = BeautifulSoup(fp, "lxml")
- soup = BeautifulSoup(wbm_html, "lxml")
- embeds = extract_embeds(soup)
- cdx_obj = lookup_cdx(
- "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
- )
- cdx_list = [cdx_obj]
- for url in embeds:
- cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
- cdx_list.append(cdx_obj)
- archive_urls = [
- WebcaptureUrl(
- rel="wayback",
- url="https://web.archive.org/web/",
- )
- ]
- wc = WebcaptureEntity(
- cdx=cdx_list,
- timestamp=timestamp.isoformat() + "Z",
- original_url=original_url,
- archive_urls=archive_urls,
- release_ids=None,
- )
- return wc
-
-
-def auto_wayback_static(
- api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None
-) -> Tuple[Optional[str], Optional[EntityEdit]]:
- """
- Returns a tuple: (editgroup_id, edit). If failed, both are None
- """
-
- raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
- release = api.get_release(release_id, expand="webcaptures")
-
- # check for existing webcapture with same parameters
- for wc in release.webcaptures:
- if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
- # skipping: already existed
- print(
- "release {} already had webcapture {} {}".format(
- release_id, raw_timestamp, original_url
- )
- )
- return (None, None)
-
- wc = static_wayback_webcapture(wayback_url)
- assert len(wc.cdx) >= 1
- wc.release_ids = [release_id]
- if not editgroup_id:
- eg = api.create_editgroup(
- Editgroup(
- description="One-off import of static web content from wayback machine",
- extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
- )
- )
- editgroup_id = eg.editgroup_id
- edit = api.create_webcapture(eg.editgroup_id, wc)
- return (editgroup_id, edit)
-
-
-def main() -> None:
- parser = argparse.ArgumentParser()
- parser.add_argument("--verbose", action="store_true", help="verbose output")
- parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
- parser.add_argument(
- "--json-output",
- type=argparse.FileType("w"),
- default=sys.stdout,
- help="where to write out webcapture entity (as JSON)",
- )
- parser.add_argument(
- "--cdx-output",
- type=argparse.FileType("w"),
- default=None,
- help="(optional) file to write out CDX stub",
- )
-
- args = parser.parse_args()
-
- # entity-to-JSON code; duplicate of entity_to_dict()
- api_client = ApiClient()
- wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output)
- wc_dict = api_client.sanitize_for_serialization(wc)
- print(json.dumps(wc_dict))
-
-
-if __name__ == "__main__":
- main()