Diffstat (limited to 'python')
-rwxr-xr-x  python/fatcat_import.py                            48
-rw-r--r--  python/fatcat_tools/importers/__init__.py           1
-rwxr-xr-x  python/fatcat_tools/importers/wayback_static.py   236
3 files changed, 285 insertions, 0 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 8090900f..ce5063de 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -38,6 +38,36 @@ def run_grobid_metadata(args):
         bezerk_mode=args.bezerk_mode)
     LinePusher(fmi, args.tsv_file).run()
 
+def run_wayback_static(args):
+    api = args.api
+
+    # find the release
+    if args.release_id:
+        release_id = args.release_id
+    elif args.extid:
+        idtype = args.extid.split(':')[0]
+        extid = ':'.join(args.extid.split(':')[1:])
+        if idtype == "doi":
+            release_id = api.lookup_release(doi=extid).ident
+        elif idtype == "pmid":
+            release_id = api.lookup_release(pmid=extid).ident
+        elif idtype == "wikidata":
+            release_id = api.lookup_release(wikidata_qid=extid).ident
+        else:
+            raise NotImplementedError("extid type: {}".format(idtype))
+    else:
+        raise Exception("need either release_id or extid argument")
+
+    # create it
+    (editgroup_id, wc) = auto_wayback_static(api, release_id, args.wayback_url,
+        editgroup_id=args.editgroup_id)
+    if not wc:
+        return
+    print("release_id: {}".format(release_id))
+    print("editgroup_id: {}".format(editgroup_id))
+    print("edit id: {}".format(wc.ident))
+    print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--debug',
@@ -126,6 +156,24 @@ def main():
         action='store_true',
         help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
 
+    sub_wayback_static = subparsers.add_parser('wayback-static')
+    sub_wayback_static.set_defaults(
+        func=run_wayback_static,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+    sub_wayback_static.add_argument('wayback_url',
+        type=str,
+        help="URL of wayback capture to extract from")
+    sub_wayback_static.add_argument('--extid',
+        type=str,
+        help="external identifier for release lookup")
+    sub_wayback_static.add_argument('--release-id',
+        type=str,
+        help="release entity identifier")
+    sub_wayback_static.add_argument('--editgroup-id',
+        type=str,
+        help="use existing editgroup (instead of creating a new one)")
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 70f38f5b..fe3db59d 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -18,5 +18,6 @@ from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
 from .orcid import OrcidImporter
+from .wayback_static import auto_wayback_static
 #from .kafka_source import KafkaSource
 #from .file_source import FileSource
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
new file mode 100755
index 00000000..114920f7
--- /dev/null
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+
+"""
+Helpers to create Web Capture entities from extracted wayback content.
+
+Works as a stand-alone script (for debugging) or as library routines.
+""" + +import sys +import json +import base64 +import hashlib +import requests +import datetime +import argparse +import subprocess +from bs4 import BeautifulSoup + +from fatcat_client import * + +CDX_API_BASE = "https://web.archive.org/cdx/search/cdx" +GWB_URL_BASE = "https://web.archive.org/web" +REQ_SESSION = requests.Session() + + +def b32_hex(s): +    """copy/pasta from elsewhere""" +    s = s.strip().split()[0].lower() +    if s.startswith("sha1:"): +        s = s[5:] +    if len(s) != 32: +        return s +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + +def parse_wbm_url(url): +    """Takes a wayback machine URL, and returns a tuple: + +        (timestamp, datetime, original_url) +    """ +    chunks = url.split('/') +    assert len(chunks) >= 6 +    assert chunks[2] == 'web.archive.org' +    assert chunks[3] == 'web' +    return (chunks[4], +            parse_wbm_timestamp(chunks[4]), +            '/'.join(chunks[5:])) + +def test_parse_wbm_url(): +    u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" +    assert parse_wbm_url(u) == ( +        "20010712114837", +        datetime.datetime(2001, 7, 12, 11, 48, 37), +        "http://www.dlib.org/dlib/june01/reich/06reich.html") + +def parse_wbm_timestamp(timestamp): +    """ +    Takes a complete WBM timestamp string (like "20020327115625") and returns a +    python datetime object (UTC) +    """ +    # strip any "im_" or "id_" suffix +    if timestamp.endswith('_'): +        timestamp = timestamp[:-3] +    # inflexible; require the full second-precision timestamp +    assert len(timestamp) == 14 +    return datetime.datetime( +        year=int(timestamp[0:4]), +        month=int(timestamp[4:6]), +        day=int(timestamp[6:8]), +        hour=int(timestamp[8:10]), +        minute=int(timestamp[10:12]), +        second=int(timestamp[12:14])) + +def test_parse_wbm_timestamp(): +    assert parse_wbm_timestamp("20010712114837") == \ +        datetime.datetime(2001, 7, 12, 11, 48, 37) + +def fetch_wbm(url): +    resp = REQ_SESSION.get(url) +    resp.raise_for_status() +    assert resp.content +    return resp.content + +def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): +    assert embed_url.startswith('/web/') +    embed_url = embed_url.split('/') +    timestamp = embed_url[2] +    if timestamp.endswith('_'): +        timestamp = timestamp[:-3] +    url = '/'.join(embed_url[3:]) +    #print((timestamp, url)) +    resp = REQ_SESSION.get(CDX_API_BASE, params=dict( +        url=url, +        closest=timestamp, +        sort="closest", +        resolveRevisits="true", +        matchType="exact", +        limit=1, +    )) +    resp.raise_for_status() +    #print(resp.url) +    if resp.content: +        hit = resp.content.decode('utf-8').split('\n')[0] +        if cdx_output: +            cdx_output.write(hit + "\n") +        cdx = hit.split(' ') +        cdx = [x if (x and x != '-') else None for x in cdx] +        webcapture_cdx = WebcaptureEntityCdx( +            surt=cdx[0], +            timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z", +            url=cdx[2], +            mimetype=cdx[3], +            status_code=(cdx[4] and int(cdx[4])) or None, +            sha1=b32_hex(cdx[5]), +            sha256=None, +        ) +        if verify_hashes: +            resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format( +                cdx[1], # raw timestamp +                webcapture_cdx.url)) +            resp.raise_for_status() +     
+            assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
+            webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
+        return webcapture_cdx
+    else:
+        return None
+
+def extract_embeds(soup):
+
+    embeds = set()
+
+    # <link href="">
+    for tag in soup.find_all('link', href=True):
+        if tag['href'].startswith('/web/'):
+            embeds.add(tag['href'])
+    # <img src="">
+    for tag in soup.find_all('img', src=True):
+        if tag['src'].startswith('/web/'):
+            embeds.add(tag['src'])
+    # <script src="">
+    for tag in soup.find_all('script', src=True):
+        if tag['src'].startswith('/web/'):
+            embeds.add(tag['src'])
+
+    return list(embeds)
+
+def static_wayback_webcapture(wayback_url, cdx_output=None):
+    """
+    Given a complete wayback machine capture URL, like:
+
+        http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html
+
+    Will return a new ("bare") fatcat webcapture entity python object, with all
+    the CDX entries filled in.
+    """
+
+    wbm_html = fetch_wbm(wayback_url)
+    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
+    #with open(rewritten_path, 'r') as fp:
+    #    soup = BeautifulSoup(fp, "lxml")
+    soup = BeautifulSoup(wbm_html, "lxml")
+    embeds = extract_embeds(soup)
+    cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url),
+        cdx_output=cdx_output)
+    cdx_list = [cdx_obj]
+    for url in embeds:
+        cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
+        cdx_list.append(cdx_obj)
+    archive_urls = [WebcaptureEntityArchiveUrls(
+        rel="wayback",
+        url="https://web.archive.org/web/",
+    )]
+    wc = WebcaptureEntity(
+        cdx=cdx_list,
+        timestamp=timestamp.isoformat() + "Z",
+        original_url=original_url,
+        archive_urls=archive_urls,
+        release_ids=None)
+    return wc
+
+def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
+    """
+    Returns a tuple: (editgroup_id, edit). If failed, both are None
+    """
+
+    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
+    git_rev = subprocess.check_output(
+        ["git", "describe", "--always"]).strip().decode('utf-8')
+
+    release = api.get_release(release_id, expand="webcaptures")
+
+    # check for existing webcapture with same parameters
+    for wc in release.webcaptures:
+        if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
+            # skipping: already existed
+            print("release {} already had webcapture {} {}".format(
+                release_id, raw_timestamp, original_url))
+            return (None, None)
+
+    wc = static_wayback_webcapture(wayback_url)
+    assert len(wc.cdx) >= 1
+    wc.release_ids = [release_id]
+    if not editgroup_id:
+        eg = api.create_editgroup(Editgroup(
+            description="One-off import of static web content from wayback machine",
+            extra=dict(
+                git_rev=git_rev,
+                agent="fatcat_tools.auto_wayback_static")))
+        editgroup_id = eg.editgroup_id
+    edit = api.create_webcapture(wc, editgroup_id=editgroup_id)
+    return (editgroup_id, edit)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--verbose',
+        action='store_true',
+        help="verbose output")
+    parser.add_argument('wayback_url',
+        type=str,
+        help="URL of wayback capture to extract from")
+    parser.add_argument('--json-output',
+        type=argparse.FileType('w'), default=sys.stdout,
+        help="where to write out webcapture entity (as JSON)")
+    parser.add_argument('--cdx-output',
+        type=argparse.FileType('w'), default=None,
+        help="(optional) file to write out CDX stub")
+
+    args = parser.parse_args()
+
+    # entity-to-JSON code; duplicate of entity_to_dict()
+    api_client = ApiClient()
+    wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output)
+    wc_dict = api_client.sanitize_for_serialization(wc)
+    print(json.dumps(wc_dict))
+
+if __name__ == '__main__':
+    main()
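With this applied, a one-off import can be kicked off from the command line. A hedged sketch of an invocation (the wayback URL comes from the test case in the diff; the DOI value is an illustrative placeholder, and the auth token is presumably read from the FATCAT_API_AUTH_TOKEN environment variable named in the set_defaults() call above):

    ./fatcat_import.py wayback-static \
        --extid doi:10.1045/june2001-reich \
        "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"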
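Used as a library routine, the same flow reduces to a single call. A minimal sketch, assuming the swagger-generated fatcat_client exposes ApiClient/DefaultApi (an assumption here; only ApiClient appears in this diff) and using a placeholder release ident:

    from fatcat_client import ApiClient, DefaultApi
    from fatcat_tools.importers import auto_wayback_static

    # hypothetical client setup; a real import needs token auth configured
    api = DefaultApi(ApiClient())

    release_id = "aaaaaaaaaaaaarceaaaaaaaaai"  # placeholder ident
    wayback_url = ("http://web.archive.org/web/20010712114837/"
                   "http://www.dlib.org/dlib/june01/reich/06reich.html")

    # returns (None, None) if the release already has a matching webcapture
    (editgroup_id, edit) = auto_wayback_static(api, release_id, wayback_url)
    if edit:
        print("created webcapture edit {} in editgroup {}".format(
            edit.ident, editgroup_id))

Note the date-level dedupe in auto_wayback_static(): a capture is skipped when the release already has a webcapture with the same original URL on the same calendar day, so re-running an import is cheap.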
