diff options
| -rwxr-xr-x | python/fatcat_import.py | 48 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 | ||||
| -rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py (renamed from extra/demo_entities/static_wayback.py) | 41 | 
3 files changed, 86 insertions, 4 deletions
| diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 8090900f..ce5063de 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -38,6 +38,36 @@ def run_grobid_metadata(args):          bezerk_mode=args.bezerk_mode)      LinePusher(fmi, args.tsv_file).run() +def run_wayback_static(args): +    api = args.api + +    # find the release +    if args.release_id: +        release_id = args.release_id +    elif args.extid: +        idtype = args.extid.split(':')[0] +        extid = ':'.join(args.extid.split(':')[1:]) +        if idtype == "doi": +            release_id = api.lookup_release(doi=extid).ident +        elif idtype == "pmid": +            release_id = api.lookup_release(pmid=extid).ident +        elif idtype == "wikidata": +            release_id = api.lookup_release(wikidata_qid=extid).ident +        else: +            raise NotImplementedError("extid type: {}".format(idtype)) +    else: +        raise Exception("need either release_id or extid argument") + +    # create it +    (editgroup_id, wc) = auto_wayback_static(api, release_id, args.wayback_url, +        editgroup_id=args.editgroup_id) +    if not wc: +        return +    print("release_id: {}".format(release_id)) +    print("editgroup_id: {}".format(editgroup_id)) +    print("edit id: {}".format(wc.ident)) +    print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident)) +  def main():      parser = argparse.ArgumentParser()      parser.add_argument('--debug', @@ -126,6 +156,24 @@ def main():          action='store_true',          help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") +    sub_wayback_static = subparsers.add_parser('wayback-static') +    sub_wayback_static.set_defaults( +        func=run_wayback_static, +        auth_var="FATCAT_API_AUTH_TOKEN", +    ) +    sub_wayback_static.add_argument('wayback_url', +        type=str, +        help="URL of wayback capture to extract from") +    sub_wayback_static.add_argument('--extid', +        type=str, +        help="external identifier for release lookup") +    sub_wayback_static.add_argument('--release-id', +        type=str, +        help="release entity identifier") +    sub_wayback_static.add_argument('--editgroup-id', +        type=str, +        help="use existing editgroup (instead of creating a new one)") +      args = parser.parse_args()      if not args.__dict__.get("func"):          print("tell me what to do!") diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 70f38f5b..fe3db59d 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -18,5 +18,6 @@ from .grobid_metadata import GrobidMetadataImporter  from .journal_metadata import JournalMetadataImporter  from .matched import MatchedImporter  from .orcid import OrcidImporter +from .wayback_static import auto_wayback_static  #from .kafka_source import KafkaSource  #from .file_source import FileSource diff --git a/extra/demo_entities/static_wayback.py b/python/fatcat_tools/importers/wayback_static.py index feeba691..114920f7 100755 --- a/extra/demo_entities/static_wayback.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -13,6 +13,7 @@ import hashlib  import requests  import datetime  import argparse +import subprocess  from bs4 import BeautifulSoup  from fatcat_client import * @@ -105,7 +106,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):          cdx = [x if (x and x != '-') else None for x in cdx]          webcapture_cdx = WebcaptureEntityCdx(              surt=cdx[0], -            timestamp=parse_wbm_timestamp(cdx[1]), +            timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",              url=cdx[2],              mimetype=cdx[3],              status_code=(cdx[4] and int(cdx[4])) or None, @@ -164,18 +165,50 @@ def static_wayback_webcapture(wayback_url, cdx_output=None):      for url in embeds:          cdx_obj = lookup_cdx(url, cdx_output=cdx_output)          cdx_list.append(cdx_obj) -    archive_urls = WebcaptureEntityArchiveUrls( +    archive_urls = [WebcaptureEntityArchiveUrls(          rel="wayback",          url="https://web.archive.org/web/", -    ) +    )]      wc = WebcaptureEntity(          cdx=cdx_list, -        timestamp=timestamp, +        timestamp=timestamp.isoformat() + "Z",          original_url=original_url,          archive_urls=archive_urls,          release_ids=None)      return wc +def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): +    """ +    Returns a tuple: (editgroup_id, edit). If failed, both are None +    """ + +    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) +    git_rev = subprocess.check_output( +        ["git", "describe", "--always"]).strip().decode('utf-8') + +    release = api.get_release(release_id, expand="webcaptures") + +    # check for existing webcapture with same parameters +    for wc in release.webcaptures: +        if wc.original_url == original_url and wc.timestamp.date() == timestamp.date(): +            # skipping: already existed +            print("release {} already had webcapture {} {}".format( +                release_id, raw_timestamp, original_url)) +            return (None, None) + +    wc = static_wayback_webcapture(wayback_url) +    assert len(wc.cdx) >= 1 +    wc.release_ids = [release_id] +    if not editgroup_id: +        eg = api.create_editgroup(Editgroup( +            description="One-off import of static web content from wayback machine", +            extra=dict( +                git_rev=git_rev, +                agent="fatcat_tools.auto_wayback_static"))) +        editgroup_id = eg.editgroup_id +    edit = api.create_webcapture(wc, editgroup_id=editgroup_id) +    return (editgroup_id, edit) +  def main():      parser = argparse.ArgumentParser()      parser.add_argument('--verbose', | 
