diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-19 19:07:35 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-19 19:07:35 -0700 |
commit | c038294850e836c5dd24fd3dc89e77065a9d2f85 (patch) | |
tree | 3b70b6c6dd8d80d0887c29c53c0e0479c8df8142 | |
parent | d9f9a84957913f0ddd878bb079b423c059b4c81d (diff) | |
download | fatcat-c038294850e836c5dd24fd3dc89e77065a9d2f85.tar.gz fatcat-c038294850e836c5dd24fd3dc89e77065a9d2f85.zip |
new importer: wayback_static
-rwxr-xr-x | python/fatcat_import.py | 48 | ||||
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 | ||||
-rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py (renamed from extra/demo_entities/static_wayback.py) | 41 |
3 files changed, 86 insertions, 4 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 8090900f..ce5063de 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -38,6 +38,36 @@ def run_grobid_metadata(args): bezerk_mode=args.bezerk_mode) LinePusher(fmi, args.tsv_file).run() +def run_wayback_static(args): + api = args.api + + # find the release + if args.release_id: + release_id = args.release_id + elif args.extid: + idtype = args.extid.split(':')[0] + extid = ':'.join(args.extid.split(':')[1:]) + if idtype == "doi": + release_id = api.lookup_release(doi=extid).ident + elif idtype == "pmid": + release_id = api.lookup_release(pmid=extid).ident + elif idtype == "wikidata": + release_id = api.lookup_release(wikidata_qid=extid).ident + else: + raise NotImplementedError("extid type: {}".format(idtype)) + else: + raise Exception("need either release_id or extid argument") + + # create it + (editgroup_id, wc) = auto_wayback_static(api, release_id, args.wayback_url, + editgroup_id=args.editgroup_id) + if not wc: + return + print("release_id: {}".format(release_id)) + print("editgroup_id: {}".format(editgroup_id)) + print("edit id: {}".format(wc.ident)) + print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident)) + def main(): parser = argparse.ArgumentParser() parser.add_argument('--debug', @@ -126,6 +156,24 @@ def main(): action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") + sub_wayback_static = subparsers.add_parser('wayback-static') + sub_wayback_static.set_defaults( + func=run_wayback_static, + auth_var="FATCAT_API_AUTH_TOKEN", + ) + sub_wayback_static.add_argument('wayback_url', + type=str, + help="URL of wayback capture to extract from") + sub_wayback_static.add_argument('--extid', + type=str, + help="external identifier for release lookup") + sub_wayback_static.add_argument('--release-id', + type=str, + help="release entity identifier") + sub_wayback_static.add_argument('--editgroup-id', + type=str, + help="use existing editgroup (instead of creating a new one)") + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 70f38f5b..fe3db59d 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -18,5 +18,6 @@ from .grobid_metadata import GrobidMetadataImporter from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter +from .wayback_static import auto_wayback_static #from .kafka_source import KafkaSource #from .file_source import FileSource diff --git a/extra/demo_entities/static_wayback.py b/python/fatcat_tools/importers/wayback_static.py index feeba691..114920f7 100755 --- a/extra/demo_entities/static_wayback.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -13,6 +13,7 @@ import hashlib import requests import datetime import argparse +import subprocess from bs4 import BeautifulSoup from fatcat_client import * @@ -105,7 +106,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): cdx = [x if (x and x != '-') else None for x in cdx] webcapture_cdx = WebcaptureEntityCdx( surt=cdx[0], - timestamp=parse_wbm_timestamp(cdx[1]), + timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z", url=cdx[2], mimetype=cdx[3], status_code=(cdx[4] and int(cdx[4])) or None, @@ -164,18 +165,50 @@ def static_wayback_webcapture(wayback_url, cdx_output=None): for url in embeds: cdx_obj = lookup_cdx(url, cdx_output=cdx_output) cdx_list.append(cdx_obj) - archive_urls = WebcaptureEntityArchiveUrls( + archive_urls = [WebcaptureEntityArchiveUrls( rel="wayback", url="https://web.archive.org/web/", - ) + )] wc = WebcaptureEntity( cdx=cdx_list, - timestamp=timestamp, + timestamp=timestamp.isoformat() + "Z", original_url=original_url, archive_urls=archive_urls, release_ids=None) return wc +def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): + """ + Returns a tuple: (editgroup_id, edit). If failed, both are None + """ + + raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) + git_rev = subprocess.check_output( + ["git", "describe", "--always"]).strip().decode('utf-8') + + release = api.get_release(release_id, expand="webcaptures") + + # check for existing webcapture with same parameters + for wc in release.webcaptures: + if wc.original_url == original_url and wc.timestamp.date() == timestamp.date(): + # skipping: already existed + print("release {} already had webcapture {} {}".format( + release_id, raw_timestamp, original_url)) + return (None, None) + + wc = static_wayback_webcapture(wayback_url) + assert len(wc.cdx) >= 1 + wc.release_ids = [release_id] + if not editgroup_id: + eg = api.create_editgroup(Editgroup( + description="One-off import of static web content from wayback machine", + extra=dict( + git_rev=git_rev, + agent="fatcat_tools.auto_wayback_static"))) + editgroup_id = eg.editgroup_id + edit = api.create_webcapture(wc, editgroup_id=editgroup_id) + return (editgroup_id, edit) + def main(): parser = argparse.ArgumentParser() parser.add_argument('--verbose', |