summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-03-19 19:07:35 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-03-19 19:07:35 -0700
commit: c038294850e836c5dd24fd3dc89e77065a9d2f85 (patch)
tree: 3b70b6c6dd8d80d0887c29c53c0e0479c8df8142
parent: d9f9a84957913f0ddd878bb079b423c059b4c81d (diff)
downloadfatcat-c038294850e836c5dd24fd3dc89e77065a9d2f85.tar.gz
fatcat-c038294850e836c5dd24fd3dc89e77065a9d2f85.zip
new importer: wayback_static
-rwxr-xr-xpython/fatcat_import.py48
-rw-r--r--python/fatcat_tools/importers/__init__.py1
-rwxr-xr-xpython/fatcat_tools/importers/wayback_static.py (renamed from extra/demo_entities/static_wayback.py)41
3 files changed, 86 insertions, 4 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 8090900f..ce5063de 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -38,6 +38,36 @@ def run_grobid_metadata(args):
bezerk_mode=args.bezerk_mode)
LinePusher(fmi, args.tsv_file).run()
+def run_wayback_static(args):
+    """
+    CLI handler for the 'wayback-static' sub-command.
+
+    Resolves the target release (taken directly from args.release_id, or by
+    looking up the "<type>:<value>" identifier in args.extid), hands off to
+    auto_wayback_static() to create the webcapture, then prints the resulting
+    identifiers.
+
+    Raises NotImplementedError for an unsupported extid type, and Exception
+    when neither release_id nor extid was supplied.
+    """
+    api = args.api
+
+    # extid type prefix -> keyword argument passed to api.lookup_release()
+    lookup_kwarg = {
+        "doi": "doi",
+        "pmid": "pmid",
+        "wikidata": "wikidata_qid",
+    }
+
+    # find the release
+    if args.release_id:
+        release_id = args.release_id
+    elif args.extid:
+        # only the first colon separates the type from the value; any later
+        # colons belong to the identifier itself
+        idtype, _, extid = args.extid.partition(':')
+        if idtype not in lookup_kwarg:
+            raise NotImplementedError("extid type: {}".format(idtype))
+        release_id = api.lookup_release(**{lookup_kwarg[idtype]: extid}).ident
+    else:
+        raise Exception("need either release_id or extid argument")
+
+    # create it
+    editgroup_id, wc = auto_wayback_static(
+        api,
+        release_id,
+        args.wayback_url,
+        editgroup_id=args.editgroup_id,
+    )
+    if not wc:
+        # no edit came back, so there is nothing to report
+        return
+    print("release_id: {}".format(release_id))
+    print("editgroup_id: {}".format(editgroup_id))
+    print("edit id: {}".format(wc.ident))
+    print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
+
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--debug',
@@ -126,6 +156,24 @@ def main():
action='store_true',
help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
+ sub_wayback_static = subparsers.add_parser('wayback-static')
+ sub_wayback_static.set_defaults(
+ func=run_wayback_static,
+ auth_var="FATCAT_API_AUTH_TOKEN",
+ )
+ sub_wayback_static.add_argument('wayback_url',
+ type=str,
+ help="URL of wayback capture to extract from")
+ sub_wayback_static.add_argument('--extid',
+ type=str,
+ help="external identifier for release lookup")
+ sub_wayback_static.add_argument('--release-id',
+ type=str,
+ help="release entity identifier")
+ sub_wayback_static.add_argument('--editgroup-id',
+ type=str,
+ help="use existing editgroup (instead of creating a new one)")
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do!")
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 70f38f5b..fe3db59d 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -18,5 +18,6 @@ from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
from .orcid import OrcidImporter
+from .wayback_static import auto_wayback_static
#from .kafka_source import KafkaSource
#from .file_source import FileSource
diff --git a/extra/demo_entities/static_wayback.py b/python/fatcat_tools/importers/wayback_static.py
index feeba691..114920f7 100755
--- a/extra/demo_entities/static_wayback.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -13,6 +13,7 @@ import hashlib
import requests
import datetime
import argparse
+import subprocess
from bs4 import BeautifulSoup
from fatcat_client import *
@@ -105,7 +106,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
cdx = [x if (x and x != '-') else None for x in cdx]
webcapture_cdx = WebcaptureEntityCdx(
surt=cdx[0],
- timestamp=parse_wbm_timestamp(cdx[1]),
+ timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
url=cdx[2],
mimetype=cdx[3],
status_code=(cdx[4] and int(cdx[4])) or None,
@@ -164,18 +165,50 @@ def static_wayback_webcapture(wayback_url, cdx_output=None):
for url in embeds:
cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
cdx_list.append(cdx_obj)
- archive_urls = WebcaptureEntityArchiveUrls(
+ archive_urls = [WebcaptureEntityArchiveUrls(
rel="wayback",
url="https://web.archive.org/web/",
- )
+ )]
wc = WebcaptureEntity(
cdx=cdx_list,
- timestamp=timestamp,
+ timestamp=timestamp.isoformat() + "Z",
original_url=original_url,
archive_urls=archive_urls,
release_ids=None)
return wc
+def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
+    """
+    Creates a webcapture entity for a wayback URL and links it to a release.
+
+    api: API client used to fetch the release and create entities
+    release_id: identifier of the release the capture is attached to
+    wayback_url: wayback capture URL to extract from
+    editgroup_id: existing editgroup to put the edit in; when not given, a
+        new editgroup is created
+
+    Returns a tuple: (editgroup_id, edit). Both are None when the release
+    already has a webcapture with the same original URL and capture date; in
+    that case nothing is created.
+
+    Raises ValueError if the capture produced no CDX rows.
+    """
+
+    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
+
+    release = api.get_release(release_id, expand="webcaptures")
+
+    # check for existing webcapture with same parameters; an unset
+    # webcaptures field is treated the same as an empty list
+    for wc in release.webcaptures or []:
+        if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
+            # skipping: already existed
+            print("release {} already had webcapture {} {}".format(
+                release_id, raw_timestamp, original_url))
+            return (None, None)
+
+    wc = static_wayback_webcapture(wayback_url)
+    if not wc.cdx:
+        # explicit raise instead of assert, so the check survives `python -O`
+        raise ValueError(
+            "no CDX rows found for wayback URL: {}".format(wayback_url))
+    wc.release_ids = [release_id]
+    if not editgroup_id:
+        extra = dict(agent="fatcat_tools.auto_wayback_static")
+        # The git revision is only provenance metadata for a newly created
+        # editgroup, so it is looked up here (not on every call) and a missing
+        # git binary or checkout does not abort the import.
+        try:
+            extra['git_rev'] = subprocess.check_output(
+                ["git", "describe", "--always"]).strip().decode('utf-8')
+        except (subprocess.CalledProcessError, OSError):
+            pass
+        eg = api.create_editgroup(Editgroup(
+            description="One-off import of static web content from wayback machine",
+            extra=extra))
+        editgroup_id = eg.editgroup_id
+    edit = api.create_webcapture(wc, editgroup_id=editgroup_id)
+    return (editgroup_id, edit)
+
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--verbose',