diff options
Diffstat (limited to 'python/ingest_tool.py')
-rwxr-xr-x | python/ingest_tool.py | 63 |
1 files changed, 43 insertions, 20 deletions
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index ce3a59c..0b74f9f 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -2,10 +2,11 @@
 
 import argparse
 import json
+import subprocess
 import sys
 from http.server import HTTPServer
 
-import raven
+import sentry_sdk
 
 from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink
 from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
@@ -43,12 +44,6 @@ def run_single_ingest(args):
 
 
 def run_requests(args):
-    if args.enable_sentry:
-        try:
-            git_sha = raven.fetch_git_sha("..")
-        except Exception:
-            git_sha = None
-        sentry_client = raven.Client(release=git_sha)  # noqa:
     # TODO: switch to using JsonLinePusher
     file_worker = IngestFileWorker(
         try_spn2=not args.no_spn2,
@@ -75,11 +70,11 @@ def run_file_requests_backfill(args):
 
     Can be used to batch re-process known files.
     """
-    grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
-    pdftext_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
-    thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
-    xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.kafka_env)
-    htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.kafka_env)
+    grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+    pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+    thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+    xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+    htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
     grobid_sink = KafkaSink(
         kafka_hosts=args.kafka_hosts,
         produce_topic=grobid_topic,
@@ -120,6 +115,20 @@ def run_file_requests_backfill(args):
     pusher.run()
 
 
+def run_spn_status(args):
+    worker = IngestFileWorker(
+        sink=None,
+        try_spn2=False,
+    )
+
+    resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/system")
+    resp.raise_for_status()
+    print(f"System status: {json.dumps(resp.json(), sort_keys=True)}")
+    resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/user")
+    resp.raise_for_status()
+    print(f"User status: {json.dumps(resp.json(), sort_keys=True)}")
+
+
 def run_api(args):
     port = 8083
     print("Listening on localhost:{}".format(port))
@@ -129,6 +138,12 @@ def run_api(args):
 
 def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "--enable-sentry",
+        action="store_true",
+        help="report exceptions to Sentry",
+    )
+    parser.add_argument("--env", default="dev", help="environment (eg, prod, qa, dev)")
     subparsers = parser.add_subparsers()
 
     sub_single = subparsers.add_parser("single", help="ingests a single base URL")
@@ -163,11 +178,6 @@ def main():
         "--no-spn2", action="store_true", help="don't use live web (SPNv2)"
     )
     sub_requests.add_argument(
-        "--enable-sentry",
-        action="store_true",
-        help="report exceptions to Sentry",
-    )
-    sub_requests.add_argument(
         "--html-quick-mode",
         action="store_true",
         help="don't fetch individual sub-resources, just use CDX",
@@ -203,17 +213,30 @@ def main():
         help="list of Kafka brokers (host/port) to use",
     )
     sub_file_requests_backfill.add_argument(
-        "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
-    )
-    sub_file_requests_backfill.add_argument(
         "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
     )
 
+    sub_spn_status = subparsers.add_parser(
+        "spn-status", help="checks save-page-now v2 API status for bot user"
+    )
+    sub_spn_status.set_defaults(func=run_spn_status)
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         parser.print_help(file=sys.stderr)
         sys.exit(-1)
 
+    # configure sentry *after* parsing args
+    if args.enable_sentry:
+        try:
+            GIT_REVISION = (
+                subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+            )
+        except Exception:
+            print("failed to configure git revision", file=sys.stderr)
+            GIT_REVISION = None
+        sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
     args.func(args)
 
 