diff options
Diffstat (limited to 'python')
-rwxr-xr-x | python/ingest_tool.py | 30
-rwxr-xr-x | python/sandcrawler_worker.py | 20
-rwxr-xr-x | python/scripts/deliver_dumpgrobid_to_s3.py | 10
-rwxr-xr-x | python/scripts/deliver_gwb_to_disk.py | 8
-rwxr-xr-x | python/scripts/deliver_gwb_to_s3.py | 10
5 files changed, 41 insertions, 37 deletions
diff --git a/python/ingest_tool.py b/python/ingest_tool.py index ce3a59c..7866630 100755 --- a/python/ingest_tool.py +++ b/python/ingest_tool.py @@ -2,10 +2,11 @@ import argparse import json +import subprocess import sys from http.server import HTTPServer -import raven +import sentry_sdk from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker @@ -43,12 +44,6 @@ def run_single_ingest(args): def run_requests(args): - if args.enable_sentry: - try: - git_sha = raven.fetch_git_sha("..") - except Exception: - git_sha = None - sentry_client = raven.Client(release=git_sha) # noqa: # TODO: switch to using JsonLinePusher file_worker = IngestFileWorker( try_spn2=not args.no_spn2, @@ -129,6 +124,11 @@ def run_api(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--enable-sentry", + action="store_true", + help="report exceptions to Sentry", + ) subparsers = parser.add_subparsers() sub_single = subparsers.add_parser("single", help="ingests a single base URL") @@ -163,11 +163,6 @@ def main(): "--no-spn2", action="store_true", help="don't use live web (SPNv2)" ) sub_requests.add_argument( - "--enable-sentry", - action="store_true", - help="report exceptions to Sentry", - ) - sub_requests.add_argument( "--html-quick-mode", action="store_true", help="don't fetch individual sub-resources, just use CDX", @@ -214,6 +209,17 @@ def main(): parser.print_help(file=sys.stderr) sys.exit(-1) + # configure sentry *after* parsing args + if args.enable_sentry: + try: + GIT_REVISION = ( + subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") + ) + except Exception: + print("failed to configure git revision", file=sys.stderr) + GIT_REVISION = None + sentry_sdk.Client(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10) + args.func(args) diff --git 
a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index dd7b07f..0164943 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -7,9 +7,10 @@ or S3 (SeaweedFS). import argparse import os +import subprocess import sys -import raven +import sentry_sdk from sandcrawler import * from sandcrawler.persist import ( @@ -18,13 +19,6 @@ from sandcrawler.persist import ( PersistXmlDocWorker, ) -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -try: - git_sha = raven.fetch_git_sha("..") -except Exception: - git_sha = None -sentry_client = raven.Client(release=git_sha) - def run_grobid_extract(args): consume_topic = "sandcrawler-{}.ungrobided-pg".format(args.env) @@ -484,6 +478,16 @@ def main(): parser.print_help(file=sys.stderr) sys.exit(-1) + # configure sentry *after* parsing args + try: + GIT_REVISION = ( + subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") + ) + except Exception: + print("failed to configure git revision", file=sys.stderr) + GIT_REVISION = None + sentry_sdk.Client(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10) + args.func(args) diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 3c769cf..a08cab5 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -19,7 +19,7 @@ Output: - log to stdout (redirect to file), prefixed by sha1 Requires: -- raven (sentry) +- sentry-sdk - boto3 (AWS S3 client library) """ @@ -32,10 +32,7 @@ import sys from collections import Counter import boto3 -import raven - -# Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +import sentry_sdk def b32_hex(s): @@ -88,7 +85,6 @@ class DeliverDumpGrobidS3: sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions def main(): parser = argparse.ArgumentParser() @@ -115,6 +111,8 @@ def main(): ) args = parser.parse_args() + sentry_client = sentry_sdk.Client() + worker = DeliverDumpGrobidS3(**args.__dict__) worker.run(args.dump_file) diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index fcaf51f..8eb39dd 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -16,14 +16,11 @@ import sys from collections import Counter from http.client import IncompleteRead -import raven +import sentry_sdk import wayback.exception from gwb.loader import CDXLoaderFactory from wayback.resourcestore import ResourceStore -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() - class DeliverGwbDisk: def __init__(self, disk_dir, **kwargs): @@ -161,7 +158,6 @@ class DeliverGwbDisk: sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions def main(): parser = argparse.ArgumentParser() @@ -191,6 +187,8 @@ def main(): ) args = parser.parse_args() + sentry_sdk.Client() + worker = DeliverGwbDisk(**args.__dict__) worker.run(args.manifest_file) diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index 1f08c4f..644b134 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -24,7 +24,7 @@ Output: - log to stdout (redirect to file), prefixed by sha1 Requires: -- raven (sentry) +- sentry-sdk - boto3 (AWS S3 client library) - wayback/GWB libraries """ @@ -43,14 +43,11 @@ from collections import Counter from http.client import IncompleteRead import boto3 -import raven +import sentry_sdk import wayback.exception from gwb.loader import CDXLoaderFactory from 
wayback.resourcestore import ResourceStore -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() - class DeliverGwbS3: def __init__(self, s3_bucket, **kwargs): @@ -179,7 +176,6 @@ class DeliverGwbS3: sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions def main(): parser = argparse.ArgumentParser() @@ -206,6 +202,8 @@ def main(): ) args = parser.parse_args() + sentry_sdk.Client() + worker = DeliverGwbS3(**args.__dict__) worker.run(args.manifest_file) |