about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x  python/ingest_tool.py  30
-rwxr-xr-x  python/sandcrawler_worker.py  20
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py  10
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py  8
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py  10
5 files changed, 41 insertions, 37 deletions
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index ce3a59c..7866630 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -2,10 +2,11 @@
import argparse
import json
+import subprocess
import sys
from http.server import HTTPServer
-import raven
+import sentry_sdk
from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink
from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
@@ -43,12 +44,6 @@ def run_single_ingest(args):
def run_requests(args):
- if args.enable_sentry:
- try:
- git_sha = raven.fetch_git_sha("..")
- except Exception:
- git_sha = None
- sentry_client = raven.Client(release=git_sha) # noqa:
# TODO: switch to using JsonLinePusher
file_worker = IngestFileWorker(
try_spn2=not args.no_spn2,
@@ -129,6 +124,11 @@ def run_api(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--enable-sentry",
+ action="store_true",
+ help="report exceptions to Sentry",
+ )
subparsers = parser.add_subparsers()
sub_single = subparsers.add_parser("single", help="ingests a single base URL")
@@ -163,11 +163,6 @@ def main():
"--no-spn2", action="store_true", help="don't use live web (SPNv2)"
)
sub_requests.add_argument(
- "--enable-sentry",
- action="store_true",
- help="report exceptions to Sentry",
- )
- sub_requests.add_argument(
"--html-quick-mode",
action="store_true",
help="don't fetch individual sub-resources, just use CDX",
@@ -214,6 +209,17 @@ def main():
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
+ if args.enable_sentry:
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.Client(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index dd7b07f..0164943 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -7,9 +7,10 @@ or S3 (SeaweedFS).
import argparse
import os
+import subprocess
import sys
-import raven
+import sentry_sdk
from sandcrawler import *
from sandcrawler.persist import (
@@ -18,13 +19,6 @@ from sandcrawler.persist import (
PersistXmlDocWorker,
)
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-try:
- git_sha = raven.fetch_git_sha("..")
-except Exception:
- git_sha = None
-sentry_client = raven.Client(release=git_sha)
-
def run_grobid_extract(args):
consume_topic = "sandcrawler-{}.ungrobided-pg".format(args.env)
@@ -484,6 +478,16 @@ def main():
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.Client(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 3c769cf..a08cab5 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -19,7 +19,7 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
"""
@@ -32,10 +32,7 @@ import sys
from collections import Counter
import boto3
-import raven
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+import sentry_sdk
def b32_hex(s):
@@ -88,7 +85,6 @@ class DeliverDumpGrobidS3:
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
def main():
parser = argparse.ArgumentParser()
@@ -115,6 +111,8 @@ def main():
)
args = parser.parse_args()
+ sentry_client = sentry_sdk.Client()
+
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index fcaf51f..8eb39dd 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -16,14 +16,11 @@ import sys
from collections import Counter
from http.client import IncompleteRead
-import raven
+import sentry_sdk
import wayback.exception
from gwb.loader import CDXLoaderFactory
from wayback.resourcestore import ResourceStore
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
-
class DeliverGwbDisk:
def __init__(self, disk_dir, **kwargs):
@@ -161,7 +158,6 @@ class DeliverGwbDisk:
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
def main():
parser = argparse.ArgumentParser()
@@ -191,6 +187,8 @@ def main():
)
args = parser.parse_args()
+ sentry_sdk.Client()
+
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 1f08c4f..644b134 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -24,7 +24,7 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
- wayback/GWB libraries
"""
@@ -43,14 +43,11 @@ from collections import Counter
from http.client import IncompleteRead
import boto3
-import raven
+import sentry_sdk
import wayback.exception
from gwb.loader import CDXLoaderFactory
from wayback.resourcestore import ResourceStore
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
-
class DeliverGwbS3:
def __init__(self, s3_bucket, **kwargs):
@@ -179,7 +176,6 @@ class DeliverGwbS3:
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
def main():
parser = argparse.ArgumentParser()
@@ -206,6 +202,8 @@ def main():
)
args = parser.parse_args()
+ sentry_sdk.Client()
+
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)