about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler_worker.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x python/sandcrawler_worker.py 29
1 files changed, 19 insertions, 10 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 482dc33..aebcbe1 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -7,9 +7,10 @@ or S3 (SeaweedFS).
import argparse
import os
+import subprocess
import sys
-import raven
+import sentry_sdk
from sandcrawler import *
from sandcrawler.persist import (
@@ -18,13 +19,6 @@ from sandcrawler.persist import (
PersistXmlDocWorker,
)
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-try:
- git_sha = raven.fetch_git_sha("..")
-except Exception:
- git_sha = None
-sentry_client = raven.Client(release=git_sha)
-
def run_grobid_extract(args):
consume_topic = "sandcrawler-{}.ungrobided-pg".format(args.env)
@@ -278,8 +272,8 @@ def run_ingest_file(args):
pdftext_sink=pdftext_sink,
xmldoc_sink=xmldoc_sink,
htmlteixml_sink=htmlteixml_sink,
- # don't SPNv2 for --bulk backfill
- try_spn2=not args.bulk,
+ # don't SPNv2 for --bulk or --skip-spn
+ try_spn2=not (args.bulk or args.skip_spn),
spn_cdx_retry_sec=spn_cdx_retry_sec,
)
pusher = KafkaJsonPusher(
@@ -448,6 +442,11 @@ def main():
help="consume from bulk kafka topic (eg, for ingest backfill)",
)
sub_ingest_file.add_argument(
+ "--skip-spn",
+ action="store_true",
+ help="don't do SPN lookups",
+ )
+ sub_ingest_file.add_argument(
"--priority",
action="store_true",
help="consume from priority kafka topic (eg, for SPN requests)",
@@ -479,6 +478,16 @@ def main():
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)