diff options
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x | python/sandcrawler_worker.py | 29 |
1 files changed, 19 insertions, 10 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 482dc33..aebcbe1 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -7,9 +7,10 @@ or S3 (SeaweedFS). import argparse import os +import subprocess import sys -import raven +import sentry_sdk from sandcrawler import * from sandcrawler.persist import ( @@ -18,13 +19,6 @@ from sandcrawler.persist import ( PersistXmlDocWorker, ) -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -try: - git_sha = raven.fetch_git_sha("..") -except Exception: - git_sha = None -sentry_client = raven.Client(release=git_sha) - def run_grobid_extract(args): consume_topic = "sandcrawler-{}.ungrobided-pg".format(args.env) @@ -278,8 +272,8 @@ def run_ingest_file(args): pdftext_sink=pdftext_sink, xmldoc_sink=xmldoc_sink, htmlteixml_sink=htmlteixml_sink, - # don't SPNv2 for --bulk backfill - try_spn2=not args.bulk, + # don't SPNv2 for --bulk or --skip-spn + try_spn2=not (args.bulk or args.skip_spn), spn_cdx_retry_sec=spn_cdx_retry_sec, ) pusher = KafkaJsonPusher( @@ -448,6 +442,11 @@ def main(): help="consume from bulk kafka topic (eg, for ingest backfill)", ) sub_ingest_file.add_argument( + "--skip-spn", + action="store_true", + help="don't do SPN lookups", + ) + sub_ingest_file.add_argument( "--priority", action="store_true", help="consume from priority kafka topic (eg, for SPN requests)", @@ -479,6 +478,16 @@ def main(): parser.print_help(file=sys.stderr) sys.exit(-1) + # configure sentry *after* parsing args + try: + GIT_REVISION = ( + subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") + ) + except Exception: + print("failed to configure git revision", file=sys.stderr) + GIT_REVISION = None + sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10) + args.func(args) |