diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-30 19:05:13 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-30 19:05:15 -0800 |
commit | e86414a774d4a0c6db62110f04a6c96365afbf6c (patch) | |
tree | b6d762223017dbd12f2b10a0242548702295072d | |
parent | 64e4a0b3c43b53f5c79e6f17a189c7f5ec113c5b (diff) | |
download | fatcat-scholar-e86414a774d4a0c6db62110f04a6c96365afbf6c.tar.gz fatcat-scholar-e86414a774d4a0c6db62110f04a6c96365afbf6c.zip |
enable Sentry exception reporting for workers and pipelines
It is otherwise difficult to debug multi-million-record pipelines.
-rw-r--r-- | fatcat_scholar/sim_pipeline.py | 10 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 13 | ||||
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 11 | ||||
-rw-r--r-- | fatcat_scholar/worker.py | 11 |
4 files changed, 42 insertions, 3 deletions
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py index 621f1fc..8d67762 100644 --- a/fatcat_scholar/sim_pipeline.py +++ b/fatcat_scholar/sim_pipeline.py @@ -5,8 +5,10 @@ import argparse from typing import List, Dict, Optional, Any import requests +import sentry_sdk import internetarchive +from fatcat_scholar.config import settings, GIT_REVISION from fatcat_scholar.djvu import djvu_extract_leaf_texts from fatcat_scholar.issue_db import IssueDB from fatcat_scholar.schema import ( @@ -254,6 +256,14 @@ def main() -> None: parser.print_help(file=sys.stderr) sys.exit(-1) + if settings.SENTRY_DSN: + sentry_sdk.init( + dsn=settings.SENTRY_DSN, + environment=settings.SCHOLAR_ENV, + max_breadcrumbs=10, + release=GIT_REVISION, + ) + sp = SimPipeline(issue_db=IssueDB(args.issue_db_file)) if args.func == "run_issue_db": diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 18410c3..a2cdcb9 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -4,11 +4,12 @@ import datetime import xml.etree.ElementTree as ET from typing import List, Dict, Optional, Any, Sequence +import sentry_sdk from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity from fatcat_scholar.api_entities import * from fatcat_scholar.schema import * -from fatcat_scholar.config import settings +from fatcat_scholar.config import settings, GIT_REVISION from fatcat_scholar.grobid2json import teixml2json @@ -798,6 +799,16 @@ def main() -> None: parser.print_help(file=sys.stderr) sys.exit(-1) + # enable sentry exception catching; this helps a lot with debugging bulk + # transform runs + if settings.SENTRY_DSN: + sentry_sdk.init( + dsn=settings.SENTRY_DSN, + environment=settings.SCHOLAR_ENV, + max_breadcrumbs=10, + release=GIT_REVISION, + ) + if args.func == "run_transform": run_transform(infile=args.json_file) elif args.func == "run_refs": diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 
aef2064..cb96274 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -7,11 +7,12 @@ import urllib3.exceptions import minio import requests +import sentry_sdk import internetarchive from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity from fatcat_scholar.api_entities import * -from fatcat_scholar.config import settings +from fatcat_scholar.config import settings, GIT_REVISION from fatcat_scholar.djvu import djvu_extract_leaf_texts from fatcat_scholar.sandcrawler import ( SandcrawlerPostgrestClient, @@ -469,6 +470,14 @@ def main() -> None: parser.print_help(file=sys.stderr) sys.exit(-1) + if settings.SENTRY_DSN: + sentry_sdk.init( + dsn=settings.SENTRY_DSN, + environment=settings.SCHOLAR_ENV, + max_breadcrumbs=10, + release=GIT_REVISION, + ) + wp = WorkPipeline( issue_db=IssueDB(args.issue_db_file), sandcrawler_db_client=SandcrawlerPostgrestClient( diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py index fb806a7..854c1a2 100644 --- a/fatcat_scholar/worker.py +++ b/fatcat_scholar/worker.py @@ -6,12 +6,13 @@ import datetime from typing import List, Any import requests +import sentry_sdk import elasticsearch import elasticsearch.helpers import fatcat_openapi_client from fatcat_openapi_client import ReleaseEntity -from fatcat_scholar.config import settings +from fatcat_scholar.config import settings, GIT_REVISION from fatcat_scholar.issue_db import IssueDB from fatcat_scholar.sandcrawler import ( SandcrawlerPostgrestClient, @@ -198,6 +199,14 @@ def main() -> None: parser.print_help(file=sys.stderr) sys.exit(-1) + if settings.SENTRY_DSN: + sentry_sdk.init( + dsn=settings.SENTRY_DSN, + environment=settings.SCHOLAR_ENV, + max_breadcrumbs=10, + release=GIT_REVISION, + ) + if args.worker == "fetch-docs-worker": issue_db = IssueDB(args.issue_db_file) wp = WorkPipeline( |