diff options
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 9 |
1 files changed, 9 insertions, 0 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index e14b925..f6c1ec1 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -19,6 +19,7 @@ Requires:
 
 import xml
 import json
+import raven
 import struct
 import requests
 import happybase
@@ -32,6 +33,9 @@ from gwb.loader import CDXLoaderFactory
 from common import parse_cdx_line
 from grobid2json import teixml2json
 
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
 
 class MRExtractCdxGrobid(MRJob):
 
@@ -74,6 +78,7 @@ class MRExtractCdxGrobid(MRJob):
         if self.hb_table:
             return
 
+        sentry_client.tags_context(dict(hbase_table=self.options.hbase_table))
         try:
             host = self.options.hbase_host
             # TODO: make these configs accessible from... mrconf.cfg?
@@ -166,6 +171,7 @@ class MRExtractCdxGrobid(MRJob):
 
         return info, None
 
+    @sentry_client.capture_exceptions
     def mapper(self, _, raw_cdx):
         """
         1. parse CDX line
@@ -187,6 +193,9 @@ class MRExtractCdxGrobid(MRJob):
             return
         key = info['key']
 
+        # Note: this may not get "cleared" correctly
+        sentry_client.extra_context(dict(row_key=key))
+
         # Check if we've already processed this line
         oldrow = self.hb_table.row(key,
             columns=[b'f:c', b'file', b'grobid0:status_code'])