about summary refs log tree commit diff stats
path: root/mapreduce/extraction_cdx_grobid.py
diff options
context:
space:
mode:
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 9
1 files changed, 9 insertions, 0 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index e14b925..f6c1ec1 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -19,6 +19,7 @@ Requires:
import xml
import json
+import raven
import struct
import requests
import happybase
@@ -32,6 +33,9 @@ from gwb.loader import CDXLoaderFactory
from common import parse_cdx_line
from grobid2json import teixml2json
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
class MRExtractCdxGrobid(MRJob):
@@ -74,6 +78,7 @@ class MRExtractCdxGrobid(MRJob):
if self.hb_table:
return
+ sentry_client.tags_context(dict(hbase_table=self.options.hbase_table))
try:
host = self.options.hbase_host
# TODO: make these configs accessible from... mrconf.cfg?
@@ -166,6 +171,7 @@ class MRExtractCdxGrobid(MRJob):
return info, None
+ @sentry_client.capture_exceptions
def mapper(self, _, raw_cdx):
"""
1. parse CDX line
@@ -187,6 +193,9 @@ class MRExtractCdxGrobid(MRJob):
return
key = info['key']
+ # Note: sentry_client is a global, so this row_key may not get "cleared" between records and could linger on a later report
+ sentry_client.extra_context(dict(row_key=key))
+
# Check if we've already processed this line
oldrow = self.hb_table.row(key,
columns=[b'f:c', b'file', b'grobid0:status_code'])