diff options
-rw-r--r-- | mapreduce/Pipfile | 1
-rw-r--r-- | mapreduce/Pipfile.lock | 35
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 9
-rw-r--r-- | mapreduce/mrjob.conf | 5
4 files changed, 31 insertions, 19 deletions
diff --git a/mapreduce/Pipfile b/mapreduce/Pipfile index 81cdea1..129b23e 100644 --- a/mapreduce/Pipfile +++ b/mapreduce/Pipfile @@ -24,6 +24,7 @@ mrjob = "*" requests = "*" wayback = {version=">=0.2.1.2", index="ia"} xmltodict = "*" +raven = "*" [requires] python_version = "3.5" diff --git a/mapreduce/Pipfile.lock b/mapreduce/Pipfile.lock index 19cc6d4..9dfd3c9 100644 --- a/mapreduce/Pipfile.lock +++ b/mapreduce/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "a47b9c2fe50784191c651423b9c10f89b56bed64d9e37c4e4b60be747bef30b6" + "sha256": "d28f89355ce6520af7e275ad7bbc944acff5946d3701d3336bb268f4a5e82980" }, "pipfile-spec": 6, "requires": { @@ -23,17 +23,17 @@ "default": { "boto3": { "hashes": [ - "sha256:4c746d55fb6294c11e78db76648c7f321f8068dc5ab402456b5e3e4ee52423b9", - "sha256:5c02372447e07bd67f7c19624642dcf5c7aabe126e1c44ff4ad241aeb97fc729" + "sha256:13ad5f64a247d655a27dca83274588e9d14cba61b38d3d4fd2b011b7197d88dd", + "sha256:a56b21efbc994580fc9cef454f0f949745c152326f939aed6609d1c47b2a0f8f" ], - "version": "==1.7.2" + "version": "==1.7.4" }, "botocore": { "hashes": [ - "sha256:27945af4bfb2a1ff1f11c730d24b84da6e1f40465907029e8980903f3b984070", - "sha256:8ded801591ef5df04244dc1ba2496dd04a9abbd165d0d2ad501b6cd4b34946d4" + "sha256:5602738392ecde5c02a06a3b02de07171f440a44cdfef0aadff4b59567359607", + "sha256:77f2869b8c27afbab78b72ce6b74c75923421f364c7a0153ac1a38858c59cd91" ], - "version": "==1.10.2" + "version": "==1.10.4" }, "cachetools": { "hashes": [ @@ -628,17 +628,10 @@ }, "pytz": { "hashes": [ - "sha256:07edfc3d4d2705a20a6e99d97f0c4b61c800b8232dc1c04d87e8554f130148dd", - "sha256:3a47ff71597f821cd84a162e71593004286e5be07a340fd462f0d33a760782b5", - "sha256:410bcd1d6409026fbaa65d9ed33bf6dd8b1e94a499e32168acfc7b332e4095c0", - "sha256:5bd55c744e6feaa4d599a6cbd8228b4f8f9ba96de2c38d56f08e534b3c9edf0d", - "sha256:61242a9abc626379574a166dc0e96a66cd7c3b27fc10868003fa210be4bff1c9", - "sha256:887ab5e5b32e4d0c86efddd3d055c1f363cbaa583beb8da5e22d2fa2f64d51ef", 
- "sha256:ba18e6a243b3625513d85239b3e49055a2f0318466e0b8a92b8fb8ca7ccdf55f", - "sha256:ed6509d9af298b7995d69a440e2822288f2eca1681b8cce37673dbb10091e5fe", - "sha256:f93ddcdd6342f94cea379c73cddb5724e0d6d0a1c91c9bdef364dc0368ba4fda" + "sha256:65ae0c8101309c45772196b21b74c46b2e5d11b6275c45d251b150d5da334555", + "sha256:c06425302f2cf668f1bba7a0a03f3c1d34d4ebeef2c72003da308b3947c7f749" ], - "version": "==2018.3" + "version": "==2018.4" }, "pyyaml": { "hashes": [ @@ -660,6 +653,14 @@ ], "version": "==3.12" }, + "raven": { + "hashes": [ + "sha256:738a52019d01955d5b44b49d67c9f2f4cedb1b4f70d4fb0b493931174d00e044", + "sha256:92bf4c4819472ed20f1b9905eeeafe1bc6fe5f273d7c14506fdb8fb3a6ab2074" + ], + "index": "ia", + "version": "==6.6.0" + }, "redis": { "hashes": [ "sha256:8a1900a9f2a0a44ecf6e8b5eb3e967a9909dfed219ad66df094f27f7d6f330fb", diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index 0812884..2515cba 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -19,6 +19,7 @@ Requires: import xml import json +import raven import struct import requests import happybase @@ -32,6 +33,9 @@ from gwb.loader import CDXLoaderFactory from common import parse_cdx_line from grobid2json import teixml2json +# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable +sentry_client = raven.Client() + class MRExtractCdxGrobid(MRJob): @@ -74,6 +78,7 @@ class MRExtractCdxGrobid(MRJob): if self.hb_table: return + sentry_client.tags_context(dict(hbase_table=self.options.hbase_table)) try: host = self.options.hbase_host # TODO: make these configs accessible from... mrconf.cfg? @@ -166,6 +171,7 @@ class MRExtractCdxGrobid(MRJob): return info, None + @sentry_client.capture_exceptions def mapper(self, _, raw_cdx): """ 1. 
parse CDX line @@ -187,6 +193,9 @@ class MRExtractCdxGrobid(MRJob): return key = info['key'] + # Note: this may not get "cleared" correctly + sentry_client.extra_context(dict(row_key=key)) + # Check if we've already processed this line oldrow = self.hb_table.row(key, columns=[b'f:c', b'file', b'grobid0:status_code']) diff --git a/mapreduce/mrjob.conf b/mapreduce/mrjob.conf index 66724cb..8bf1cdd 100644 --- a/mapreduce/mrjob.conf +++ b/mapreduce/mrjob.conf @@ -4,5 +4,6 @@ runners: upload_files: - common.py - grobid2json.py - setup: - - export PYTHONPATH=$PYTHONPATH:venv/lib/python3.5/site-packages/ + cmdenv: + PYTHONPATH: venv/lib/python3.5/site-packages/ + SENTRY_DSN: https://63e8a85bbb8948ea9652ee280aa4dee0:f1639d58707c48a0bbc6a14623ba6a47@sentry.io/1186924 |