about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- mapreduce/Pipfile 1
-rw-r--r-- mapreduce/Pipfile.lock 35
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 9
-rw-r--r-- mapreduce/mrjob.conf 5
4 files changed, 31 insertions, 19 deletions
diff --git a/mapreduce/Pipfile b/mapreduce/Pipfile
index 81cdea1..129b23e 100644
--- a/mapreduce/Pipfile
+++ b/mapreduce/Pipfile
@@ -24,6 +24,7 @@ mrjob = "*"
requests = "*"
wayback = {version=">=0.2.1.2", index="ia"}
xmltodict = "*"
+raven = "*"
[requires]
python_version = "3.5"
diff --git a/mapreduce/Pipfile.lock b/mapreduce/Pipfile.lock
index 19cc6d4..9dfd3c9 100644
--- a/mapreduce/Pipfile.lock
+++ b/mapreduce/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "a47b9c2fe50784191c651423b9c10f89b56bed64d9e37c4e4b60be747bef30b6"
+ "sha256": "d28f89355ce6520af7e275ad7bbc944acff5946d3701d3336bb268f4a5e82980"
},
"pipfile-spec": 6,
"requires": {
@@ -23,17 +23,17 @@
"default": {
"boto3": {
"hashes": [
- "sha256:4c746d55fb6294c11e78db76648c7f321f8068dc5ab402456b5e3e4ee52423b9",
- "sha256:5c02372447e07bd67f7c19624642dcf5c7aabe126e1c44ff4ad241aeb97fc729"
+ "sha256:13ad5f64a247d655a27dca83274588e9d14cba61b38d3d4fd2b011b7197d88dd",
+ "sha256:a56b21efbc994580fc9cef454f0f949745c152326f939aed6609d1c47b2a0f8f"
],
- "version": "==1.7.2"
+ "version": "==1.7.4"
},
"botocore": {
"hashes": [
- "sha256:27945af4bfb2a1ff1f11c730d24b84da6e1f40465907029e8980903f3b984070",
- "sha256:8ded801591ef5df04244dc1ba2496dd04a9abbd165d0d2ad501b6cd4b34946d4"
+ "sha256:5602738392ecde5c02a06a3b02de07171f440a44cdfef0aadff4b59567359607",
+ "sha256:77f2869b8c27afbab78b72ce6b74c75923421f364c7a0153ac1a38858c59cd91"
],
- "version": "==1.10.2"
+ "version": "==1.10.4"
},
"cachetools": {
"hashes": [
@@ -628,17 +628,10 @@
},
"pytz": {
"hashes": [
- "sha256:07edfc3d4d2705a20a6e99d97f0c4b61c800b8232dc1c04d87e8554f130148dd",
- "sha256:3a47ff71597f821cd84a162e71593004286e5be07a340fd462f0d33a760782b5",
- "sha256:410bcd1d6409026fbaa65d9ed33bf6dd8b1e94a499e32168acfc7b332e4095c0",
- "sha256:5bd55c744e6feaa4d599a6cbd8228b4f8f9ba96de2c38d56f08e534b3c9edf0d",
- "sha256:61242a9abc626379574a166dc0e96a66cd7c3b27fc10868003fa210be4bff1c9",
- "sha256:887ab5e5b32e4d0c86efddd3d055c1f363cbaa583beb8da5e22d2fa2f64d51ef",
- "sha256:ba18e6a243b3625513d85239b3e49055a2f0318466e0b8a92b8fb8ca7ccdf55f",
- "sha256:ed6509d9af298b7995d69a440e2822288f2eca1681b8cce37673dbb10091e5fe",
- "sha256:f93ddcdd6342f94cea379c73cddb5724e0d6d0a1c91c9bdef364dc0368ba4fda"
+ "sha256:65ae0c8101309c45772196b21b74c46b2e5d11b6275c45d251b150d5da334555",
+ "sha256:c06425302f2cf668f1bba7a0a03f3c1d34d4ebeef2c72003da308b3947c7f749"
],
- "version": "==2018.3"
+ "version": "==2018.4"
},
"pyyaml": {
"hashes": [
@@ -660,6 +653,14 @@
],
"version": "==3.12"
},
+ "raven": {
+ "hashes": [
+ "sha256:738a52019d01955d5b44b49d67c9f2f4cedb1b4f70d4fb0b493931174d00e044",
+ "sha256:92bf4c4819472ed20f1b9905eeeafe1bc6fe5f273d7c14506fdb8fb3a6ab2074"
+ ],
+ "index": "ia",
+ "version": "==6.6.0"
+ },
"redis": {
"hashes": [
"sha256:8a1900a9f2a0a44ecf6e8b5eb3e967a9909dfed219ad66df094f27f7d6f330fb",
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 0812884..2515cba 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -19,6 +19,7 @@ Requires:
import xml
import json
+import raven
import struct
import requests
import happybase
@@ -32,6 +33,9 @@ from gwb.loader import CDXLoaderFactory
from common import parse_cdx_line
from grobid2json import teixml2json
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
class MRExtractCdxGrobid(MRJob):
@@ -74,6 +78,7 @@ class MRExtractCdxGrobid(MRJob):
if self.hb_table:
return
+ sentry_client.tags_context(dict(hbase_table=self.options.hbase_table))
try:
host = self.options.hbase_host
# TODO: make these configs accessible from... mrconf.cfg?
@@ -166,6 +171,7 @@ class MRExtractCdxGrobid(MRJob):
return info, None
+ @sentry_client.capture_exceptions
def mapper(self, _, raw_cdx):
"""
1. parse CDX line
@@ -187,6 +193,9 @@ class MRExtractCdxGrobid(MRJob):
return
key = info['key']
+ # Note: this may not get "cleared" correctly
+ sentry_client.extra_context(dict(row_key=key))
+
# Check if we've already processed this line
oldrow = self.hb_table.row(key,
columns=[b'f:c', b'file', b'grobid0:status_code'])
diff --git a/mapreduce/mrjob.conf b/mapreduce/mrjob.conf
index 66724cb..8bf1cdd 100644
--- a/mapreduce/mrjob.conf
+++ b/mapreduce/mrjob.conf
@@ -4,5 +4,6 @@ runners:
upload_files:
- common.py
- grobid2json.py
- setup:
- - export PYTHONPATH=$PYTHONPATH:venv/lib/python3.5/site-packages/
+ cmdenv:
+ PYTHONPATH: venv/lib/python3.5/site-packages/
+ SENTRY_DSN: https://63e8a85bbb8948ea9652ee280aa4dee0:f1639d58707c48a0bbc6a14623ba6a47@sentry.io/1186924