about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rwxr-xr-x python/extraction_ungrobided.py 11
1 files changed, 7 insertions, 4 deletions
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index 8224dbb..4074112 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -32,8 +32,14 @@ from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
from common import parse_ungrobided_line
from grobid2json import teixml2json
-from extraction_cdx_grobid import KEY_BLACKLIST, sentry_client
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+# Specific poison-pill rows we should skip
+KEY_BLACKLIST = (
+ 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
+)
class MRExtractUnGrobided(MRJob):
@@ -62,9 +68,6 @@ class MRExtractUnGrobided(MRJob):
type=str,
default='https://archive.org/serve/',
help='URI where WARCs can be found')
- self.add_passthru_arg('--force-existing',
- action="store_true",
- help='Re-processes (with GROBID) existing lines')
def __init__(self, *args, **kwargs):
super(MRExtractUnGrobided, self).__init__(*args, **kwargs)