From d2b4da4c55a24468a0cbfdc9f567449d4e913331 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 26 Aug 2018 17:00:06 +0000 Subject: finally got extraction_ungrobided to run in prod. The problem was that only one Python script was getting sent, so it couldn't "import from" another script. We should refactor the shared code into a new common file. --- python/extraction_ungrobided.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'python/extraction_ungrobided.py') diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py index 8224dbb..4074112 100755 --- a/python/extraction_ungrobided.py +++ b/python/extraction_ungrobided.py @@ -32,8 +32,14 @@ from wayback.resourcestore import ResourceStore from gwb.loader import CDXLoaderFactory from common import parse_ungrobided_line from grobid2json import teixml2json -from extraction_cdx_grobid import KEY_BLACKLIST, sentry_client +# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable +sentry_client = raven.Client() + +# Specific poison-pill rows we should skip +KEY_BLACKLIST = ( + 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format" +) class MRExtractUnGrobided(MRJob): @@ -62,9 +68,6 @@ class MRExtractUnGrobided(MRJob): type=str, default='https://archive.org/serve/', help='URI where WARCs can be found') - self.add_passthru_arg('--force-existing', - action="store_true", - help='Re-processes (with GROBID) existing lines') def __init__(self, *args, **kwargs): super(MRExtractUnGrobided, self).__init__(*args, **kwargs) -- cgit v1.2.3