diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-26 17:00:06 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-26 17:00:08 +0000 |
commit | d2b4da4c55a24468a0cbfdc9f567449d4e913331 (patch) | |
tree | 4260ec4ef0c070ececcebc627f3016f9944ad520 /python | |
parent | ec67bbe00efe04f120bb9c278da61545af436a4c (diff) | |
download | sandcrawler-d2b4da4c55a24468a0cbfdc9f567449d4e913331.tar.gz sandcrawler-d2b4da4c55a24468a0cbfdc9f567449d4e913331.zip |
finally got extraction_ungrobided to run in prod
The problem was that only one Python script was getting sent, so
`from <module> import ...` of names defined in another script couldn't work.
The shared code should be refactored into a new common file.
Diffstat (limited to 'python')
-rwxr-xr-x | python/extraction_ungrobided.py | 11 |
1 files changed, 7 insertions, 4 deletions
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index 8224dbb..4074112 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -32,8 +32,14 @@ from wayback.resourcestore import ResourceStore
 from gwb.loader import CDXLoaderFactory
 from common import parse_ungrobided_line
 from grobid2json import teixml2json
-from extraction_cdx_grobid import KEY_BLACKLIST, sentry_client
 
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+# Specific poison-pill rows we should skip
+KEY_BLACKLIST = (
+    'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
+)
 
 class MRExtractUnGrobided(MRJob):
 
@@ -62,9 +68,6 @@ class MRExtractUnGrobided(MRJob):
                               type=str,
                               default='https://archive.org/serve/',
                               help='URI where WARCs can be found')
-        self.add_passthru_arg('--force-existing',
-                              action="store_true",
-                              help='Re-processes (with GROBID) existing lines')
 
     def __init__(self, *args, **kwargs):
         super(MRExtractUnGrobided, self).__init__(*args, **kwargs)