aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-08-26 17:00:06 +0000
committerBryan Newbold <bnewbold@archive.org>2018-08-26 17:00:08 +0000
commitd2b4da4c55a24468a0cbfdc9f567449d4e913331 (patch)
tree4260ec4ef0c070ececcebc627f3016f9944ad520 /python
parentec67bbe00efe04f120bb9c278da61545af436a4c (diff)
downloadsandcrawler-d2b4da4c55a24468a0cbfdc9f567449d4e913331.tar.gz
sandcrawler-d2b4da4c55a24468a0cbfdc9f567449d4e913331.zip
finally got extraction_ungrobided to run in prod
The problem was that only one Python script was getting sent, so it couldn't "import from" the other scripts. The shared code should be refactored into a new common file.
Diffstat (limited to 'python')
-rwxr-xr-xpython/extraction_ungrobided.py11
1 files changed, 7 insertions, 4 deletions
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index 8224dbb..4074112 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -32,8 +32,14 @@ from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
from common import parse_ungrobided_line
from grobid2json import teixml2json
-from extraction_cdx_grobid import KEY_BLACKLIST, sentry_client
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+# Specific poison-pill rows we should skip
+KEY_BLACKLIST = (
+ 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
+)
class MRExtractUnGrobided(MRJob):
@@ -62,9 +68,6 @@ class MRExtractUnGrobided(MRJob):
type=str,
default='https://archive.org/serve/',
help='URI where WARCs can be found')
- self.add_passthru_arg('--force-existing',
- action="store_true",
- help='Re-processes (with GROBID) existing lines')
def __init__(self, *args, **kwargs):
super(MRExtractUnGrobided, self).__init__(*args, **kwargs)