about summary refs log tree commit diff stats
path: root/mapreduce/extraction_cdx_grobid.py
diff options
context:
space:
mode:
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index e23950c..6659f61 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -64,6 +64,9 @@ class MRExtractCdxGrobid(MRJob):
type=str,
default='https://archive.org/serve/',
help='URI where WARCs can be found')
+ self.add_passthru_arg('--force-existing',
+ action="store_true",
+ help='Re-processes (with GROBID) existing lines')
def __init__(self, *args, **kwargs):
super(MRExtractCdxGrobid, self).__init__(*args, **kwargs)
@@ -201,7 +204,8 @@ class MRExtractCdxGrobid(MRJob):
# Check if we've already processed this line
oldrow = self.hb_table.row(key,
columns=[b'f:c', b'file', b'grobid0:status_code'])
- if oldrow.get(b'grobid0:status_code', None) != None:
+ if (oldrow.get(b'grobid0:status_code', None) != None
+ and not self.options.force_existing):
# This file has already been processed; skip it
self.increment_counter('lines', 'existing')
yield _, dict(status="existing", key=key)