force_existing flag for extraction

author: Bryan Newbold <bnewbold@archive.org> 2018-04-19 05:14:33 +0000
committer: Bryan Newbold <bnewbold@archive.org> 2018-04-19 05:15:02 +0000
commit: df23b6f45922875f0bf657aea3b8c3fb4451469d (patch)
tree: d7fc0825aa2b1cff46af4cb5397a7ddedf04df0b
parent: e0d1e381bf536d1c077546526c21eab909444193 (diff)
download: sandcrawler-df23b6f45922875f0bf657aea3b8c3fb4451469d.tar.gz sandcrawler-df23b6f45922875f0bf657aea3b8c3fb4451469d.zip
1 file changed, 5 insertions, 1 deletion
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index e23950c..6659f61 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -64,6 +64,9 @@ class MRExtractCdxGrobid(MRJob):
                               type=str,
                               default='https://archive.org/serve/',
                               help='URI where WARCs can be found')
+        self.add_passthru_arg('--force-existing',
+                              action="store_true",
+                              help='Re-processes (with GROBID) existing lines')
 
     def __init__(self, *args, **kwargs):
         super(MRExtractCdxGrobid, self).__init__(*args, **kwargs)
@@ -201,7 +204,8 @@ class MRExtractCdxGrobid(MRJob):
         # Check if we've already processed this line
         oldrow = self.hb_table.row(key,
             columns=[b'f:c', b'file', b'grobid0:status_code'])
-        if oldrow.get(b'grobid0:status_code', None) != None:
+        if (oldrow.get(b'grobid0:status_code', None) != None
+                and not self.options.force_existing):
             # This file has already been processed; skip it
             self.increment_counter('lines', 'existing')
             yield _, dict(status="existing", key=key)
author	Bryan Newbold <bnewbold@archive.org>	2018-04-19 05:14:33 +0000
committer	Bryan Newbold <bnewbold@archive.org>	2018-04-19 05:15:02 +0000
commit	df23b6f45922875f0bf657aea3b8c3fb4451469d (patch)
tree	d7fc0825aa2b1cff46af4cb5397a7ddedf04df0b
parent	e0d1e381bf536d1c077546526c21eab909444193 (diff)
download	sandcrawler-df23b6f45922875f0bf657aea3b8c3fb4451469d.tar.gz sandcrawler-df23b6f45922875f0bf657aea3b8c3fb4451469d.zip