diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-19 05:14:33 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-19 05:15:02 +0000 |
commit | df23b6f45922875f0bf657aea3b8c3fb4451469d (patch) | |
tree | d7fc0825aa2b1cff46af4cb5397a7ddedf04df0b | |
parent | e0d1e381bf536d1c077546526c21eab909444193 (diff) | |
download | sandcrawler-df23b6f45922875f0bf657aea3b8c3fb4451469d.tar.gz sandcrawler-df23b6f45922875f0bf657aea3b8c3fb4451469d.zip |
force_existing flag for extraction
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 6 |
1 file changed, 5 insertions, 1 deletion
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index e23950c..6659f61 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -64,6 +64,9 @@ class MRExtractCdxGrobid(MRJob):
                               type=str,
                               default='https://archive.org/serve/',
                               help='URI where WARCs can be found')
+        self.add_passthru_arg('--force-existing',
+                              action="store_true",
+                              help='Re-processes (with GROBID) existing lines')
 
     def __init__(self, *args, **kwargs):
         super(MRExtractCdxGrobid, self).__init__(*args, **kwargs)
@@ -201,7 +204,8 @@ class MRExtractCdxGrobid(MRJob):
         # Check if we've already processed this line
         oldrow = self.hb_table.row(key,
             columns=[b'f:c', b'file', b'grobid0:status_code'])
-        if oldrow.get(b'grobid0:status_code', None) != None:
+        if (oldrow.get(b'grobid0:status_code', None) != None
+                and not self.options.force_existing):
             # This file has already been processed; skip it
             self.increment_counter('lines', 'existing')
             yield _, dict(status="existing", key=key)