diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 17:02:34 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 17:02:34 -0800 |
commit | ab01652b9d4b5542a973f591031b54cdcfd4701f (patch) | |
tree | 46e5b537cdd1d699403163cd4357d6991485cd35 | |
parent | 506fae13dbd111f89b4ae96c25cbd24ac1ec3ff0 (diff) | |
download | sandcrawler-ab01652b9d4b5542a973f591031b54cdcfd4701f.tar.gz sandcrawler-ab01652b9d4b5542a973f591031b54cdcfd4701f.zip |
ingest persist skips 'existing' ingest results
-rw-r--r-- | python/sandcrawler/persist.py | 3 |
1 files changed, 3 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index c24dec8..7cb4f8d 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -133,6 +133,9 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         if ingest_type not in ('pdf', 'xml'):
             self.counts['skip-ingest-type'] += 1
             return None
+        if raw['status'] in ("existing", ):
+            self.counts['skip-existing'] += 1
+            return None
         result = {
             'ingest_type': ingest_type,
             'base_url': raw['request']['base_url'],
```