aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-14 17:02:34 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-14 17:02:34 -0800
commit ab01652b9d4b5542a973f591031b54cdcfd4701f (patch)
tree 46e5b537cdd1d699403163cd4357d6991485cd35
parent 506fae13dbd111f89b4ae96c25cbd24ac1ec3ff0 (diff)
download sandcrawler-ab01652b9d4b5542a973f591031b54cdcfd4701f.tar.gz
sandcrawler-ab01652b9d4b5542a973f591031b54cdcfd4701f.zip
ingest persist skips 'existing' ingest results
-rw-r--r-- python/sandcrawler/persist.py 3
1 files changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index c24dec8..7cb4f8d 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -133,6 +133,9 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if ingest_type not in ('pdf', 'xml'):
self.counts['skip-ingest-type'] += 1
return None
+ if raw['status'] in ("existing", ):
+ self.counts['skip-existing'] += 1
+ return None
result = {
'ingest_type': ingest_type,
'base_url': raw['request']['base_url'],