about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2020-04-30 16:46:44 -0700
committer: Bryan Newbold <bnewbold@archive.org>, 2020-04-30 16:46:44 -0700
commit: 2d85d55ed1af4220f1469f4cf0cddd7e0888b3b3 (patch)
tree: d19a42fac5bf922c364b859498cb35d66519fc40 /python
parent: abaa0c53c8f2aaff3c533747c2f310d8f60839c9 (diff)
download: sandcrawler-2d85d55ed1af4220f1469f4cf0cddd7e0888b3b3.tar.gz
sandcrawler-2d85d55ed1af4220f1469f4cf0cddd7e0888b3b3.zip
workers: add missing want() dataflow path
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/workers.py 9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 96aef3f..6425e99 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -27,6 +27,9 @@ class SandcrawlerWorker(object):
def push_record(self, task):
self.counts['total'] += 1
+ if not self.want(task):
+ self.counts['skip'] += 1
+ return
result = self.process(task)
if not result:
self.counts['failed'] += 1
@@ -90,6 +93,12 @@ class SandcrawlerWorker(object):
print("Worker: {}".format(self.counts), file=sys.stderr)
return self.counts
+ def want(self, task):
+ """
+ Optionally override this as a filter in implementations.
+ """
+ return True
+
def process(self, task):
"""
Derived workers need to implement business logic here.