about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-17 11:10:36 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-17 11:10:36 -0700
commit: 3287f08a788107815f366019060a7cbcfe9505d2 (patch)
tree: 29a867d2cf84d116b26be37508d4ea6462dede88 /python/sandcrawler/persist.py
parent: 5a6bf449ac78586bf150216fe2310be178eeb6c3 (diff)
download: sandcrawler-3287f08a788107815f366019060a7cbcfe9505d2.tar.gz
download: sandcrawler-3287f08a788107815f366019060a7cbcfe9505d2.zip
workers: refactor to pass key to process()
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 12
1 files changed, 6 insertions, 6 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index f2a4893..338cdfc 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -35,7 +35,7 @@ class PersistCdxWorker(SandcrawlerWorker):
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record):
+ def process(self, record, key=None):
"""
Only do batches (as transactions)
"""
@@ -60,7 +60,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record):
+ def process(self, record, key=None):
"""
Only do batches (as transactions)
"""
@@ -203,7 +203,7 @@ class PersistIngestRequestWorker(PersistIngestFileResultWorker):
def __init__(self, db_url, **kwargs):
super().__init__(db_url=db_url)
- def process(self, record):
+ def process(self, record, key=None):
"""
Only do batches (as transactions)
"""
@@ -243,7 +243,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.db_only = kwargs.get('db_only', False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
- def process(self, record):
+ def process(self, record, key=None):
"""
Only do batches (as transactions)
"""
@@ -327,7 +327,7 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
)
return obj_path
- def process(self, record):
+ def process(self, record, key=None):
if record.get('status_code') != 200 or not record.get('tei_xml'):
return False
@@ -347,7 +347,7 @@ class PersistPdfTrioWorker(SandcrawlerWorker):
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record):
+ def process(self, record, key=None):
"""
Only do batches (as transactions)
"""