aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-03-05 00:41:14 -0800
committerBryan Newbold <bnewbold@archive.org>2020-03-05 00:41:14 -0800
commit97ffdc17941b1272a8c7f05c0d1353cd28761280 (patch)
tree07309077043139328c9be809ac5f26971a2729c9 /python/sandcrawler/persist.py
parent173e5e88de4160a63949ff6e263123c4a25b2017 (diff)
downloadsandcrawler-97ffdc17941b1272a8c7f05c0d1353cd28761280.tar.gz
sandcrawler-97ffdc17941b1272a8c7f05c0d1353cd28761280.zip
persist: ingest_request tool (with no ingest_file_result)
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r--python/sandcrawler/persist.py29
1 files changed, 29 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 88ac6b5..5960756 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -198,6 +198,35 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
self.db.commit()
return []
+class PersistIngestRequestWorker(PersistIngestFileResultWorker):
+
+ def __init__(self, db_url, **kwargs):
+ super().__init__()
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record):
+ """
+ Only do batches (as transactions)
+ """
+ raise NotImplementedError
+
+ def push_batch(self, batch):
+ self.counts['total'] += len(batch)
+
+ if not batch:
+ return []
+
+ requests = [self.request_to_row(raw) for raw in batch]
+ requests = [r for r in requests if r]
+
+ if requests:
+ resp = self.db.insert_ingest_request(self.cur, requests)
+ self.counts['insert-requests'] += resp[0]
+ self.counts['update-requests'] += resp[1]
+
+ self.db.commit()
+ return []
class PersistGrobidWorker(SandcrawlerWorker):