diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-05 00:41:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-05 00:41:14 -0800 |
commit | 97ffdc17941b1272a8c7f05c0d1353cd28761280 (patch) | |
tree | 07309077043139328c9be809ac5f26971a2729c9 /python/sandcrawler/persist.py | |
parent | 173e5e88de4160a63949ff6e263123c4a25b2017 (diff) | |
download | sandcrawler-97ffdc17941b1272a8c7f05c0d1353cd28761280.tar.gz sandcrawler-97ffdc17941b1272a8c7f05c0d1353cd28761280.zip |
persist: ingest_request tool (with no ingest_file_result)
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- | python/sandcrawler/persist.py | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 88ac6b5..5960756 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -198,6 +198,35 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): self.db.commit() return [] +class PersistIngestRequestWorker(PersistIngestFileResultWorker): + + def __init__(self, db_url, **kwargs): + super().__init__() + self.db = SandcrawlerPostgresClient(db_url) + self.cur = self.db.conn.cursor() + + def process(self, record): + """ + Only do batches (as transactions) + """ + raise NotImplementedError + + def push_batch(self, batch): + self.counts['total'] += len(batch) + + if not batch: + return [] + + requests = [self.request_to_row(raw) for raw in batch] + requests = [r for r in requests if r] + + if requests: + resp = self.db.insert_ingest_request(self.cur, requests) + self.counts['insert-requests'] += resp[0] + self.counts['update-requests'] += resp[1] + + self.db.commit() + return [] class PersistGrobidWorker(SandcrawlerWorker): |