implement counts properly for persist workers

author: Bryan Newbold <bnewbold@archive.org> 2019-12-26 19:22:49 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:58 -0800
commit: 33f6744b56a9ca7b01cb4ed7b80bdf70a972ffa8 (patch)
tree: 7e0b592a1627c75d0ee00efff818c49c9a7b7d57
parent: 0756b3901e48844b4c482ef43c409699497ec3b9 (diff)
download: sandcrawler-33f6744b56a9ca7b01cb4ed7b80bdf70a972ffa8.tar.gz
sandcrawler-33f6744b56a9ca7b01cb4ed7b80bdf70a972ffa8.zip
1 file changed, 19 insertions, 15 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 07e6c83..86a1c22 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -40,8 +40,8 @@ class PersistCdxWorker(SandcrawlerWorker):
 
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
-        self.db.insert_cdx(self.cur, batch)
-        self.counts['insert-cdx'] += len(batch)
+        resp = self.db.insert_cdx(self.cur, batch)
+        self.counts['insert-cdx'] += resp
         self.db.commit()
         return []
 
@@ -159,22 +159,22 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         requests = [r for r in requests if r]
 
         if requests:
-            self.db.insert_ingest_request(self.cur, requests)
-            self.counts['insert-requests'] += len(requests)
+            resp = self.db.insert_ingest_request(self.cur, requests)
+            self.counts['insert-requests'] += resp
         if results:
-            self.db.insert_ingest_file_result(self.cur, results)
-            self.counts['insert-results'] += len(results)
+            resp = self.db.insert_ingest_file_result(self.cur, results, on_conflict="update")
+            self.counts['insert-results'] += resp
 
         # these schemas match, so can just pass through
         # TODO: need to include warc_path etc in ingest-result
         cdx_batch = [r['cdx'] for r in batch if r.get('hit') and r.get('cdx') and r['cdx'].get('warc_path')]
         if cdx_batch:
-            self.db.insert_cdx(self.cur, cdx_batch)
-            self.counts['insert-cdx'] += len(cdx_batch)
+            resp = self.db.insert_cdx(self.cur, cdx_batch)
+            self.counts['insert-cdx'] += resp
         file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
         if file_meta_batch:
-            self.db.insert_file_meta(self.cur, file_meta_batch)
-            self.counts['insert-file_meta'] += len(file_meta_batch)
+            resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+            self.counts['insert-file_meta'] += resp
 
         self.db.commit()
         return []
@@ -205,18 +205,22 @@ class PersistGrobidWorker(SandcrawlerWorker):
             if not metadata:
                 continue
             for k in ('fatcat_release', 'grobid_version'):
-                r[k] = metadata.pop(k)
+                r[k] = metadata.pop(k, None)
             if r.get('fatcat_release'):
                 r['fatcat_release'] = r['fatcat_release'].replace('release_', '')
+            if metadata.get('grobid_timestamp'):
+                r['updated'] = metadata['grobid_timestamp']
             r['metadata'] = metadata
 
         grobid_batch = [r['grobid'] for r in batch if r.get('grobid')]
-        self.db.insert_grobid(self.cur, batch)
+        resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
+        self.counts['insert-grobid'] += resp
 
-        file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
-        self.db.insert_file_meta(self.cur, file_meta_batch)
+        file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
+        resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+        self.counts['insert-file-meta'] += resp
 
-        # TODO: minio, grobid
+        # TODO: minio
 
         self.db.commit()
         return []
author	Bryan Newbold <bnewbold@archive.org>	2019-12-26 19:22:49 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-01-02 18:12:58 -0800
commit	33f6744b56a9ca7b01cb4ed7b80bdf70a972ffa8 (patch)
tree	7e0b592a1627c75d0ee00efff818c49c9a7b7d57
parent	0756b3901e48844b4c482ef43c409699497ec3b9 (diff)
download	sandcrawler-33f6744b56a9ca7b01cb4ed7b80bdf70a972ffa8.tar.gz sandcrawler-33f6744b56a9ca7b01cb4ed7b80bdf70a972ffa8.zip