From b4a40d99b23a83eabeed490c0dce52dba31dc7b8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 17 Jun 2020 21:23:08 -0700 Subject: fixes and tweaks from testing locally --- python/sandcrawler/db.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'python/sandcrawler/db.py') diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 03cc15f..793f1c4 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -34,6 +34,15 @@ class SandcrawlerPostgrestClient: else: return None + def get_pdf_meta(self, sha1): + resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex='eq.'+sha1)) + resp.raise_for_status() + resp = resp.json() + if resp: + return resp[0] + else: + return None + def get_file_meta(self, sha1): resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1)) resp.raise_for_status() @@ -185,6 +194,44 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) + def insert_pdf_meta(self, cur, batch, on_conflict="nothing"): + """ + batch elements are expected to have .to_sql_tuple() method + """ + sql = """ + INSERT INTO + pdf_meta (sha1hex, updated, status, has_page0_thumbnail, page_count, word_count, page0_height, page0_width, permanent_id, pdf_created, pdf_version, metadata) + VALUES %s + ON CONFLICT (sha1hex) DO + """ + if on_conflict.lower() == "nothing": + sql += " NOTHING" + elif on_conflict.lower() == "update": + sql += """ UPDATE SET + updated=EXCLUDED.updated, + status=EXCLUDED.status, + has_page0_thumbnail=EXCLUDED.has_page0_thumbnail, + page_count=EXCLUDED.page_count, + word_count=EXCLUDED.word_count, + page0_height=EXCLUDED.page0_height, + page0_width=EXCLUDED.page0_width, + permanent_id=EXCLUDED.permanent_id, + pdf_created=EXCLUDED.pdf_created, + pdf_version=EXCLUDED.pdf_version, + metadata=EXCLUDED.metadata + """ + else: + raise NotImplementedError("on_conflict: {}".format(on_conflict)) + sql += " RETURNING xmax;" + batch = [d.to_sql_tuple() for d in batch] + # filter out duplicate rows by key (sha1hex) + batch_dict = dict() + for b in batch: + batch_dict[b[0]] = b + batch = list(batch_dict.values()) + resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True) + return self._inserts_and_updates(resp, on_conflict) + def insert_pdftrio(self, cur, batch, on_conflict="nothing"): sql = """ INSERT INTO -- cgit v1.2.3