From 806967ca168bcdbf2e57699703904333c21d4a2f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 11:26:16 -0800 Subject: html: start on SQL table --- python/sandcrawler/db.py | 44 ++++++++++++++++++++++++++++ sql/migrations/2019-12-19-060141_init/up.sql | 15 ++++++++++ 2 files changed, 59 insertions(+) diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 793f1c4..573f747 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -43,6 +43,15 @@ class SandcrawlerPostgrestClient: else: return None + def get_html_meta(self, sha1): + resp = requests.get(self.api_url + "/html_meta", params=dict(sha1hex='eq.'+sha1)) + resp.raise_for_status() + resp = resp.json() + if resp: + return resp[0] + else: + return None + def get_file_meta(self, sha1): resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1)) resp.raise_for_status() @@ -232,6 +241,41 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) + def insert_html_meta(self, cur, batch, on_conflict="nothing"): + """ + batch elements are expected to have .to_sql_tuple() method + """ + sql = """ + INSERT INTO + html_meta (sha1hex, updated, status, has_teixml, has_thumbnail, word_count, resource_count, biblio, resources) + VALUES %s + ON CONFLICT (sha1hex) DO + """ + if on_conflict.lower() == "nothing": + sql += " NOTHING" + elif on_conflict.lower() == "update": + sql += """ UPDATE SET + updated=EXCLUDED.updated, + status=EXCLUDED.status, + has_teixml=EXCLUDED.has_teixml, + has_thumbnail=EXCLUDED.has_thumbnail, + word_count=EXCLUDED.word_count, + resource_count=EXCLUDED.resource_count, + biblio=EXCLUDED.biblio, + resources=EXCLUDED.resources + """ + else: + raise NotImplementedError("on_conflict: {}".format(on_conflict)) + sql += " RETURNING xmax;" + batch = [d.to_sql_tuple() for d in batch] + # filter out duplicate rows by key (sha1hex) + batch_dict = dict() + for b in batch: + batch_dict[b[0]] = b + batch = list(batch_dict.values()) + resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True) + return self._inserts_and_updates(resp, on_conflict) + def insert_pdftrio(self, cur, batch, on_conflict="nothing"): sql = """ INSERT INTO diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 59423dd..6a8c52b 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -114,6 +114,20 @@ CREATE TABLE IF NOT EXISTS pdf_meta ( -- encrypted ); +CREATE TABLE IF NOT EXISTS html_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + has_teixml BOOLEAN NOT NULL, + has_thumbnail BOOLEAN NOT NULL, + word_count INT CHECK (word_count >= 0), + resource_count INT CHECK (resource_count >= 0), + biblio JSONB, + resources JSONB, + -- biblio JSON fields are similar to fatcat release schema + -- resources JSON object is a list of objects with keys like webcapture CDX schema +); + CREATE TABLE IF NOT EXISTS ingest_request ( link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1), link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), @@ -128,6 +142,7 @@ CREATE TABLE IF NOT EXISTS ingest_request ( -- ext_ids (source/source_id sometimes enough) -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN) -- edit_extra + -- ingest type can be: pdf, xml, html PRIMARY KEY (link_source, link_source_id, ingest_type, base_url) ); -- cgit v1.2.3