aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-11-03 11:26:16 -0800
committerBryan Newbold <bnewbold@archive.org>2020-11-03 11:26:16 -0800
commit806967ca168bcdbf2e57699703904333c21d4a2f (patch)
treeb63b24666d9da5fda01a5ff6bd824d09d0deb950
parent206c4f6ba10417cfb463be9101d71291bd0e458c (diff)
downloadsandcrawler-806967ca168bcdbf2e57699703904333c21d4a2f.tar.gz
sandcrawler-806967ca168bcdbf2e57699703904333c21d4a2f.zip
html: start on SQL table
-rw-r--r--python/sandcrawler/db.py44
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql15
2 files changed, 59 insertions, 0 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 793f1c4..573f747 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -43,6 +43,15 @@ class SandcrawlerPostgrestClient:
else:
return None
+ def get_html_meta(self, sha1):
+ resp = requests.get(self.api_url + "/html_meta", params=dict(sha1hex='eq.'+sha1))
+ resp.raise_for_status()
+ resp = resp.json()
+ if resp:
+ return resp[0]
+ else:
+ return None
+
def get_file_meta(self, sha1):
resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1))
resp.raise_for_status()
@@ -232,6 +241,41 @@ class SandcrawlerPostgresClient:
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
+ def insert_html_meta(self, cur, batch, on_conflict="nothing"):
+ """
+ batch elements are expected to have .to_sql_tuple() method
+ """
+ sql = """
+ INSERT INTO
+ html_meta (sha1hex, updated, status, has_teixml, has_thumbnail, word_count, resource_count, biblio, resources)
+ VALUES %s
+ ON CONFLICT (sha1hex) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=EXCLUDED.updated,
+ status=EXCLUDED.status,
+ has_teixml=EXCLUDED.has_teixml,
+ has_thumbnail=EXCLUDED.has_thumbnail,
+ word_count=EXCLUDED.word_count,
+ resource_count=EXCLUDED.resource_count,
+ biblio=EXCLUDED.biblio,
+ resources=EXCLUDED.resources
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ batch = [d.to_sql_tuple() for d in batch]
+ # filter out duplicate rows by key (sha1hex)
+ batch_dict = dict()
+ for b in batch:
+ batch_dict[b[0]] = b
+ batch = list(batch_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
def insert_pdftrio(self, cur, batch, on_conflict="nothing"):
sql = """
INSERT INTO
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 59423dd..6a8c52b 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -114,6 +114,20 @@ CREATE TABLE IF NOT EXISTS pdf_meta (
-- encrypted
);
+CREATE TABLE IF NOT EXISTS html_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ has_teixml BOOLEAN NOT NULL,
+ has_thumbnail BOOLEAN NOT NULL,
+ word_count INT CHECK (word_count >= 0),
+ resource_count INT CHECK (resource_count >= 0),
+ biblio JSONB,
+ resources JSONB,
+ -- biblio JSON fields are similar to fatcat release schema
+ -- resources JSON object is a list of objects with keys like webcapture CDX schema
+);
+
CREATE TABLE IF NOT EXISTS ingest_request (
link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
@@ -128,6 +142,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (
-- ext_ids (source/source_id sometimes enough)
-- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
-- edit_extra
+ -- ingest type can be: pdf, xml, html
PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
);