From 806967ca168bcdbf2e57699703904333c21d4a2f Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 3 Nov 2020 11:26:16 -0800
Subject: html: start on SQL table

---
 sql/migrations/2019-12-19-060141_init/up.sql | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'sql')

diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 59423dd..6a8c52b 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -114,6 +114,20 @@ CREATE TABLE IF NOT EXISTS pdf_meta (
     -- encrypted
 );
 
+CREATE TABLE IF NOT EXISTS html_meta (
+    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    status              TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    has_teixml          BOOLEAN NOT NULL,
+    has_thumbnail       BOOLEAN NOT NULL,
+    word_count          INT CHECK (word_count >= 0),
+    resource_count      INT CHECK (resource_count >= 0),
+    biblio              JSONB,
+    resources           JSONB,
+    -- biblio JSON fields are similar to fatcat release schema
+    -- resources JSON object is a list of objects with keys like webcapture CDX schema
+);
+
 CREATE TABLE IF NOT EXISTS ingest_request (
     link_source             TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
     link_source_id          TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
@@ -128,6 +142,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (
     -- ext_ids (source/source_id sometimes enough)
     -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
     -- edit_extra
+    -- ingest type can be: pdf, xml, html
 
     PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
 );
-- cgit v1.2.3

From bd9075adef2733df046621ef799c3b29e00fac57 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 3 Nov 2020 11:27:16 -0800
Subject: monitoring: past-7-days summary query

---
 sql/monitoring_queries.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'sql')

diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md
index 1738731..1c872cc 100644
--- a/sql/monitoring_queries.md
+++ b/sql/monitoring_queries.md
@@ -39,6 +39,32 @@ Broken domains, past 30 days:
     ORDER BY COUNT DESC
     LIMIT 25;
 
+Summary of significant domains and status, past 7 days:
+
+    SELECT domain, status, count
+    FROM (
+        SELECT domain, status, COUNT((domain, status)) as count
+        FROM (
+            SELECT
+                ingest_file_result.ingest_type,
+                ingest_file_result.status,
+                substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+            FROM ingest_file_result
+            LEFT JOIN ingest_request
+                ON ingest_file_result.ingest_type = ingest_request.ingest_type
+                AND ingest_file_result.base_url = ingest_request.base_url
+            WHERE
+                ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
+                AND ingest_request.ingest_type = 'pdf'
+                AND ingest_request.ingest_request_source = 'fatcat-changelog'
+        ) t1
+        WHERE t1.domain != ''
+        GROUP BY CUBE (domain, status)
+    ) t2
+    WHERE count > 500
+    ORDER BY domain ASC , count DESC;
+
+
 Throughput per day, and success, for past 30 days:
 
     SELECT
         ingest_request.ingest_type,
-- cgit v1.2.3
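
The summary query above leans on GROUP BY CUBE (domain, status), which emits not only the per-(domain, status) counts but also subtotal rows with NULL in the rolled-up column(s). A minimal standalone sketch of that behavior, using a throwaway table and made-up values rather than the sandcrawler schema:

    -- hypothetical data, only to illustrate CUBE output
    CREATE TEMP TABLE example_results (domain TEXT, status TEXT);
    INSERT INTO example_results VALUES
        ('example.org', 'success'),
        ('example.org', 'no-pdf-link'),
        ('example.org', 'no-pdf-link');

    SELECT domain, status, COUNT(*) AS count
    FROM example_results
    GROUP BY CUBE (domain, status)
    ORDER BY domain ASC NULLS LAST, count DESC;

    -- returns the two per-(domain, status) rows, plus rollup rows where
    -- status is NULL (per-domain total of 3), where domain is NULL
    -- (per-status totals), and where both are NULL (grand total of 3)

In the monitoring query, those rollup rows are what provide per-domain totals alongside the per-status breakdown in a single result, with the count > 500 filter keeping only the significant rows.
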

From 55815b2e62a3ce53d5e71d0c6fd676b6cbf5baca Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 3 Nov 2020 11:31:38 -0800
Subject: SQL: unmatched glutton query (old)

---
 sql/dump_unmatched_glutton_pdf.sql | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 sql/dump_unmatched_glutton_pdf.sql

(limited to 'sql')

diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql
new file mode 100644
index 0000000..d089c7e
--- /dev/null
+++ b/sql/dump_unmatched_glutton_pdf.sql
@@ -0,0 +1,19 @@
+
+-- Run like:
+--   psql sandcrawler < THING.sql > THING.2019-09-23.json
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+    SELECT row_to_json(grobid)
+    FROM grobid
+    LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+    WHERE fatcat_file.sha1hex IS NULL
+        AND grobid.fatcat_release IS NOT NULL
+    LIMIT 1000
+)
+TO '/grande/snapshots/dump_unmatched_glutton_pdf.2020-06-30.json';
+--TO STDOUT
+--WITH NULL '';
+
+ROLLBACK;
-- cgit v1.2.3
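
The COPY ... TO '/grande/snapshots/...' form above writes the dump on the database server itself, which requires superuser-level privileges; the commented-out TO STDOUT lines hint at the client-side alternative that matches the "Run like" comment at the top of the file. A sketch of that variant, assuming the same grobid and fatcat_file tables (the output filename is illustrative):

    -- client-side variant, run as:
    --   psql sandcrawler < dump_unmatched_glutton_pdf.sql > out.json
    BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

    COPY (
        SELECT row_to_json(grobid)
        FROM grobid
        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        WHERE fatcat_file.sha1hex IS NULL
            AND grobid.fatcat_release IS NOT NULL
        LIMIT 1000
    )
    TO STDOUT;

    ROLLBACK;
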

From 644c6abdb424a3759e06df6b2541d41fb353e95c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 3 Nov 2020 16:24:16 -0800
Subject: tweak html_meta SQL schema

---
 python/sandcrawler/db.py                     | 31 +++++++++++++++++-----------
 sql/migrations/2019-12-19-060141_init/up.sql |  4 ++--
 2 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'sql')

diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 573f747..066e53b 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,6 +1,7 @@
 import json
 import datetime
+from typing import Optional
 
 import psycopg2
 import psycopg2.extras
@@ -43,12 +44,15 @@ class SandcrawlerPostgrestClient:
         else:
             return None
 
-    def get_html_meta(self, sha1):
-        resp = requests.get(self.api_url + "/html_meta", params=dict(sha1hex='eq.'+sha1))
+    def get_html_meta(self, sha1hex: str) -> Optional[dict]:
+        resp = requests.get(
+            self.api_url + "/html_meta",
+            params=dict(sha1hex=f"eq.{sha1hex}"),
+        )
         resp.raise_for_status()
-        resp = resp.json()
-        if resp:
-            return resp[0]
+        resp_json = resp.json()
+        if resp_json:
+            return resp_json[0]
         else:
             return None
 
@@ -61,12 +65,15 @@ class SandcrawlerPostgrestClient:
         else:
             return None
 
-    def get_ingest_file_result(self, url):
-        resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url))
+    def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
+        resp = requests.get(
+            self.api_url + "/ingest_file_result",
+            params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+        )
         resp.raise_for_status()
-        resp = resp.json()
-        if resp:
-            return resp[0]
+        resp_json = resp.json()
+        if resp_json:
+            return resp_json[0]
         else:
             return None
 
@@ -247,7 +254,7 @@ class SandcrawlerPostgresClient:
         """
         sql = """
             INSERT INTO
-            html_meta (sha1hex, updated, status, has_teixml, has_thumbnail, word_count, resource_count, biblio, resources)
+            html_meta (sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count, biblio, resources)
             VALUES %s
             ON CONFLICT (sha1hex) DO
         """
@@ -257,10 +264,10 @@ class SandcrawlerPostgresClient:
             sql += """
                 UPDATE SET
                     updated=EXCLUDED.updated,
                     status=EXCLUDED.status,
+                    scope=EXCLUDED.scope,
                     has_teixml=EXCLUDED.has_teixml,
                     has_thumbnail=EXCLUDED.has_thumbnail,
                     word_count=EXCLUDED.word_count,
-                    resource_count=EXCLUDED.resource_count,
                     biblio=EXCLUDED.biblio,
                     resources=EXCLUDED.resources
             """

diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 6a8c52b..73bd7f1 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -118,12 +118,12 @@ CREATE TABLE IF NOT EXISTS html_meta (
     sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
     updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
     status              TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    scope               TEXT CHECK (octet_length(status) >= 1),
     has_teixml          BOOLEAN NOT NULL,
     has_thumbnail       BOOLEAN NOT NULL,
     word_count          INT CHECK (word_count >= 0),
-    resource_count      INT CHECK (resource_count >= 0),
     biblio              JSONB,
-    resources           JSONB,
+    resources           JSONB
     -- biblio JSON fields are similar to fatcat release schema
     -- resources JSON object is a list of objects with keys like webcapture CDX schema
 );
-- cgit v1.2.3
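
With the schema tweak above, html_meta rows carry a scope column instead of resource_count. For reference, a standalone upsert against the revised table might look like the following; the sha1, scope value, and JSONB payloads are made-up illustrations, and the ON CONFLICT clause mirrors the UPDATE SET list assembled in db.py:

    INSERT INTO html_meta
        (sha1hex, status, scope, has_teixml, has_thumbnail, word_count, biblio, resources)
    VALUES (
        'da39a3ee5e6b4b0d3255bfef95601890afd80709',  -- illustrative sha1
        'success',
        'article-fulltext',                          -- illustrative scope value
        false,
        true,
        3500,
        '{"title": "An Example Article"}'::jsonb,
        '[{"surt": "org,example)/figure1.png", "status_code": 200}]'::jsonb
    )
    ON CONFLICT (sha1hex) DO UPDATE SET
        updated=EXCLUDED.updated,
        status=EXCLUDED.status,
        scope=EXCLUDED.scope,
        has_teixml=EXCLUDED.has_teixml,
        has_thumbnail=EXCLUDED.has_thumbnail,
        word_count=EXCLUDED.word_count,
        biblio=EXCLUDED.biblio,
        resources=EXCLUDED.resources;

Here the updated column is left to its now() default on insert, and EXCLUDED.updated picks up that same default on conflict, so the timestamp is refreshed either way.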