diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 16:24:16 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 16:24:16 -0800 |
commit | 644c6abdb424a3759e06df6b2541d41fb353e95c (patch) | |
tree | 9b5f998ea4ded6ea9d09ee266b0e29fef8bdaddf | |
parent | c145488142d4b5413323322dfc1422efdece83f7 (diff) | |
download | sandcrawler-644c6abdb424a3759e06df6b2541d41fb353e95c.tar.gz sandcrawler-644c6abdb424a3759e06df6b2541d41fb353e95c.zip |
tweak html_meta SQL schema
-rw-r--r-- | python/sandcrawler/db.py | 31 | ||||
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 4 |
2 files changed, 21 insertions, 14 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 573f747..066e53b 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -1,6 +1,7 @@ import json import datetime +from typing import Optional import psycopg2 import psycopg2.extras @@ -43,12 +44,15 @@ class SandcrawlerPostgrestClient: else: return None - def get_html_meta(self, sha1): - resp = requests.get(self.api_url + "/html_meta", params=dict(sha1hex='eq.'+sha1)) + def get_html_meta(self, sha1hex: str) -> Optional[dict]: + resp = requests.get( + self.api_url + "/html_meta", + params=dict(sha1hex=f"eq.{sha1hex}"), + ) resp.raise_for_status() - resp = resp.json() - if resp: - return resp[0] + resp_json = resp.json() + if resp_json: + return resp_json[0] else: return None @@ -61,12 +65,15 @@ class SandcrawlerPostgrestClient: else: return None - def get_ingest_file_result(self, url): - resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url)) + def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]: + resp = requests.get( + self.api_url + "/ingest_file_result", + params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"), + ) resp.raise_for_status() - resp = resp.json() - if resp: - return resp[0] + resp_json = resp.json() + if resp_json: + return resp_json[0] else: return None @@ -247,7 +254,7 @@ class SandcrawlerPostgresClient: """ sql = """ INSERT INTO - html_meta (sha1hex, updated, status, has_teixml, has_thumbnail, word_count, resource_count, biblio, resources) + html_meta (sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count, biblio, resources) VALUES %s ON CONFLICT (sha1hex) DO """ @@ -257,10 +264,10 @@ class SandcrawlerPostgresClient: sql += """ UPDATE SET updated=EXCLUDED.updated, status=EXCLUDED.status, + scope=EXCLUDED.scope, has_teixml=EXCLUDED.has_teixml, has_thumbnail=EXCLUDED.has_thumbnail, word_count=EXCLUDED.word_count, - resource_count=EXCLUDED.resource_count, biblio=EXCLUDED.biblio, resources=EXCLUDED.resources """ diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 6a8c52b..73bd7f1 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -118,12 +118,12 @@ CREATE TABLE IF NOT EXISTS html_meta ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + scope TEXT CHECK (octet_length(status) >= 1), has_teixml BOOLEAN NOT NULL, has_thumbnail BOOLEAN NOT NULL, word_count INT CHECK (word_count >= 0), - resource_count INT CHECK (resource_count >= 0), biblio JSONB, - resources JSONB, + resources JSONB -- biblio JSON fields are similar to fatcat release schema -- resources JSON object is a list of objects with keys like webcapture CDX schema ); |