about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-03 16:24:16 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-03 16:24:16 -0800
commit: 644c6abdb424a3759e06df6b2541d41fb353e95c (patch)
tree: 9b5f998ea4ded6ea9d09ee266b0e29fef8bdaddf
parent: c145488142d4b5413323322dfc1422efdece83f7 (diff)
download: sandcrawler-644c6abdb424a3759e06df6b2541d41fb353e95c.tar.gz
download: sandcrawler-644c6abdb424a3759e06df6b2541d41fb353e95c.zip
tweak html_meta SQL schema
-rw-r--r-- python/sandcrawler/db.py | 31
-rw-r--r-- sql/migrations/2019-12-19-060141_init/up.sql | 4
2 files changed, 21 insertions, 14 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 573f747..066e53b 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,6 +1,7 @@
import json
import datetime
+from typing import Optional
import psycopg2
import psycopg2.extras
@@ -43,12 +44,15 @@ class SandcrawlerPostgrestClient:
else:
return None
- def get_html_meta(self, sha1):
- resp = requests.get(self.api_url + "/html_meta", params=dict(sha1hex='eq.'+sha1))
+ def get_html_meta(self, sha1hex: str) -> Optional[dict]:
+ resp = requests.get(
+ self.api_url + "/html_meta",
+ params=dict(sha1hex=f"eq.{sha1hex}"),
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
@@ -61,12 +65,15 @@ class SandcrawlerPostgrestClient:
else:
return None
- def get_ingest_file_result(self, url):
- resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url))
+ def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
+ resp = requests.get(
+ self.api_url + "/ingest_file_result",
+ params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
@@ -247,7 +254,7 @@ class SandcrawlerPostgresClient:
"""
sql = """
INSERT INTO
- html_meta (sha1hex, updated, status, has_teixml, has_thumbnail, word_count, resource_count, biblio, resources)
+ html_meta (sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count, biblio, resources)
VALUES %s
ON CONFLICT (sha1hex) DO
"""
@@ -257,10 +264,10 @@ class SandcrawlerPostgresClient:
sql += """ UPDATE SET
updated=EXCLUDED.updated,
status=EXCLUDED.status,
+ scope=EXCLUDED.scope,
has_teixml=EXCLUDED.has_teixml,
has_thumbnail=EXCLUDED.has_thumbnail,
word_count=EXCLUDED.word_count,
- resource_count=EXCLUDED.resource_count,
biblio=EXCLUDED.biblio,
resources=EXCLUDED.resources
"""
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 6a8c52b..73bd7f1 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -118,12 +118,12 @@ CREATE TABLE IF NOT EXISTS html_meta (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ scope TEXT CHECK (octet_length(scope) >= 1),
has_teixml BOOLEAN NOT NULL,
has_thumbnail BOOLEAN NOT NULL,
word_count INT CHECK (word_count >= 0),
- resource_count INT CHECK (resource_count >= 0),
biblio JSONB,
- resources JSONB,
+ resources JSONB
-- biblio JSON fields are similar to fatcat release schema
-- resources JSON object is a list of objects with keys like webcapture CDX schema
);