diff options
| -rw-r--r-- | python/sandcrawler/db.py | 44 | ||||
| -rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 15 | 
2 files changed, 59 insertions, 0 deletions
| diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 793f1c4..573f747 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -43,6 +43,15 @@ class SandcrawlerPostgrestClient:          else:              return None +    def get_html_meta(self, sha1): +        resp = requests.get(self.api_url + "/html_meta", params=dict(sha1hex='eq.'+sha1)) +        resp.raise_for_status() +        resp = resp.json() +        if resp: +            return resp[0] +        else: +            return None +      def get_file_meta(self, sha1):          resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1))          resp.raise_for_status() @@ -232,6 +241,41 @@ class SandcrawlerPostgresClient:          resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)          return self._inserts_and_updates(resp, on_conflict) +    def insert_html_meta(self, cur, batch, on_conflict="nothing"): +        """ +        batch elements are expected to have .to_sql_tuple() method +        """ +        sql = """ +            INSERT INTO +            html_meta (sha1hex, updated, status, has_teixml, has_thumbnail, word_count, resource_count, biblio, resources) +            VALUES %s +            ON CONFLICT (sha1hex) DO +        """ +        if on_conflict.lower() == "nothing": +            sql += " NOTHING" +        elif on_conflict.lower() == "update": +            sql += """ UPDATE SET +                updated=EXCLUDED.updated, +                status=EXCLUDED.status, +                has_teixml=EXCLUDED.has_teixml, +                has_thumbnail=EXCLUDED.has_thumbnail, +                word_count=EXCLUDED.word_count, +                resource_count=EXCLUDED.resource_count, +                biblio=EXCLUDED.biblio, +                resources=EXCLUDED.resources +            """ +        else: +            raise NotImplementedError("on_conflict: {}".format(on_conflict)) +        sql += " RETURNING xmax;" +        batch = [d.to_sql_tuple() for d in batch] +        # filter out duplicate rows by key (sha1hex) +        batch_dict = dict() +        for b in batch: +            batch_dict[b[0]] = b +        batch = list(batch_dict.values()) +        resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True) +        return self._inserts_and_updates(resp, on_conflict) +      def insert_pdftrio(self, cur, batch, on_conflict="nothing"):          sql = """              INSERT INTO diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 59423dd..6a8c52b 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -114,6 +114,20 @@ CREATE TABLE IF NOT EXISTS pdf_meta (      --    encrypted  ); +CREATE TABLE IF NOT EXISTS html_meta ( +    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, +    status              TEXT CHECK (octet_length(status) >= 1) NOT NULL, +    has_teixml          BOOLEAN NOT NULL, +    has_thumbnail       BOOLEAN NOT NULL, +    word_count          INT CHECK (word_count >= 0), +    resource_count      INT CHECK (resource_count >= 0), +    biblio              JSONB, +    resources           JSONB, +    -- biblio JSON fields are similar to fatcat release schema +    -- resources JSON object is a list of objects with keys like webcapture CDX schema +); +  CREATE TABLE IF NOT EXISTS ingest_request (      link_source             TEXT NOT NULL CHECK (octet_length(link_source) >= 1),      link_source_id          TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), @@ -128,6 +142,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (      --   ext_ids (source/source_id sometimes enough)      --   fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)      --   edit_extra +    -- ingest type can be: pdf, xml, html      PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)  ); | 
