From 2a1e99c7fd000d4f49dff06361c5214f08f0d03f Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 1 Aug 2019 18:14:30 -0700
Subject: start of postgres/postgrest notes and schema

---
 postgresql/README.md              | 120 ++++++++++++++++++++++++++++++++++++++
 postgresql/sandcrawler_schema.sql |  57 ++++++++++++++++++
 2 files changed, 177 insertions(+)
 create mode 100644 postgresql/README.md
 create mode 100644 postgresql/sandcrawler_schema.sql

diff --git a/postgresql/README.md b/postgresql/README.md
new file mode 100644
index 0000000..4774adb
--- /dev/null
+++ b/postgresql/README.md
@@ -0,0 +1,120 @@

TL;DR: replace hbase with postgresql tables plus a REST API (http://postgrest.org)

No primary storage of anything in these tables. Everything should be rapidly
re-creatable from dumps, kafka topics (compressed), CDX, petabox metadata, etc.
This is a secondary view on all of that.

## Schema

    schema/database name is 'sandcrawler'

    cdx: include revisits or not?
        id: int64, PK
        sha1hex: string, not null, index
        cdx_sha1hex: string
        url: string, not null
        datetime: ISO 8601:1988 (string?), not null
        mimetype: string
        warc_path: string (item and filename)
        warc_offset: i64
        created: datetime, index (?)
        ?crawl: string
        ?domain: string

    file_meta
        sha1hex: string, PK
        md5hex: string
        sha256hex: string
        size_bytes: i64
        mime: string (verifying file status; optional for now?)

    fatcat_file
        sha1hex: string, PK
        file_ident: string, index?
        release_ident: ?

    petabox
        id: int64, PK
        sha1hex: string, not null, index
        item: string, not null
        path: string, not null (TODO: URL encoded? separate sub-archive path?)

    grobid
        sha1hex: string, PK
        updated: datetime
        grobid_version: string
        status_code: i32
        status: string (JSONB?), only populated if status_code != 200
        metadata: JSONB; title, first author, year (not for now?)
        glutton_fatcat_release: string, index

    shadow
        sha1hex: string, PK
        shadow_corpus: string, PK
        shadow_id: string
        doi: string
        pmid: string
        isbn13: string

Alternatively, to be more like the existing system, we could have "one big
table", or multiple tables all sharing the same key (sha1b32) with a UNIQ
constraint. As is, every sha1 primary key column is 40 bytes of both index and
data, or 8+ GByte (combined) for each table with 100 million rows. Using raw
bytes instead of hex (20 bytes vs. 40) could help, but makes all code/queries
much trickier.

Should we have "created" or "updated" timestamps on all these tables to enable
kafka tailing?

TODO:
- how to indicate a CDX sha1 vs. true sha1 mis-match? pretty rare; recrawl and delete the row from `gwb_cdx`?
- only keep the most recent GROBID run, or keep multiple versions? here and in minio

## Existing Stuff Sizes (estimates)

     78.5G  /user/bnewbold/journal_crawl_cdx
     19.7G  /user/bnewbold/sandcrawler/output-prod/2018-12-14-1737.00-dumpfilemeta
      2.7G  file_hashes.tsv
    228.5G  /user/bnewbold/sandcrawler/output-prod/2018-09-23-0405.30-dumpgrobidmetainsertable

## Use Cases

Core goal here is to mostly kill hbase/hadoop. What jobs are actually used there?

- backfill: load in-scope (fulltext) crawl results from CDX
  => bulk (many line) inserts
- rowcount: "how many unique PDFs crawled?"
  => trivial SQL query
- status code count: "how much GROBID progress?"
  => trivial SQL query
- dumpungrobided: "what files still need to be processed"
  => SQL join with a "first" pick on the CDX side (see the sketch below)
- dumpgrobidxml: "merge CDX/file info with extracted XML, for those that were successful"
  => SQL dump or rowscan, then minio fetches

These tables are generally "single file raw fulltext metadata".
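As a rough sketch against the draft schema in `sandcrawler_schema.sql` (not
settled queries; in particular, `DISTINCT ON` ordered by `row_created` is just
one reasonable way to pick a "first" CDX row), the rowcount and dumpungrobided
cases could look something like:

    -- rowcount: unique PDFs crawled
    SELECT COUNT(DISTINCT sha1hex) FROM cdx;

    -- dumpungrobided: one "first" CDX row per file with no GROBID result yet
    SELECT DISTINCT ON (cdx.sha1hex)
        cdx.sha1hex, cdx.url, cdx.datetime, cdx.warc_path, cdx.warc_offset
    FROM cdx
    LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
    WHERE grobid.sha1hex IS NULL
    ORDER BY cdx.sha1hex, cdx.row_created;

The LEFT JOIN / IS NULL anti-join is one option; an equivalent NOT EXISTS
sub-query would also work.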
"Enrichment" jobs:

- GROBID
- glutton (if not GROBID)
- extra file metadata
- match newly enriched files to fatcat

What else?

- track additional raw file metadata
- dump all basic GROBID metadata (title, authors, year) to attempt merge/match

Questions we might want to answer:

- total size of PDF corpus (terabytes)
- unique files hit per domain

## Prototype Plan

- backfill all CDX crawl files (TSV transform?)
- load full GROBID XML (both into minio and into SQL)
- load full fatcat file dump (TSV transform)
- load dumpfilemeta

diff --git a/postgresql/sandcrawler_schema.sql b/postgresql/sandcrawler_schema.sql
new file mode 100644
index 0000000..9e5651d
--- /dev/null
+++ b/postgresql/sandcrawler_schema.sql
@@ -0,0 +1,57 @@

CREATE TABLE cdx (
    id              BIGSERIAL PRIMARY KEY,
    sha1hex         TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
    cdx_sha1hex     TEXT CHECK (octet_length(cdx_sha1hex) = 40),
    url             TEXT NOT NULL CHECK (octet_length(url) >= 1),
    datetime        TIMESTAMP WITH TIME ZONE NOT NULL,
    mimetype        TEXT CHECK (octet_length(mimetype) >= 1),
    warc_path       TEXT CHECK (octet_length(warc_path) >= 1),
    warc_offset     BIGINT,
    row_created     TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL
);
CREATE INDEX cdx_sha1hex_idx ON cdx(sha1hex);
CREATE INDEX cdx_row_created_idx ON cdx(row_created);

CREATE TABLE file_meta (
    sha1hex         TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
    sha256hex       TEXT CHECK (octet_length(sha256hex) = 64),
    md5hex          TEXT CHECK (octet_length(md5hex) = 32),
    size_bytes      BIGINT,
    mimetype        TEXT CHECK (octet_length(mimetype) >= 1)
);

CREATE TABLE fatcat_file (
    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
    file_ident          TEXT CHECK (octet_length(file_ident) = 26),
    first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
);

CREATE TABLE petabox (
    id              BIGSERIAL PRIMARY KEY,
    sha1hex         TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
    item            TEXT NOT NULL CHECK (octet_length(item) >= 1),
    path            TEXT NOT NULL CHECK (octet_length(path) >= 1)
);
CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);

CREATE TABLE grobid (
    sha1hex         TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
    updated         TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
    grobid_version  TEXT NOT NULL CHECK (octet_length(grobid_version) >= 1),
    status_code     INT NOT NULL,
    status          TEXT CHECK (octet_length(status) >= 1),
    fatcat_release  TEXT CHECK (octet_length(fatcat_release) = 26),
    metadata        JSONB
);
-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);

CREATE TABLE shadow (
    sha1hex         TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
    shadow_corpus   TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
    shadow_id       TEXT CHECK (octet_length(shadow_id) >= 1),
    doi             TEXT CHECK (octet_length(doi) >= 1),
    pmid            TEXT CHECK (octet_length(pmid) >= 1),
    isbn13          TEXT CHECK (octet_length(isbn13) >= 1),
    PRIMARY KEY(sha1hex, shadow_corpus)
);
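
-- Not part of the schema proper: hedged example queries for two of the
-- "trivial SQL query" use cases from the README, written against the draft
-- tables above; the real reporting queries are still to be worked out.

-- status code count: "how much GROBID progress?"
SELECT status_code, COUNT(*) AS files FROM grobid GROUP BY status_code ORDER BY status_code;

-- total size of PDF corpus, in bytes (from the "Questions" list in the README)
SELECT SUM(size_bytes) AS total_bytes FROM file_meta;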