Diffstat (limited to 'postgresql')
-rw-r--r--  postgresql/README.md               120
-rw-r--r--  postgresql/sandcrawler_schema.sql   57
2 files changed, 177 insertions, 0 deletions
diff --git a/postgresql/README.md b/postgresql/README.md
new file mode 100644
index 0000000..4774adb
--- /dev/null
+++ b/postgresql/README.md
@@ -0,0 +1,120 @@

TL;DR: replace hbase with postgresql tables, fronted by a REST API (http://postgrest.org)

No primary storage of anything in these tables. Everything should be rapidly
re-creatable from dumps, kafka topics (compressed), CDX, petabox metadata, etc.
This is a secondary view on all of that.

## Schema

    schema/database name is 'sandcrawler'

    cdx: include revisits or not?
        id: int64, PK
        sha1hex: string, not null, index
        cdx_sha1hex: string
        url: string, not null
        datetime: ISO 8601:1988 (string?), not null
        mimetype: string
        warc_path: string (item and filename)
        warc_offset: i64
        created: datetime, index (?)
       ?crawl: string
       ?domain: string

    file_meta
        sha1hex: string, PK
        md5hex: string
        sha256hex: string
        size_bytes: i64
        mime: string (verifying file status; optional for now?)

    fatcat_file
        sha1hex: string, PK
        file_ident: string, index?
        release_ident: ?

    petabox
        id: int64, PK
        sha1hex: string, not null, index
        item: string, not null
        path: string, not null (TODO: URL encoded? separate sub-archive path?)

    grobid
        sha1hex: string, PK
        updated: datetime
        grobid_version: string
        status_code: i32
        status: string (JSONB?), only if status != 200
        metadata: JSONB; title, first author, year (not for now?)
        glutton_fatcat_release: string, index

    shadow
        sha1hex: string, PK
        shadow_corpus: string, PK
        shadow_id: string
        doi: string
        pmid: string
        isbn13: string

Alternatively, to be more like the existing system, we could have "one big
table", or multiple tables all keyed on the same column (sha1b32) with a UNIQUE
constraint. As is, every sha1 primary key column is 40 bytes of both index and
data, or 8+ GByte (combined) for each table with 100 million rows. Using raw
bytes would help, but would make all code/queries much trickier.

Should we have "created" or "updated" timestamps on all of these tables to
enable kafka tailing?

TODO:
- how to indicate a CDX sha1 vs. true sha1 mismatch? pretty rare. recrawl and delete the row from `gwb_cdx`?
- only the most recent GROBID run? or keep multiple versions? here and in minio

## Existing Stuff Sizes (estimates)

     78.5G  /user/bnewbold/journal_crawl_cdx
     19.7G  /user/bnewbold/sandcrawler/output-prod/2018-12-14-1737.00-dumpfilemeta
      2.7G  file_hashes.tsv
    228.5G  /user/bnewbold/sandcrawler/output-prod/2018-09-23-0405.30-dumpgrobidmetainsertable

## Use Cases

The core goal here is to mostly kill hbase/hadoop. What jobs are actually used there?

- backfill: load in-scope (fulltext) crawl results from CDX
    => bulk (many-line) inserts
- rowcount: "how many unique PDFs crawled?"
    => trivial SQL query
- status code count: "how much GROBID progress?"
    => trivial SQL query
- dumpungrobided: "what files still need to be processed?"
    => SQL join with a "first" on the CDX side (sketched below)
- dumpgrobidxml: "merge CDX/file info with extracted XML, for those that were successful"
    => SQL dump or rowscan, then minio fetches

These tables are, generally, "single file raw fulltext metadata".
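
A rough sketch of a few of the queries above, assuming the table and column
names from `sandcrawler_schema.sql` later in this commit (untested):

    -- rowcount: unique PDFs crawled
    SELECT COUNT(DISTINCT sha1hex) FROM cdx;

    -- status code count: GROBID progress
    SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code;

    -- dumpungrobided: one ("first") CDX row per file with no GROBID result yet
    SELECT DISTINCT ON (cdx.sha1hex)
           cdx.sha1hex, cdx.url, cdx.datetime, cdx.warc_path, cdx.warc_offset
    FROM cdx
    LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
    WHERE grobid.sha1hex IS NULL
    ORDER BY cdx.sha1hex, cdx.row_created;

DISTINCT ON is postgres-specific, but it gives the "first CDX row per sha1"
behavior without a separate sub-query.
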
+ +"Enrichment" jobs: + +- GROBID +- glutton (if not GROBID) +- extra file metadata +- match newly enriched files to fatcat + +What else? + +- track additional raw file metadata +- dump all basic GROBID metadata (title, authors, year) to attempt merge/match + +Questions we might want to answer + +- total size of PDF corpus (terabytes) +- unqiue files hit per domain + +## Prototype Plan + +- backfill all CDX crawl files (TSV transform?) +- load full GROBID XML (both into minio and into SQL) +- load full fatcat file dump (TSV transform) +- load dumpfilemeta + diff --git a/postgresql/sandcrawler_schema.sql b/postgresql/sandcrawler_schema.sql new file mode 100644 index 0000000..9e5651d --- /dev/null +++ b/postgresql/sandcrawler_schema.sql @@ -0,0 +1,57 @@ + +CREATE TABLE cdx ( +    id                  BIGSERIAL PRIMARY KEY, +    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), +    cdx_sha1hex         TEXT CHECK (octet_length(cdx_sha1hex) = 40), +    url                 TEXT NOT NULL CHECK (octet_length(url) >= 1), +    datetime            TIMESTAMP WITH TIME ZONE NOT NULL, +    mimetype            TEXT CHECK (octet_length(mimetype) >= 1), +    warc_path           TEXT CHECK (octet_length(warc_path) >= 1), +    warc_offset         BIGINT, +    row_created         TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL +); +CREATE INDEX cdx_sha1hex_idx ON cdx(sha1hex); +CREATE INDEX cdx_row_created_idx ON cdx(row_created); + +CREATE TABLE file_meta ( +    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +    sha256hex           TEXT CHECK (octet_length(sha256hex) = 64), +    md5hex              TEXT CHECK (octet_length(md5hex) = 32), +    size_bytes          BIGINT, +    mimetype            TEXT CHECK (octet_length(mimetype) >= 1) +); + +CREATE TABLE fatcat_file ( +    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +    file_ident          TEXT CHECK (octet_length(file_ident) = 26), +    first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) +); + +CREATE TABLE petabox ( +    id                  BIGSERIAL PRIMARY KEY, +    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), +    item                TEXT NOT NULL CHECK (octet_length(item) >= 1), +    path                TEXT NOT NULL CHECK (octet_length(path) >= 1) +); +CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); + +CREATE TABLE grobid ( +    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, +    grobid_version      TEXT NOT NULL CHECK (octet_length(grobid_version) >= 1), +    status_code         INT NOT NULL, +    status              TEXT CHECK (octet_length(status) >= 1), +    fatcat_release      TEXT CHECK (octet_length(fatcat_release) = 26), +    metadata            JSONB +); +-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); + +CREATE TABLE shadow ( +    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), +    shadow_corpus       TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), +    shadow_id           TEXT CHECK (octet_length(shadow_id) >= 1), +    doi                 TEXT CHECK (octet_length(doi) >= 1), +    pmid                TEXT CHECK (octet_length(pmid) >= 1), +    isbn13              TEXT CHECK (octet_length(isbn13) >= 1), +    PRIMARY KEY(sha1hex, shadow_corpus) +); | 
