-rw-r--r--  postgresql/README.md                120
-rw-r--r--  postgresql/sandcrawler_schema.sql    57
2 files changed, 177 insertions, 0 deletions
diff --git a/postgresql/README.md b/postgresql/README.md
new file mode 100644
index 0000000..4774adb
--- /dev/null
+++ b/postgresql/README.md
@@ -0,0 +1,120 @@
+
+TL;DR: replace HBase with PostgreSQL tables fronted by a REST API (http://postgrest.org)
+
+No primary storage of anything in these tables. Everything should be rapidly
+re-creatable from dumps, kafka topics (compressed), CDX, petabox metadata, etc.
+This is a secondary view on all of that.
+
+## Schema
+
+ schema/database name is 'sandcrawler'
+
+ cdx: include revisits or not?
+ id: int64, PK
+ sha1hex: string, not null, index
+ cdx_sha1hex: string
+ url: string, not null
+ datetime: ISO 8601:1988 (string?), not null
+ mimetype: string
+ warc_path: string (item and filename)
+ warc_offset: i64
+ created: datetime, index (?)
+ ?crawl: string
+ ?domain: string
+
+ file_meta
+        sha1hex: string, PK
+ md5hex: string
+ sha256hex: string
+ size_bytes: i64
+ mime: string (verifying file status; optional for now?)
+
+ fatcat_file
+ sha1hex: string, PK
+ file_ident: string, index?
+ release_ident: ?
+
+ petabox
+ id: int64, PK
+        sha1hex: string, not null, index
+        item: string, not null
+        path: string, not null (TODO: URL encoded? separate sub-archive path?)
+
+ grobid
+ sha1hex: string, PK
+ updated: datetime
+ grobid_version (string)
+ status_code: i32
+ status: string (JSONB?), only if status != 200
+ metadata: JSONB, title, first author, year (not for now?)
+ glutton_fatcat_release: string, index
+
+ shadow
+ sha1hex: string, PK
+ shadow_corpus: string, PK
+ shadow_id: string
+ doi: string
+ pmid: string
+ isbn13: string
+
+Alternatively, to be more like the existing system, we could have "one big
+table", or multiple tables all sharing the same key (sha1b32) with a UNIQUE
+constraint. As is, every sha1 primary key column is 40 bytes of both index and
+data, or 8+ GByte (combined) for each table with 100 million rows. Using raw
+bytes could help, but makes all code/queries much trickier (see the sketch
+below).
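+
+A minimal sketch of the raw-bytes variant (hypothetical `cdx_raw` table name;
+not part of the schema file below):
+
+    -- store the 20-byte SHA-1 digest directly instead of 40 hex characters
+    CREATE TABLE cdx_raw (
+        id BIGSERIAL PRIMARY KEY,
+        sha1 BYTEA NOT NULL CHECK (octet_length(sha1) = 20),
+        url TEXT NOT NULL
+    );
+
+    -- lookups then need explicit hex decoding, eg:
+    -- SELECT * FROM cdx_raw WHERE sha1 = decode('0123456789abcdef0123456789abcdef01234567', 'hex');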
+
+Should we have "created" or "updated" timestamps on all these tables to enable
+kafka tailing?
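+
+If so, a generic trigger could keep an `updated` column current on writes, and
+consumers could poll on `row_created`/`updated` ranges. A sketch (trigger and
+function names are placeholders):
+
+    CREATE OR REPLACE FUNCTION set_row_updated() RETURNS TRIGGER AS $$
+    BEGIN
+        NEW.updated = now();
+        RETURN NEW;
+    END;
+    $$ LANGUAGE plpgsql;
+
+    CREATE TRIGGER grobid_updated BEFORE UPDATE ON grobid
+        FOR EACH ROW EXECUTE PROCEDURE set_row_updated();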
+
+TODO:
+- how to indicate a CDX sha1 vs. true sha1 mismatch? pretty rare. recrawl and delete the row from `gwb_cdx`?
+- only most recent GROBID? or keep multiple versions? here and minio
+
+## Existing Stuff Sizes (estimates)
+
+ 78.5G /user/bnewbold/journal_crawl_cdx
+ 19.7G /user/bnewbold/sandcrawler/output-prod/2018-12-14-1737.00-dumpfilemeta
+ 2.7G file_hashes.tsv
+ 228.5G /user/bnewbold/sandcrawler/output-prod/2018-09-23-0405.30-dumpgrobidmetainsertable
+
+## Use Cases
+
+The core goal here is to mostly kill HBase/Hadoop. What jobs are actually used
+there? (Rough SQL sketches for several of these follow the list.)
+
+- backfill: load in-scope (fulltext) crawl results from CDX
+ => bulk (many line) inserts
+- rowcount: "how many unique PDFs crawled?"
+ => trivial SQL query
+- status code count: "how much GROBID progress?"
+ => trivial SQL query
+- dumpungrobided: "what files still need to be processed"
+ => SQL join with a "first" on CDX side
+- dumpgrobidxml: "merge CDX/file info with extracted XML, for those that were successful"
+ => SQL dump or rowscan, then minio fetches
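+
+Rough sketches of the SQL involved, against the schema in
+sandcrawler_schema.sql (treating grobid.status_code = 200 as success):
+
+    -- rowcount: how many unique PDFs crawled?
+    SELECT COUNT(DISTINCT sha1hex) FROM cdx;
+
+    -- status code count: how much GROBID progress?
+    SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code;
+
+    -- dumpungrobided: files with no GROBID row yet, one ("first") CDX row per file
+    SELECT DISTINCT ON (cdx.sha1hex)
+           cdx.sha1hex, cdx.url, cdx.warc_path, cdx.warc_offset
+    FROM cdx
+    LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
+    WHERE grobid.sha1hex IS NULL
+    ORDER BY cdx.sha1hex, cdx.row_created;
+
+    -- dumpgrobidxml: CDX/file info for successful extractions (XML bodies live in minio)
+    SELECT grobid.sha1hex, cdx.url, grobid.grobid_version
+    FROM grobid
+    JOIN cdx ON cdx.sha1hex = grobid.sha1hex
+    WHERE grobid.status_code = 200;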
+
+These tables are generally "single file raw fulltext metadata".
+
+"Enrichment" jobs:
+
+- GROBID
+- glutton (if not GROBID)
+- extra file metadata
+- match newly enriched files to fatcat
+
+What else?
+
+- track additional raw file metadata
+- dump all basic GROBID metadata (title, authors, year) to attempt merge/match (sketched below)
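+
+A sketch of that metadata dump via JSONB operators (the key names 'title',
+'authors', and 'year' inside `metadata` are assumptions):
+
+    SELECT sha1hex,
+           metadata->>'title' AS title,
+           metadata->'authors' AS authors,
+           metadata->>'year' AS year
+    FROM grobid
+    WHERE status_code = 200 AND metadata IS NOT NULL;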
+
+Questions we might want to answer (sketched below):
+
+- total size of PDF corpus (terabytes)
+- unique files hit per domain
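+
+For example (the domain-extraction regex is a rough guess):
+
+    -- total size of PDF corpus (terabytes)
+    SELECT SUM(size_bytes) / 1e12 AS terabytes FROM file_meta;
+
+    -- unique files hit per domain
+    SELECT substring(url FROM '^https?://([^/]+)') AS domain,
+           COUNT(DISTINCT sha1hex) AS unique_files
+    FROM cdx
+    GROUP BY domain
+    ORDER BY unique_files DESC;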
+
+## Prototype Plan
+
+- backfill all CDX crawl files (TSV transform? see the COPY sketch below)
+- load full GROBID XML (both into minio and into SQL)
+- load full fatcat file dump (TSV transform)
+- load dumpfilemeta
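+
+For the TSV loads, PostgreSQL's COPY (or psql's \copy for client-side files) is
+the obvious bulk path. A sketch, with a placeholder file path and an assumed
+column order:
+
+    COPY cdx (sha1hex, cdx_sha1hex, url, datetime, mimetype, warc_path, warc_offset)
+        FROM '/srv/sandcrawler/cdx_transformed.tsv'
+        WITH (FORMAT text, NULL '');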
+
diff --git a/postgresql/sandcrawler_schema.sql b/postgresql/sandcrawler_schema.sql
new file mode 100644
index 0000000..9e5651d
--- /dev/null
+++ b/postgresql/sandcrawler_schema.sql
@@ -0,0 +1,57 @@
+
+CREATE TABLE cdx (
+ id BIGSERIAL PRIMARY KEY,
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
+ url TEXT NOT NULL CHECK (octet_length(url) >= 1),
+ datetime TIMESTAMP WITH TIME ZONE NOT NULL,
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+ warc_path TEXT CHECK (octet_length(warc_path) >= 1),
+ warc_offset BIGINT,
+ row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL
+);
+CREATE INDEX cdx_sha1hex_idx ON cdx(sha1hex);
+CREATE INDEX cdx_row_created_idx ON cdx(row_created);
+
+CREATE TABLE file_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
+ md5hex TEXT CHECK (octet_length(md5hex) = 32),
+ size_bytes BIGINT,
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1)
+);
+
+CREATE TABLE fatcat_file (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ file_ident TEXT CHECK (octet_length(file_ident) = 26),
+ first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
+);
+
+CREATE TABLE petabox (
+ id BIGSERIAL PRIMARY KEY,
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ item TEXT NOT NULL CHECK (octet_length(item) >= 1),
+ path TEXT NOT NULL CHECK (octet_length(path) >= 1)
+);
+CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
+
+CREATE TABLE grobid (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ grobid_version TEXT NOT NULL CHECK (octet_length(grobid_version) >= 1),
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+ fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
+ metadata JSONB
+);
+-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
+
+CREATE TABLE shadow (
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
+ shadow_id TEXT CHECK (octet_length(shadow_id) >= 1),
+ doi TEXT CHECK (octet_length(doi) >= 1),
+ pmid TEXT CHECK (octet_length(pmid) >= 1),
+ isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
+ PRIMARY KEY(sha1hex, shadow_corpus)
+);