Diffstat (limited to 'sql')
44 files changed, 4931 insertions, 0 deletions
diff --git a/sql/README.md b/sql/README.md new file mode 100644 index 0000000..1d53d6d --- /dev/null +++ b/sql/README.md @@ -0,0 +1,160 @@ + +TL;DR: replace hbase with postgresql tables with REST API (http://postgrest.org) + +No primary storage of anything in this table. Everything should be rapidly +re-creatable from dumps, kafka topics (compressed), CDX, petabox metadata, etc. +This is a secondary view on all of that. + +## Create Database and User + +Create system user with your username like: + + sudo su postgres + createuser -s bnewbold + +Create database using `diesel` tool (see fatcat rust docs for install notes): + + # DANGER: will delete/recreate entire database + diesel database reset + +In the future would probably be better to create a real role/password and +supply these via `DATABASE_URL` env variable. + +## Schema + + schema/database name is 'sandcrawler' + + cdx: include revisits or not? + id: int64, PK + sha1hex: string, not null, index + cdx_sha1hex: string + url: string, not null + datetime: ISO 8601:1988 (string?), not null + mimetype: string + warc_path: string (item and filename) + warc_offset: i64 + created: datetime, index (?) + ?crawl: string + ?domain: string + + file_meta + sha1hex, string, PK + md5hex: string + sha256hex: string + size_bytes: i64 + mime: string (verifying file status; optional for now?) + + fatcat_file + sha1hex: string, PK + file_ident: string, index? + release_ident: ? + + petabox + id: int64, PK + sha1hex: string, notnull, index + item: string, notnull + path: string, notnull (TODO: URL encoded? separate sub-archive path?) + + grobid + sha1hex: string, PK + updated: datetime + grobid_version (string) + status_code: i32 + status: string (JSONB?), only if status != 200 + metadata: JSONB, title, first author, year (not for now?) + glutton_fatcat_release: string, index + + shadow + sha1hex: string, PK + shadow_corpus: string, PK + shadow_id: string + doi: string + pmid: string + isbn13: string + +Alternatively, to be more like existing system could have "one big table" or +multiple tables all with same key (sha1b32) and UNIQ. As is, every sha1 pk +column is 40 bytes of both index and data, or 8+ GByte (combined) for each +table with 100 million rows. using raw bytes could help, but makes all +code/queries much trickier. + +Should we have "created" or "updated" timestamps on all these columns to enable +kafka tailing? + +TODO: +- how to indicate CDX sha1 vs. true sha1 mis-match? pretty rare. recrawl and delete row from `gwb_cdx`? +- only most recent GROBID? or keep multiple versions? here and minio + +## Existing Stuff Sizes (estimates) + + 78.5G /user/bnewbold/journal_crawl_cdx + 19.7G /user/bnewbold/sandcrawler/output-prod/2018-12-14-1737.00-dumpfilemeta + 2.7G file_hashes.tsv + 228.5G /user/bnewbold/sandcrawler/output-prod/2018-09-23-0405.30-dumpgrobidmetainsertable + +## Use Cases + +Core goal here is to mostly kill hbase/hadoop. What jobs are actually used there? + +- backfill: load in-scope (fulltext) crawl results from CDX + => bulk (many line) inserts +- rowcount: "how many unique PDFs crawled?" + => trivial SQL query +- status code count: "how much GROBID progress?" + => trivial SQL query +- dumpungrobided: "what files still need to be processed" + => SQL join with a "first" on CDX side +- dumpgrobidxml: "merge CDX/file info with extracted XML, for those that were successful" + => SQL dump or rowscan, then minio fetches + +This table is generally "single file raw fulltext metadata". 
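As a concrete sketch of the "trivial SQL query" use cases above (rowcount and GROBID status counts), assuming the `cdx` and `grobid` tables exist as in the schema sketch and connecting with `psycopg2` the same way the backfill scripts do:

    #!/usr/bin/env python3
    # Illustrative only: the "rowcount" and "GROBID progress" use cases,
    # run against the schema sketched above.
    import psycopg2

    # local domain socket, same connection style as the backfill scripts
    conn = psycopg2.connect(database="sandcrawler", user="postgres")
    cur = conn.cursor()

    # rowcount: "how many unique PDFs crawled?"
    cur.execute("SELECT COUNT(DISTINCT sha1hex) FROM cdx WHERE mimetype = 'application/pdf'")
    print("unique PDFs crawled: {}".format(cur.fetchone()[0]))

    # status code count: "how much GROBID progress?"
    cur.execute("SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY 2 DESC")
    for status_code, count in cur.fetchall():
        print("{}\t{}".format(status_code, count))

    cur.close()
    conn.close()

Either query could just as well be run directly in `psql`, or exposed read-only through the PostgREST API mentioned above.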
+ +"Enrichment" jobs: + +- GROBID +- glutton (if not GROBID) +- extra file metadata +- match newly enriched files to fatcat + +What else? + +- track additional raw file metadata +- dump all basic GROBID metadata (title, authors, year) to attempt merge/match + +Questions we might want to answer + +- total size of PDF corpus (terabytes) +- unqiue files hit per domain + +## Prototype Plan + +- backfill all CDX crawl files (TSV transform?) +- load full GROBID XML (both into minio and into SQL) +- load full fatcat file dump (TSV transform) +- load dumpfilemeta + +## Example Useful Lookups + + + http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES + http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278 + + +## Full SQL Database Dumps + +Run a dump in compressed, postgres custom format: + + export DATESLUG="`date +%Y-%m-%d.%H%M%S`" + time sudo -u postgres pg_dump --verbose --format=custom sandcrawler > sandcrawler_full_dbdump_${DATESLUG}.pgdump + +As of 2021-04-07, this process runs for about 4 hours and the compressed +snapshot is 88 GBytes (compared with 551.34G database disk consumption). + +To restore a dump (which will delete local database content, if any): + + sudo su postgres + createuser --no-login web_anon + createuser -s sandcrawler + time pg_restore --jobs=4 --verbose --clean --if-exists --create --exit-on-error -d postgres sandcrawler_full_dbdump_2021-04-08.003952.pgdump + +Took about 2.5 hours. diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md new file mode 100644 index 0000000..f1a5f86 --- /dev/null +++ b/sql/backfill/backfill.md @@ -0,0 +1,135 @@ + +SQL Backfill Notes +----------------------- + +GROBID is going to be somewhat complex. + +TODO: +x CDX backfill script (CDX to postgresql direct, bulk inserts, python) +x `file_meta` bulk insert (TSV to postgresql direct, bulk upserts, python) +x GROBID insert (python, dump TSV to minio then postgresql) + +## `cdx` + + #cat example.cdx | rg ' 200 ' | cut -d' ' -f2,3,4,6,9,10,11 + #cat example.cdx | rg ' 200 ' | awk '{print $6 "\t" $3 "\t" $2 "\t" $4 "\t\t" $6 "\t" $11 "\t" $9 "\t" $10}' | b32_hex.py | awk '{print $2 "\t" $3 "\t" $4 "\t" $1 "\t" $6 "\t" $7 "\t" $8}' > cdx.example.tsv + cat example.cdx | ./filter_transform_cdx.py > cdx.example.tsv + + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.example.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + +Big HDFS import: + + # but actually didn't import those; don't want to need to re-import + hdfs dfs -get journal_crawl_cdx/* + + cat citeseerx_crawl_2017.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.citeseerx_crawl_2017.tsv + cat gwb-pdf-20171227034923-surt-filter/* | rg ' 200 ' | ./filter_transform_cdx.py > gwb-pdf-20171227034923-surt-filter.tsv + cat UNPAYWALL-PDF-CRAWL-2018-07.filtered.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.UNPAYWALL-PDF-CRAWL-2018-07.filtered.tsv + cat MSAG-PDF-CRAWL-2017.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.MSAG-PDF-CRAWL-2017.tsv + + cat CORE-UPSTREAM-CRAWL-2018-11.sorted.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.CORE-UPSTREAM-CRAWL-2018-11.sorted.tsv + cat DIRECT-OA-CRAWL-2019.pdfs.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.DIRECT-OA-CRAWL-2019.pdfs.tsv + cat DOI-LANDING-CRAWL-2018-06.200_pdf.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.DOI-LANDING-CRAWL-2018-06.200_pdf.tsv + cat OA-JOURNAL-TESTCRAWL-TWO-2018.pdf.cdx | rg ' 200 ' | 
./filter_transform_cdx.py > cdx.OA-JOURNAL-TESTCRAWL-TWO-2018.pdf.tsv + cat SEMSCHOLAR-PDF-CRAWL-2017.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.SEMSCHOLAR-PDF-CRAWL-2017.tsv + cat TARGETED-PDF-CRAWL-2017.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.TARGETED-PDF-CRAWL-2017.tsv + cat UNPAYWALL-PDF-CRAWL-2019-04.pdfs_sorted.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.UNPAYWALL-PDF-CRAWL-2019-04.pdfs_sorted.tsv + +TODO: nasty escaping? + +In psql: + + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.UNPAYWALL-PDF-CRAWL-2018-07.filtered.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # ERROR + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.MSAG-PDF-CRAWL-2017.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # ERROR + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.citeseerx_crawl_2017.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 1653840 + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.CORE-UPSTREAM-CRAWL-2018-11.sorted.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 2827563 + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.DIRECT-OA-CRAWL-2019.pdfs.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 10651736 + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.DOI-LANDING-CRAWL-2018-06.200_pdf.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 768565 + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.OA-JOURNAL-TESTCRAWL-TWO-2018.pdf.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 5310017 + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.SEMSCHOLAR-PDF-CRAWL-2017.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 2219839 + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.TARGETED-PDF-CRAWL-2017.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # ERROR + COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.UNPAYWALL-PDF-CRAWL-2019-04.pdfs_sorted.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # ERROR + +NOTE: these largely didn't work; will need to write a batch importer. 
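The batch importer boils down to buffered multi-row inserts with conflict handling, instead of `COPY`. A minimal sketch of that approach (the real script, `backfill_cdx.py`, additionally parses and filters raw CDX lines), assuming the input is the seven-column TSV that `filter_transform_cdx.py` emits:

    #!/usr/bin/env python3
    # Sketch: read pre-transformed CDX TSV rows from stdin and insert them
    # in batches of 1000, skipping rows that conflict with existing ones.
    import sys
    import psycopg2
    import psycopg2.extras

    SQL = """
        INSERT INTO cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
        VALUES %s
        ON CONFLICT ON CONSTRAINT cdx_pkey DO NOTHING;
    """

    conn = psycopg2.connect(database="sandcrawler", user="postgres")
    cur = conn.cursor()
    batch = []
    for line in sys.stdin:
        row = line.rstrip("\n").split("\t")
        if len(row) != 7:
            continue
        # warc_csize and warc_offset columns are integers
        batch.append(row[:5] + [int(row[5]), int(row[6])])
        if len(batch) >= 1000:
            psycopg2.extras.execute_values(cur, SQL, batch)
            conn.commit()
            batch = []
    if batch:
        psycopg2.extras.execute_values(cur, SQL, batch)
        conn.commit()
    cur.close()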
+ +Batch import process: + + cat UNPAYWALL-PDF-CRAWL-2018-07.filtered.cdx MSAG-PDF-CRAWL-2017.cdx TARGETED-PDF-CRAWL-2017.cdx UNPAYWALL-PDF-CRAWL-2019-04.pdfs_sorted.cdx | ./backfill_cdx.py + # Done: Counter({'raw_lines': 123254127, 'total': 51365599, 'batches': 51365}) + +## `fatcat_file` + + zcat file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0]] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv + +In psql: + + COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # => COPY 24727350 + +## `file_meta` + + zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv + +In psql: + + COPY file_meta FROM '/sandcrawler-db/backfill/file_meta.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # -> COPY 5860092 + +## `petabox` + + zcat /fast/download/file_export.2019-07-07.json.gz | rg '//archive.org/' | pigz > /fast/download/file_export.2019-07-07.petabox.json.gz + zcat /fast/download/file_export.2019-07-07.petabox.json.gz | ./petabox_transform.py | sort -u -S 8G | awk '{print $3 "\t" $1 "\t" $2}' | uniq -s40 | awk '{print $2 "\t" $3 "\t" $1}' > petabox.fatcat_2019-07-07.tsv + +In psql: + + COPY petabox FROM '/sandcrawler-db/backfill/petabox.fatcat_2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # -> COPY 2887834 + +## `grobid` + +Quick test: + + zcat /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part-00000.gz | cut -f2 | head | ./backfill_grobid.py + +Run big batch: + + ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | cut -f2 | ./backfill_grobid.py' + # [...] + # Done: Counter({'minio-success': 161605, 'total': 161605, 'raw_lines': 161605, 'batches': 161}) + # [...] + +Was running slow with lots of iowait and 99% jdb2. This seems to be disk I/O. Going to try: + + sudo mount /dev/sdc1 /sandcrawler-minio/ -o data=writeback,noatime,nobarrier + + # -j8: 20+ M/s write, little jdb2 + # -j16: 30+ M/s write, little jdb2 + # -j12: 30+ M/s write, going with this + +For general use should go back to: + + sudo mount /dev/sdc1 /sandcrawler-minio/ -o data=noatime + + # -j4: Still pretty slow, only ~3-5 M/s disk write. jbd2 consistently at 99%, 360 K/s write + +## rough table sizes + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."cdx" | 11 GB | 8940 MB | 20 GB + "public"."shadow" | 8303 MB | 7205 MB | 15 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."file_meta" | 814 MB | 382 MB | 1196 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + [...] + diff --git a/sql/backfill/backfill_cdx.py b/sql/backfill/backfill_cdx.py new file mode 100755 index 0000000..f929502 --- /dev/null +++ b/sql/backfill/backfill_cdx.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +""" +This is a "one-time" tranform helper script for CDX backfill into sandcrawler +postgresql. + +Most of this file was copied from '../python/common.py'. 
+""" + +import json, os, sys, collections +import base64 +import psycopg2 +import psycopg2.extras + +NORMAL_MIME = ( + 'application/pdf', + 'application/postscript', + 'text/html', + 'text/xml', +) + +def normalize_mime(raw): + raw = raw.lower() + for norm in NORMAL_MIME: + if raw.startswith(norm): + return norm + + # Special cases + if raw.startswith('application/xml'): + return 'text/xml' + if raw.startswith('application/x-pdf'): + return 'application/pdf' + return None + + +def test_normalize_mime(): + assert normalize_mime("asdf") is None + assert normalize_mime("application/pdf") == "application/pdf" + assert normalize_mime("application/pdf+journal") == "application/pdf" + assert normalize_mime("Application/PDF") == "application/pdf" + assert normalize_mime("application/p") is None + assert normalize_mime("application/xml+stuff") == "text/xml" + assert normalize_mime("application/x-pdf") == "application/pdf" + assert normalize_mime("application/x-html") is None + +def b32_hex(s): + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + + +def parse_cdx_line(raw_cdx): + + cdx = raw_cdx.split() + if len(cdx) < 11: + return None + + surt = cdx[0] + dt = cdx[1] + url = cdx[2] + mime = normalize_mime(cdx[3]) + http_status = cdx[4] + key = cdx[5] + c_size = cdx[8] + offset = cdx[9] + warc = cdx[10] + + if not (key.isalnum() and c_size.isdigit() and offset.isdigit() + and http_status == "200" and len(key) == 32 and dt.isdigit() + and mime != None): + return None + + if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc): + return None + + # these are the new/specific bits + sha1 = b32_hex(key) + return dict(url=url, datetime=dt, sha1hex=sha1, cdx_sha1hex=None, mimetype=mime, warc_path=warc, warc_csize=int(c_size), warc_offset=int(offset)) + +def insert(cur, batch): + sql = """ + INSERT INTO + cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) + VALUES %s + ON CONFLICT ON CONSTRAINT cdx_pkey DO NOTHING + RETURNING 1; + """ + batch = [(d['url'], d['datetime'], d['sha1hex'], d['mimetype'], + d['warc_path'], d['warc_csize'], d['warc_offset']) + for d in batch] + res = psycopg2.extras.execute_values(cur, sql, batch) # fetch=True + #return len(res) + +def stdin_to_pg(): + # no host means it will use local domain socket by default + conn = psycopg2.connect(database="sandcrawler", user="postgres") + cur = conn.cursor() + counts = collections.Counter({'total': 0}) + batch = [] + for l in sys.stdin: + l = l.strip() + if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: + print("Progress: {}...".format(counts)) + counts['raw_lines'] += 1 + if not l: + continue + info = parse_cdx_line(l) + if not info: + continue + # XXX: filter to, eg, PDF or octet/stream (derp) + batch.append(info) + counts['total'] += 1 + if len(batch) >= 1000: + insert(cur, batch) + conn.commit() + #counts['inserted'] += i + #counts['existing'] += len(batch) - i + batch = [] + counts['batches'] += 1 + if batch: + insert(cur, batch) + #counts['inserted'] += i + #counts['existing'] += len(batch) - i + batch = [] + conn.commit() + cur.close() + print("Done: {}".format(counts)) + +if __name__=='__main__': + stdin_to_pg() diff --git a/sql/backfill/backfill_file_meta.py b/sql/backfill/backfill_file_meta.py new file mode 100755 index 0000000..e3b40a0 --- /dev/null +++ b/sql/backfill/backfill_file_meta.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +This is a 
"one-time" tranform helper script for file_meta backfill into +sandcrawler postgresql. + +Most of this file was copied from '../python/common.py'. +""" + +import json, os, sys, collections +import psycopg2 +import psycopg2.extras + + +def insert(cur, batch): + sql = """ + INSERT INTO + file_meta + VALUES %s + ON CONFLICT DO NOTHING; + """ + res = psycopg2.extras.execute_values(cur, sql, batch) + +def stdin_to_pg(): + # no host means it will use local domain socket by default + conn = psycopg2.connect(database="sandcrawler", user="postgres") + cur = conn.cursor() + counts = collections.Counter({'total': 0}) + batch = [] + for l in sys.stdin: + if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: + print("Progress: {}...".format(counts)) + counts['raw_lines'] += 1 + if not l.strip(): + continue + info = l.split("\t") + if not info: + continue + assert len(info) == 5 + info[-1] = info[-1].strip() or None + batch.append(info) + counts['total'] += 1 + if len(batch) >= 1000: + insert(cur, batch) + conn.commit() + batch = [] + counts['batches'] += 1 + if batch: + insert(cur, batch) + batch = [] + conn.commit() + cur.close() + print("Done: {}".format(counts)) + +if __name__=='__main__': + stdin_to_pg() diff --git a/sql/backfill/backfill_grobid.py b/sql/backfill/backfill_grobid.py new file mode 100755 index 0000000..08fad7f --- /dev/null +++ b/sql/backfill/backfill_grobid.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +""" +This is a "one-time" tranform helper script for GROBID backfill into +sandcrawler minio and postgresql. +""" + +import json, os, sys, collections, io +import base64 +import requests +from minio import Minio +import psycopg2 +import psycopg2.extras + + +def b32_hex(s): + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + +def insert(cur, batch): + sql = """ + INSERT INTO + grobid (sha1hex, grobid_version, status_code, status, fatcat_release, metadata) + VALUES %s + ON CONFLICT DO NOTHING; + """ + batch = [(d['sha1hex'], d['grobid_version'], d['status_code'], d['status'], d['fatcat_release'], d['metadata']) + for d in batch] + res = psycopg2.extras.execute_values(cur, sql, batch) + +def stdin_to_pg(): + mc = Minio('localhost:9000', + access_key=os.environ['MINIO_ACCESS_KEY'], + secret_key=os.environ['MINIO_SECRET_KEY'], + secure=False) + # no host means it will use local domain socket by default + conn = psycopg2.connect(database="sandcrawler", user="postgres") + cur = conn.cursor() + counts = collections.Counter({'total': 0}) + batch = [] + for l in sys.stdin: + if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: + print("Progress: {}...".format(counts)) + counts['raw_lines'] += 1 + l = l.strip() + if not l: + continue + row = json.loads(l) + if not row: + continue + sha1hex = b32_hex(row['pdf_hash']) + grobid_xml = row['tei_xml'].encode('utf-8') + grobid_xml_len = len(grobid_xml) + grobid_xml = io.BytesIO(grobid_xml) + + key = "{}/{}/{}.tei.xml".format( + sha1hex[0:2], + sha1hex[2:4], + sha1hex) + mc.put_object("grobid", key, grobid_xml, grobid_xml_len, + content_type="application/tei+xml", + metadata=None) + counts['minio-success'] += 1 + + info = dict( + sha1hex=sha1hex, + grobid_version=None, # TODO + status_code=200, + status=None, + fatcat_release=None, + metadata=None, + ) + batch.append(info) + counts['total'] += 1 + if len(batch) >= 1000: + insert(cur, batch) + conn.commit() + batch = [] + counts['batches'] += 1 + if 
batch: + insert(cur, batch) + batch = [] + conn.commit() + cur.close() + print("Done: {}".format(counts)) + +if __name__=='__main__': + stdin_to_pg() diff --git a/sql/backfill/backfill_grobid_unpaywall.py b/sql/backfill/backfill_grobid_unpaywall.py new file mode 100755 index 0000000..58e9e3c --- /dev/null +++ b/sql/backfill/backfill_grobid_unpaywall.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +""" +This is a "one-time" tranform helper script for GROBID backfill into +sandcrawler minio and postgresql. + +This variant of backfill_grobid.py pushes into the unpaywall bucket of +sandcrawler-minio and doesn't push anything to sandcrawler table in general. +""" + +import json, os, sys, collections, io +import base64 +import requests +from minio import Minio +import psycopg2 +import psycopg2.extras + + +def b32_hex(s): + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + +def stdin_to_minio(): + mc = Minio('localhost:9000', + access_key=os.environ['MINIO_ACCESS_KEY'], + secret_key=os.environ['MINIO_SECRET_KEY'], + secure=False) + counts = collections.Counter({'total': 0}) + for l in sys.stdin: + if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: + print("Progress: {}...".format(counts)) + counts['raw_lines'] += 1 + l = l.strip() + if not l: + continue + row = json.loads(l) + if not row: + continue + sha1hex = b32_hex(row['pdf_hash']) + grobid_xml = row['tei_xml'].encode('utf-8') + grobid_xml_len = len(grobid_xml) + grobid_xml = io.BytesIO(grobid_xml) + + key = "grobid/{}/{}/{}.tei.xml".format( + sha1hex[0:2], + sha1hex[2:4], + sha1hex) + mc.put_object("unpaywall", key, grobid_xml, grobid_xml_len, + content_type="application/tei+xml", + metadata=None) + counts['minio-success'] += 1 + + print("Done: {}".format(counts)) + +if __name__=='__main__': + stdin_to_minio() diff --git a/sql/backfill/filter_transform_cdx.py b/sql/backfill/filter_transform_cdx.py new file mode 100755 index 0000000..3507dfc --- /dev/null +++ b/sql/backfill/filter_transform_cdx.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +""" +This is a "one-time" tranform helper script for CDX backfill into sandcrawler +postgresql. + +Most of this file was copied from '../python/common.py'. 
+""" + +import json, os, sys +import base64 + +NORMAL_MIME = ( + 'application/pdf', + 'application/postscript', + 'text/html', + 'text/xml', +) + +def normalize_mime(raw): + raw = raw.lower() + for norm in NORMAL_MIME: + if raw.startswith(norm): + return norm + + # Special cases + if raw.startswith('application/xml'): + return 'text/xml' + if raw.startswith('application/x-pdf'): + return 'application/pdf' + return None + + +def test_normalize_mime(): + assert normalize_mime("asdf") is None + assert normalize_mime("application/pdf") == "application/pdf" + assert normalize_mime("application/pdf+journal") == "application/pdf" + assert normalize_mime("Application/PDF") == "application/pdf" + assert normalize_mime("application/p") is None + assert normalize_mime("application/xml+stuff") == "text/xml" + assert normalize_mime("application/x-pdf") == "application/pdf" + assert normalize_mime("application/x-html") is None + +def b32_hex(s): + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + + +def parse_cdx_line(raw_cdx): + + cdx = raw_cdx.split() + if len(cdx) < 11: + return None + + surt = cdx[0] + dt = cdx[1] + url = cdx[2] + mime = normalize_mime(cdx[3]) + http_status = cdx[4] + key = cdx[5] + c_size = cdx[8] + offset = cdx[9] + warc = cdx[10] + + if not (key.isalnum() and c_size.isdigit() and offset.isdigit() + and http_status == "200" and len(key) == 32 and dt.isdigit() + and mime != None): + return None + + if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc): + return None + + # these are the new/specific bits + sha1 = b32_hex(key) + return dict(url=url, datetime=dt, sha1hex=sha1, cdx_sha1hex=None, mimetype=mime, warc_path=warc, warc_csize=int(c_size), warc_offset=int(offset)) + +for l in sys.stdin: + l = l.strip() + if not l: + continue + info = parse_cdx_line(l) + if not info: + continue + print("\t".join([info['url'], info['datetime'], info['sha1hex'], info['mimetype'], info['warc_path'], str(info['warc_csize']), str(info['warc_offset'])])) + diff --git a/sql/backfill/petabox_transform.py b/sql/backfill/petabox_transform.py new file mode 100755 index 0000000..b638911 --- /dev/null +++ b/sql/backfill/petabox_transform.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +import json, sys, os + +for l in sys.stdin.readlines(): + l = l.strip() + if not l: + continue + r = json.loads(l) + if not r['sha1']: + continue + sha1hex = r['sha1'] + for url in r['urls']: + u = url['url'] + if not '//archive.org/' in u: + continue + u = u.split('/') + if u[2] == 'web.archive.org': + continue + #print(u) + assert u[2] == 'archive.org' and u[3] in ('download', 'serve') + item = u[4] + path = '/'.join(u[5:]) + print("\t".join([item, path, sha1hex])) diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql new file mode 100644 index 0000000..a7d6c2b --- /dev/null +++ b/sql/dump_file_meta.sql @@ -0,0 +1,12 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT sha1hex, row_to_json(file_meta) + FROM file_meta + ORDER BY sha1hex ASC +) +TO '/srv/sandcrawler/tasks/file_meta_dump.tsv' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_regrobid_pdf.sql b/sql/dump_regrobid_pdf.sql new file mode 100644 index 0000000..b846834 --- /dev/null +++ b/sql/dump_regrobid_pdf.sql @@ -0,0 +1,15 @@ + +-- Run like: +-- psql sandcrawler < dump_regrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf.2019-11-12.json + +BEGIN 
TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT cdx.sha1hex, row_to_json(cdx) FROM cdx + WHERE cdx.mimetype = 'application/pdf' + AND EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) +) +TO STDOUT +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql new file mode 100644 index 0000000..e7c48f3 --- /dev/null +++ b/sql/dump_regrobid_pdf_petabox.sql @@ -0,0 +1,15 @@ + +-- Run like: +-- psql sandcrawler < dump_regrobid_pdf_petabox.sql +-- cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox + WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) +) +TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql new file mode 100644 index 0000000..917d88b --- /dev/null +++ b/sql/dump_reingest_quarterly.sql @@ -0,0 +1,31 @@ + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + AND ( + ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status like 'cdx-error' + OR ingest_file_result.status like 'wayback-error' + OR ingest_file_result.status like 'wayback-content-error' + OR ingest_file_result.status like 'petabox-error' + OR ingest_file_result.status like 'gateway-timeout' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql new file mode 100644 index 0000000..a8ed72f --- /dev/null +++ b/sql/dump_reingest_spn.sql @@ -0,0 +1,25 @@ + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '2 hour'::INTERVAL + AND ingest_request.created > NOW() - '31 day'::INTERVAL + AND ingest_request.ingest_request_source = 'savepapernow-web' + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status like 'cdx-error' + -- OR ingest_file_result.status like 'wayback-error' + -- OR ingest_file_result.status like 
'wayback-content-error' + OR ingest_file_result.status like 'petabox-error' + -- OR ingest_file_result.status like 'gateway-timeout' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json'; diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql new file mode 100644 index 0000000..65800eb --- /dev/null +++ b/sql/dump_reingest_weekly.sql @@ -0,0 +1,31 @@ + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '8 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status like 'cdx-error' + -- OR ingest_file_result.status like 'wayback-error' + -- OR ingest_file_result.status like 'wayback-content-error' + OR ingest_file_result.status like 'petabox-error' + -- OR ingest_file_result.status like 'gateway-timeout' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql new file mode 100644 index 0000000..a7fb920 --- /dev/null +++ b/sql/dump_unextracted_pdf.sql @@ -0,0 +1,22 @@ + +-- Run like: +-- psql sandcrawler < dump_unextracted_pdf.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL +) +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql new file mode 100644 index 0000000..bb9f162 --- /dev/null +++ b/sql/dump_unextracted_pdf_petabox.sql @@ -0,0 +1,18 @@ + +-- Run like: +-- psql sandcrawler < dump_unextracted_pdf_petabox.sql + +BEGIN TRANSACTION ISOLATION LEVEL 
SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM grobid + LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE petabox.sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL +) +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql new file mode 100644 index 0000000..81caf18 --- /dev/null +++ b/sql/dump_ungrobid_pdf.sql @@ -0,0 +1,18 @@ + +-- Run like: +-- psql sandcrawler < dump_ungrobid_pdf.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM cdx + WHERE cdx.mimetype = 'application/pdf' + AND NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) + -- uncomment/comment this to control whether only fatcat files are included + --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex) +) +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql new file mode 100644 index 0000000..b7a1db2 --- /dev/null +++ b/sql/dump_ungrobid_pdf_petabox.sql @@ -0,0 +1,17 @@ + +-- Run like: +-- psql sandcrawler < dump_ungrobid_pdf_petabox.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM petabox + WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) + -- uncomment/comment this to control whether only fatcat files are included + AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex) +) +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql new file mode 100644 index 0000000..333ff7b --- /dev/null +++ b/sql/dump_unmatched_glutton_pdf.sql @@ -0,0 +1,19 @@ + +-- Run like: +-- psql sandcrawler < THING.sql > THING.2019-09-23.json + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(grobid) + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL + LIMIT 1000 +) +TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json'; +--TO STDOUT +--WITH NULL ''; + +ROLLBACK; diff --git a/sql/example.env b/sql/example.env new file mode 100644 index 0000000..3a13689 --- /dev/null +++ b/sql/example.env @@ -0,0 +1 @@ +DATABASE_URL="postgres://fatcat:tactaf@localhost/sandcrawler" diff --git a/sql/ingest_again.md b/sql/ingest_again.md new file mode 100644 index 0000000..b749557 --- /dev/null +++ b/sql/ingest_again.md @@ -0,0 +1,158 @@ + +## re-ingest some broken + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND ingest_file_result.hit = false + AND 
ingest_file_result.status like 'spn2-%' + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' + ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'cdx-error' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'cdx-error' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND (ingest_request.ingest_request_source != 'fatcat-changelog' + AND ingest_request.ingest_request_source != 'fatcat-ingest') + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'wayback-error' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'gateway-timeout' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'petabox-error' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json'; + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > 
reingest_spn2-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json + +Push to kafka (shuffled): + + cat reingest_spn2-error_current.json reingest_cdx-error_current.json reingest_wayback-error_current.json reingest_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + + cat reingest_gateway-timeout.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 0 + + cat reingest_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Push to kafka (not shuffled): + + cat reingest_spn2-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_cdx-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_cdx-error_bulk_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat reingest_wayback-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_gateway-timeout.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_petabox-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## just recent fatcat-ingest + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + -- AND ingest_file_result.updated > NOW() - '24 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '7 day'::INTERVAL + AND ingest_file_result.hit = false + AND (ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status like 'cdx-error' + OR ingest_file_result.status like 'gateway-timeout' + OR ingest_file_result.status like 'wayback-error' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json'; + + # note: shuf + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json + + cat reingest_fatcat_current.json | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## specific domains + +protocols.io: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url LIKE '%10.17504/protocols.io%' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + +biorxiv/medrxiv: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url LIKE '%10.1101/20%' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt new file mode 100644 index 0000000..b684400 --- /dev/null +++ b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt @@ -0,0 +1,326 @@ + doi_prefix | status | count +------------+-------------------------------+-------- + 10.1001 | | 230 + 10.1002 | | 3914 + 10.1002 | terminal-bad-status | 1540 + 10.1002 | forbidden | 1072 + 10.1002 | redirect-loop | 995 + 10.1002 | no-pdf-link | 210 + 10.1016 | | 7976 + 10.1016 | no-pdf-link | 4648 + 10.1016 | terminal-bad-status | 1778 + 10.1016 | forbidden | 622 + 10.1016 | spn2-error:too-many-redirects | 344 + 10.1016 | redirect-loop | 225 + 10.1017 | | 2040 + 10.1017 | no-pdf-link | 720 + 10.1017 | success | 441 + 10.1017 | link-loop | 371 + 10.1017 | bad-redirect | 227 + 10.1021 | | 1722 + 10.1021 | blocked-cookie | 1552 + 10.1029 | | 248 + 10.1039 | | 1160 + 10.1039 | redirect-loop | 486 + 10.1039 | spn2-error:too-many-redirects | 395 + 10.1039 | spn2-wayback-error | 213 + 10.1051 | | 695 + 10.1051 | success | 557 + 10.1055 | | 541 + 10.1055 | not-found | 295 + 10.1055 | redirect-loop | 213 + 10.1057 | | 2835 + 10.1057 | redirect-loop | 2617 + 10.1061 | | 550 + 10.1061 | spn2-error:too-many-redirects | 425 + 10.1063 | | 600 + 10.1063 | spn2-error:too-many-redirects | 328 + 10.1080 | | 3801 + 10.1080 | blocked-cookie | 2431 + 10.1080 | terminal-bad-status | 711 + 10.1080 | forbidden | 341 + 10.1081 | | 299 + 10.1081 | link-loop | 222 + 10.1089 | | 236 + 10.1089 | blocked-cookie | 228 + 10.1093 | | 12805 + 10.1093 | link-loop | 8627 + 10.1093 | redirect-loop | 1659 + 10.1093 | no-pdf-link | 1475 + 10.1093 | bad-redirect | 428 + 10.1093 | success | 391 + 10.1097 | | 1497 + 10.1097 | no-pdf-link | 503 + 10.1097 | link-loop | 346 + 10.1097 | spn2-error:too-many-redirects | 259 + 10.1097 | terminal-bad-status | 202 + 10.1101 | | 1859 + 10.1101 | redirect-loop | 993 + 10.1101 | forbidden | 703 + 10.1103 | | 597 + 10.1103 | not-found | 534 + 10.1108 | | 1055 + 10.1108 | no-pdf-link | 945 + 10.1109 | | 7067 + 10.1109 | spn2-error:too-many-redirects | 6299 + 10.1109 | success | 667 + 10.1111 | | 2099 + 10.1111 | redirect-loop | 1331 + 10.1111 | terminal-bad-status | 313 + 10.1111 | forbidden | 226 + 10.1115 | | 1278 + 10.1115 | bad-redirect | 707 + 10.1117 | | 561 + 10.1117 | spn2-error:too-many-redirects | 501 + 10.1126 | | 214 + 10.1136 | | 1989 + 10.1136 | success | 1463 + 
10.1136 | link-loop | 294 + 10.1142 | | 300 + 10.1142 | blocked-cookie | 237 + 10.1145 | | 440 + 10.1145 | blocked-cookie | 354 + 10.1155 | | 480 + 10.1155 | success | 474 + 10.11588 | | 506 + 10.11588 | no-pdf-link | 264 + 10.11588 | success | 236 + 10.1159 | | 226 + 10.11606 | | 304 + 10.1161 | | 1142 + 10.1161 | blocked-cookie | 1011 + 10.1163 | | 2261 + 10.1163 | link-loop | 1767 + 10.1163 | success | 348 + 10.11648 | | 405 + 10.11648 | success | 404 + 10.1182 | | 2125 + 10.1182 | no-pdf-link | 2024 + 10.1183 | | 987 + 10.1183 | redirect-loop | 838 + 10.1186 | | 1481 + 10.1186 | success | 1412 + 10.1201 | | 7649 + 10.1201 | link-loop | 5383 + 10.1201 | forbidden | 1504 + 10.1201 | no-pdf-link | 312 + 10.1299 | | 264 + 10.1299 | no-pdf-link | 209 + 10.13134 | | 201 + 10.1353 | | 549 + 10.1353 | terminal-bad-status | 443 + 10.1371 | | 552 + 10.1371 | success | 542 + 10.14201 | | 656 + 10.14201 | success | 366 + 10.14361 | | 647 + 10.14361 | link-loop | 585 + 10.14746 | | 260 + 10.14746 | success | 232 + 10.1504 | | 527 + 10.1504 | no-pdf-link | 501 + 10.15122 | | 246 + 10.15122 | success | 243 + 10.1515 | | 16240 + 10.1515 | link-loop | 12589 + 10.1515 | success | 1941 + 10.1515 | no-pdf-link | 1008 + 10.1515 | not-found | 283 + 10.15405 | | 229 + 10.15405 | success | 218 + 10.1553 | | 418 + 10.1553 | no-pdf-link | 396 + 10.1590 | | 655 + 10.1590 | success | 623 + 10.17104 | | 1202 + 10.17104 | no-pdf-link | 953 + 10.17104 | bad-redirect | 249 + 10.17605 | | 368 + 10.17605 | not-found | 337 + 10.17615 | | 9401 + 10.17615 | redirect-loop | 5720 + 10.17615 | spn2-wayback-error | 3099 + 10.17615 | spn2-cdx-lookup-failure | 201 + 10.17863 | | 438 + 10.18148 | | 465 + 10.18148 | success | 462 + 10.18720 | | 210 + 10.18821 | | 476 + 10.18821 | redirect-loop | 366 + 10.20345 | | 222 + 10.20345 | terminal-bad-status | 215 + 10.20546 | | 244 + 10.20546 | no-pdf-link | 241 + 10.21037 | | 232 + 10.2118 | | 903 + 10.2118 | redirect-loop | 853 + 10.21203 | | 1824 + 10.21203 | success | 1545 + 10.2139 | | 1493 + 10.2139 | link-loop | 1145 + 10.2147 | | 318 + 10.2147 | success | 267 + 10.2172 | | 282 + 10.2174 | | 363 + 10.2174 | no-pdf-link | 320 + 10.2196 | | 265 + 10.2208 | | 299 + 10.22215 | | 218 + 10.22215 | success | 217 + 10.22323 | | 289 + 10.22323 | success | 262 + 10.22533 | | 395 + 10.22533 | success | 393 + 10.22541 | | 291 + 10.22541 | success | 275 + 10.23919 | | 426 + 10.23919 | spn2-error:too-many-redirects | 403 + 10.24034 | | 319 + 10.24034 | spn2-error | 203 + 10.24355 | | 15360 + 10.24355 | no-pdf-link | 15228 + 10.24411 | | 1506 + 10.24411 | forbidden | 823 + 10.24411 | redirect-loop | 647 + 10.25335 | | 550 + 10.25335 | no-pdf-link | 550 + 10.25365 | | 429 + 10.25365 | success | 424 + 10.25384 | | 338 + 10.25384 | success | 249 + 10.25646 | | 239 + 10.26197 | no-pdf-link | 303 + 10.26197 | | 303 + 10.26226 | | 272 + 10.26278 | | 1291 + 10.26278 | redirect-loop | 756 + 10.26278 | spn2-error:too-many-redirects | 509 + 10.29327 | | 232 + 10.2991 | | 307 + 10.2991 | spn2-wayback-error | 227 + 10.30965 | | 722 + 10.30965 | link-loop | 709 + 10.3109 | | 801 + 10.3109 | link-loop | 572 + 10.3109 | forbidden | 228 + 10.31219 | | 951 + 10.31219 | redirect-loop | 518 + 10.31219 | spn2-wayback-error | 356 + 10.31274 | | 296 + 10.31743 | | 403 + 10.31743 | success | 294 + 10.31857 | | 209 + 10.3233 | | 471 + 10.33448 | | 213 + 10.33448 | success | 212 + 10.3389 | | 1459 + 10.3389 | success | 1417 + 10.3390 | | 4511 + 10.3390 | success | 3577 + 10.3390 | terminal-bad-status | 485 + 10.3390 | 
forbidden | 379 + 10.3406 | | 243 + 10.3406 | terminal-bad-status | 213 + 10.34944 | | 527 + 10.34944 | success | 459 + 10.35016 | | 688 + 10.35016 | no-pdf-link | 687 + 10.36347 | success | 213 + 10.36347 | | 213 + 10.37747 | | 213 + 10.37747 | no-pdf-link | 213 + 10.37904 | | 227 + 10.37904 | no-pdf-link | 226 + 10.3917 | | 347 + 10.3917 | redirect-loop | 208 + 10.3923 | | 356 + 10.3923 | redirect-loop | 254 + 10.3929 | | 317 + 10.3929 | terminal-bad-status | 310 + 10.3931 | | 279 + 10.3931 | no-pdf-link | 279 + 10.4000 | | 7828 + 10.4000 | success | 3485 + 10.4000 | spn2-wayback-error | 2142 + 10.4000 | redirect-loop | 2106 + 10.4018 | | 249 + 10.4018 | not-found | 240 + 10.4103 | | 726 + 10.4103 | remote-server-error | 343 + 10.4103 | redirect-loop | 324 + 10.4159 | | 286 + 10.4159 | link-loop | 238 + 10.4324 | | 19398 + 10.4324 | link-loop | 12471 + 10.4324 | forbidden | 3632 + 10.4324 | not-found | 2283 + 10.4324 | terminal-bad-status | 645 + 10.4324 | success | 208 + 10.47295 | | 456 + 10.47295 | success | 449 + 10.47513 | | 218 + 10.47513 | no-pdf-link | 203 + 10.48084 | success | 538 + 10.48084 | | 538 + 10.5040 | | 375 + 10.5040 | no-pdf-link | 365 + 10.5167 | | 290 + 10.5167 | redirect-loop | 278 + 10.5169 | | 360 + 10.5169 | no-pdf-link | 355 + 10.5194 | | 917 + 10.5194 | success | 887 + 10.5216 | | 213 + 10.5220 | no-pdf-link | 397 + 10.5220 | | 397 + 10.5281 | | 22551 + 10.5281 | terminal-bad-status | 12158 + 10.5281 | success | 4901 + 10.5281 | no-pdf-link | 4754 + 10.5281 | spn2-error:unknown | 360 + 10.5282 | | 228 + 10.5451 | | 2068 + 10.5451 | success | 1071 + 10.5451 | terminal-bad-status | 817 + 10.5753 | | 268 + 10.5753 | success | 264 + 10.5771 | | 941 + 10.5771 | no-pdf-link | 397 + 10.5771 | bad-redirect | 269 + 10.5771 | link-loop | 238 + 10.6068 | | 441 + 10.6068 | no-pdf-link | 384 + 10.6084 | | 917 + 10.6084 | no-pdf-link | 520 + 10.6084 | success | 368 + 10.7287 | | 234 + 10.7287 | no-pdf-link | 212 + 10.7312 | | 382 + 10.7312 | link-loop | 291 + 10.7554 | | 205 + 10.7891 | | 380 + 10.7891 | no-pdf-link | 376 + 10.7916 | | 331 + 10.7916 | no-pdf-link | 201 + 10.7939 | | 535 + 10.7939 | no-pdf-link | 527 + | | 272831 + | success | 62298 + | no-pdf-link | 60737 + | link-loop | 48558 + | redirect-loop | 26842 + | terminal-bad-status | 22685 + | spn2-error:too-many-redirects | 11174 + | forbidden | 10900 + | spn2-wayback-error | 7796 + | blocked-cookie | 6961 + | not-found | 5468 + | bad-redirect | 2666 + | spn2-error | 2398 + | spn2-cdx-lookup-failure | 1374 + | petabox-error | 678 + | remote-server-error | 461 + | wrong-mimetype | 443 + | spn2-error:proxy-error | 420 + | spn2-error:unknown | 360 +(323 rows) diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt new file mode 100644 index 0000000..28dd0d0 --- /dev/null +++ b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt @@ -0,0 +1,307 @@ + domain | status | count +-------------------------------------------------------------------+-------------------------------+-------- + 202.148.31.178 | | 298 + academic.oup.com | | 1624 + academic.oup.com | no-pdf-link | 673 + academic.oup.com | bad-redirect | 444 + academic.oup.com | link-loop | 358 + aip.scitation.org | | 257 + apps.crossref.org | | 1414 + apps.crossref.org | no-pdf-link | 1410 + article.sciencepublishinggroup.com | | 404 + article.sciencepublishinggroup.com | success | 404 + arxiv.org | | 24340 + arxiv.org | success | 22381 + arxiv.org | terminal-bad-status | 
1260 + arxiv.org | no-pdf-link | 412 + arxiv.org | no-capture | 262 + ashpublications.org | | 2049 + ashpublications.org | no-pdf-link | 2024 + asmedigitalcollection.asme.org | | 1245 + asmedigitalcollection.asme.org | bad-redirect | 707 + assets.researchsquare.com | | 1549 + assets.researchsquare.com | success | 1546 + bioone.org | | 201 + biorxiv.org | redirect-loop | 702 + biorxiv.org | | 702 + blogs.ethz.ch | | 687 + blogs.ethz.ch | no-pdf-link | 686 + books.openedition.org | | 446 + books.openedition.org | redirect-loop | 382 + brill.com | | 2203 + brill.com | link-loop | 1779 + brill.com | success | 359 + catalog.paradisec.org.au | | 770 + catalog.paradisec.org.au | redirect-loop | 756 + cdr.lib.unc.edu | | 9432 + cdr.lib.unc.edu | redirect-loop | 5720 + cdr.lib.unc.edu | spn2-wayback-error | 3187 + cdr.lib.unc.edu | spn2-cdx-lookup-failure | 201 + classiques-garnier.com | | 246 + classiques-garnier.com | success | 243 + content.iospress.com | | 242 + content.taylorfrancis.com | | 309 + content.taylorfrancis.com | terminal-bad-status | 309 + curve.carleton.ca | success | 201 + curve.carleton.ca | | 201 + cyberdoi.ru | redirect-loop | 647 + cyberdoi.ru | | 647 + czasopisma.kul.pl | | 402 + czasopisma.kul.pl | success | 294 + d.lib.msu.edu | | 550 + d.lib.msu.edu | no-pdf-link | 550 + d197for5662m48.cloudfront.net | success | 276 + d197for5662m48.cloudfront.net | | 276 + dergipark.org.tr | | 674 + dergipark.org.tr | no-pdf-link | 255 + dergipark.org.tr | success | 248 + digi.ub.uni-heidelberg.de | no-pdf-link | 261 + digi.ub.uni-heidelberg.de | | 261 + dl.acm.org | | 441 + dl.acm.org | blocked-cookie | 361 + dlc.library.columbia.edu | | 201 + dlc.library.columbia.edu | no-pdf-link | 201 + doi.ala.org.au | | 308 + doi.ala.org.au | no-pdf-link | 308 + doi.org | | 474 + doi.org | terminal-bad-status | 344 + downloads.hindawi.com | | 479 + downloads.hindawi.com | success | 478 + edoc.rki.de | | 238 + edoc.unibas.ch | | 2018 + edoc.unibas.ch | success | 1067 + edoc.unibas.ch | terminal-bad-status | 817 + elib.spbstu.ru | | 205 + elifesciences.org | | 204 + era.library.ualberta.ca | | 531 + era.library.ualberta.ca | no-pdf-link | 527 + erj.ersjournals.com | | 951 + erj.ersjournals.com | redirect-loop | 829 + europepmc.org | | 289 + europepmc.org | success | 283 + figshare.com | | 233 + figshare.com | no-pdf-link | 208 + fjfsdata01prod.blob.core.windows.net | | 1430 + fjfsdata01prod.blob.core.windows.net | success | 1418 + hw.oeaw.ac.at | | 283 + hw.oeaw.ac.at | no-pdf-link | 283 + idb.ub.uni-tuebingen.de | | 216 + idb.ub.uni-tuebingen.de | terminal-bad-status | 215 + ieeexplore.ieee.org | | 7561 + ieeexplore.ieee.org | spn2-error:too-many-redirects | 6732 + ieeexplore.ieee.org | success | 683 + ijgc.bmj.com | | 411 + ijgc.bmj.com | success | 399 + jamanetwork.com | | 229 + jitc.bmj.com | | 849 + jitc.bmj.com | success | 773 + journals.aps.org | | 539 + journals.aps.org | not-found | 534 + journals.lww.com | | 1124 + journals.lww.com | no-pdf-link | 547 + journals.lww.com | link-loop | 399 + journals.openedition.org | | 7366 + journals.openedition.org | success | 3484 + journals.openedition.org | spn2-wayback-error | 2120 + journals.openedition.org | redirect-loop | 1720 + journals.plos.org | | 552 + journals.plos.org | success | 542 + kiss.kstudy.com | | 306 + kiss.kstudy.com | no-pdf-link | 292 + lib.dr.iastate.edu | | 297 + link.springer.com | | 2830 + link.springer.com | redirect-loop | 2625 + linkinghub.elsevier.com | | 970 + linkinghub.elsevier.com | forbidden | 415 + 
linkinghub.elsevier.com | spn2-error:too-many-redirects | 357 + medrxiv.org | | 287 + medrxiv.org | redirect-loop | 287 + muse.jhu.edu | | 470 + muse.jhu.edu | terminal-bad-status | 443 + ojs.ub.uni-konstanz.de | | 463 + ojs.ub.uni-konstanz.de | success | 462 + onlinelibrary.wiley.com | | 2064 + onlinelibrary.wiley.com | terminal-bad-status | 1973 + osf.io | | 1394 + osf.io | redirect-loop | 589 + osf.io | spn2-wayback-error | 425 + osf.io | not-found | 342 + othes.univie.ac.at | | 424 + othes.univie.ac.at | success | 424 + oxford.universitypressscholarship.com | | 8999 + oxford.universitypressscholarship.com | link-loop | 8282 + oxford.universitypressscholarship.com | no-pdf-link | 695 + oxfordhandbooks.com | redirect-loop | 460 + oxfordhandbooks.com | | 460 + papers.ssrn.com | | 1313 + papers.ssrn.com | link-loop | 1145 + peerj.com | | 313 + peerj.com | no-pdf-link | 212 + periodicos.urca.br | | 446 + periodicos.urca.br | success | 439 + pos.sissa.it | | 277 + pos.sissa.it | success | 262 + preprints.jmir.org | | 242 + pressto.amu.edu.pl | | 260 + pressto.amu.edu.pl | success | 232 + publikationsserver.tu-braunschweig.de | | 15358 + publikationsserver.tu-braunschweig.de | no-pdf-link | 15228 + publons.com | | 2810 + publons.com | redirect-loop | 2359 + publons.com | no-pdf-link | 444 + pubs.acs.org | | 1647 + pubs.acs.org | blocked-cookie | 1553 + pubs.rsc.org | | 765 + pubs.rsc.org | redirect-loop | 486 + pubs.rsc.org | spn2-wayback-error | 214 + res.mdpi.com | | 3620 + res.mdpi.com | success | 3591 + revistas.usal.es | | 580 + revistas.usal.es | success | 298 + revues.imist.ma | | 229 + rsdjournal.org | | 213 + rsdjournal.org | success | 212 + s3-eu-west-1.amazonaws.com | | 764 + s3-eu-west-1.amazonaws.com | success | 763 + s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | | 324 + s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | success | 324 + saspublishers.com | | 213 + saspublishers.com | success | 213 + scholarshare.temple.edu | | 524 + scholarshare.temple.edu | success | 464 + sol.sbc.org.br | | 268 + sol.sbc.org.br | success | 264 + statisticaldatasets.data-planet.com | | 442 + statisticaldatasets.data-planet.com | no-pdf-link | 390 + watermark.silverchair.com | | 521 + watermark.silverchair.com | success | 514 + www.ahajournals.org | | 1061 + www.ahajournals.org | blocked-cookie | 1011 + www.atlantis-press.com | | 308 + www.atlantis-press.com | spn2-wayback-error | 228 + www.beck-elibrary.de | | 1202 + www.beck-elibrary.de | no-pdf-link | 953 + www.beck-elibrary.de | bad-redirect | 249 + www.cairn.info | | 255 + www.cairn.info | redirect-loop | 208 + www.cambridge.org | | 2061 + www.cambridge.org | no-pdf-link | 727 + www.cambridge.org | success | 485 + www.cambridge.org | link-loop | 388 + www.cambridge.org | bad-redirect | 252 + www.confer.cz | | 227 + www.confer.cz | no-pdf-link | 226 + www.dbpia.co.kr | | 773 + www.dbpia.co.kr | no-pdf-link | 679 + www.degruyter.com | | 17046 + www.degruyter.com | link-loop | 14202 + www.degruyter.com | success | 2201 + www.degruyter.com | not-found | 235 + www.dovepress.com | | 316 + www.dovepress.com | success | 267 + www.e-manuscripta.ch | | 384 + www.e-manuscripta.ch | no-pdf-link | 383 + www.e-periodica.ch | | 358 + www.e-periodica.ch | no-pdf-link | 355 + www.e-rara.ch | no-pdf-link | 279 + www.e-rara.ch | | 279 + www.e3s-conferences.org | | 426 + www.e3s-conferences.org | success | 419 + www.elibrary.ru | | 303 + www.elibrary.ru | no-pdf-link | 301 + www.emerald.com | | 943 + www.emerald.com | 
no-pdf-link | 933 + www.etasr.com | | 466 + www.etasr.com | success | 466 + www.eurekaselect.com | | 345 + www.eurekaselect.com | no-pdf-link | 321 + www.europeanproceedings.com | | 218 + www.europeanproceedings.com | success | 218 + www.finersistemas.com | success | 397 + www.finersistemas.com | | 397 + www.humankineticslibrary.com | no-pdf-link | 321 + www.humankineticslibrary.com | | 321 + www.ijcmas.com | | 251 + www.ijcmas.com | no-pdf-link | 248 + www.inderscience.com | | 524 + www.inderscience.com | no-pdf-link | 501 + www.ingentaconnect.com | | 366 + www.ingentaconnect.com | no-pdf-link | 349 + www.jstage.jst.go.jp | | 1591 + www.jstage.jst.go.jp | success | 862 + www.jstage.jst.go.jp | no-pdf-link | 567 + www.jstor.org | | 351 + www.karger.com | | 224 + www.liebertpub.com | | 236 + www.liebertpub.com | blocked-cookie | 228 + www.mdpi.com | | 694 + www.mdpi.com | terminal-bad-status | 480 + www.medlit.ru | | 458 + www.medlit.ru | redirect-loop | 366 + www.morressier.com | | 285 + www.morressier.com | no-pdf-link | 253 + www.njca.info | | 223 + www.njca.info | remote-server-error | 222 + www.nomos-elibrary.de | | 913 + www.nomos-elibrary.de | no-pdf-link | 379 + www.nomos-elibrary.de | bad-redirect | 265 + www.nomos-elibrary.de | link-loop | 236 + www.onepetro.org | | 895 + www.onepetro.org | redirect-loop | 853 + www.osti.gov | | 212 + www.persee.fr | | 232 + www.persee.fr | terminal-bad-status | 213 + www.repository.cam.ac.uk | | 439 + www.research-collection.ethz.ch | | 312 + www.research-collection.ethz.ch | terminal-bad-status | 310 + www.revistas.ufg.br | | 212 + www.schoeningh.de | | 371 + www.schoeningh.de | link-loop | 366 + www.scialert.net | | 276 + www.scialert.net | redirect-loop | 254 + www.scielo.br | | 644 + www.scielo.br | success | 624 + www.sciencedirect.com | | 6523 + www.sciencedirect.com | no-pdf-link | 4668 + www.sciencedirect.com | terminal-bad-status | 1737 + www.scitepress.org | no-pdf-link | 397 + www.scitepress.org | | 397 + www.tandfonline.com | | 3448 + www.tandfonline.com | blocked-cookie | 2446 + www.tandfonline.com | terminal-bad-status | 714 + www.taylorfrancis.com | | 21292 + www.taylorfrancis.com | link-loop | 18648 + www.taylorfrancis.com | forbidden | 2022 + www.taylorfrancis.com | terminal-bad-status | 518 + www.thieme-connect.de | | 513 + www.thieme-connect.de | not-found | 292 + www.thieme-connect.de | redirect-loop | 213 + www.whateveryoneneedstoknow.com | | 1174 + www.whateveryoneneedstoknow.com | redirect-loop | 1163 + www.worldscientific.com | | 293 + www.worldscientific.com | blocked-cookie | 240 + www.zora.uzh.ch | | 290 + www.zora.uzh.ch | redirect-loop | 278 + zenodo.org | | 22202 + zenodo.org | terminal-bad-status | 12158 + zenodo.org | success | 4923 + zenodo.org | no-pdf-link | 4788 + | | 280719 + | success | 85143 + | no-pdf-link | 61335 + | link-loop | 48566 + | redirect-loop | 26845 + | terminal-bad-status | 23955 + | spn2-wayback-error | 7920 + | spn2-error:too-many-redirects | 7175 + | blocked-cookie | 6980 + | forbidden | 2912 + | bad-redirect | 2666 + | spn2-error | 1943 + | not-found | 1762 + | spn2-cdx-lookup-failure | 1376 + | wrong-mimetype | 467 + | remote-server-error | 388 + | spn2-error:proxy-error | 295 + | no-capture | 262 +(304 rows) diff --git a/sql/migrations/00000000000000_diesel_initial_setup/down.sql b/sql/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 0000000..a9f5260 --- /dev/null +++ b/sql/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This 
file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
+DROP FUNCTION IF EXISTS diesel_set_updated_at();
diff --git a/sql/migrations/00000000000000_diesel_initial_setup/up.sql b/sql/migrations/00000000000000_diesel_initial_setup/up.sql
new file mode 100644
index 0000000..d68895b
--- /dev/null
+++ b/sql/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -0,0 +1,36 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+
+
+
+-- Sets up a trigger for the given table to automatically set a column called
+-- `updated_at` whenever the row is modified (unless `updated_at` was included
+-- in the modified columns)
+--
+-- # Example
+--
+-- ```sql
+-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
+--
+-- SELECT diesel_manage_updated_at('users');
+-- ```
+CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
+BEGIN
+    EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
+        FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
+BEGIN
+    IF (
+        NEW IS DISTINCT FROM OLD AND
+        NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
+    ) THEN
+        NEW.updated_at := current_timestamp;
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/sql/migrations/2019-12-19-060141_init/down.sql b/sql/migrations/2019-12-19-060141_init/down.sql
new file mode 100644
index 0000000..a085480
--- /dev/null
+++ b/sql/migrations/2019-12-19-060141_init/down.sql
@@ -0,0 +1,8 @@
+
+DROP TABLE IF EXISTS cdx;
+DROP TABLE IF EXISTS file_meta;
+DROP TABLE IF EXISTS fatcat_file;
+DROP TABLE IF EXISTS petabox;
+DROP TABLE IF EXISTS grobid;
+DROP TABLE IF EXISTS ingest_request;
+DROP TABLE IF EXISTS shadow;
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
new file mode 100644
index 0000000..10a5183
--- /dev/null
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -0,0 +1,184 @@
+
+-- rows *may* be revisit records; indicated by mimetype == "warc/revisit"
+-- records are implied to be 200 status (or 226 for ftp); either direct hits or
+-- revisits
+-- there is nothing to prevent duplicate hits. e.g., same sha1, same url, many
+-- datetimes. import scripts should take care to reduce this sort of
+-- duplication though. one row per *domain*/sha1hex pair is a good guideline.
+-- all ingest result url/dt pairs should be included though.
+-- any mimetype is allowed, but presumption should be that actual body is full
+-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each
+-- only a part of work). URLs that are parts of a fileset are allowed.
+CREATE TABLE IF NOT EXISTS cdx (
+    url TEXT NOT NULL CHECK (octet_length(url) >= 1),
+    datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
+    -- sha1hex/cdx_sha1hex difference is intended to help with difference between
+    -- CDX hash (which is transport encoded body) vs. actual body. Probably need to
+    -- include both for all records?
+    sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+    cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
+    mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+    -- TODO: enforce that only paths with '/' (item+file) should be included?
+    warc_path TEXT CHECK (octet_length(warc_path) >= 1),
+    warc_csize BIGINT,
+    warc_offset BIGINT,
+    row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    PRIMARY KEY(url, datetime)
+);
+CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
+-- TODO: remove this index? not currently used
+CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
+
+-- TODO: require all fields. if mimetype unknown, should be octet-stream
+CREATE TABLE IF NOT EXISTS file_meta (
+    sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
+    md5hex TEXT CHECK (octet_length(md5hex) = 32),
+    size_bytes BIGINT,
+    mimetype TEXT CHECK (octet_length(mimetype) >= 1)
+);
+CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex);
+
+CREATE TABLE IF NOT EXISTS fatcat_file (
+    sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    file_ident TEXT CHECK (octet_length(file_ident) = 26),
+    first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
+);
+
+CREATE TABLE IF NOT EXISTS petabox (
+    item TEXT NOT NULL CHECK (octet_length(item) >= 1),
+    path TEXT NOT NULL CHECK (octet_length(path) >= 1),
+    sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+    PRIMARY KEY(item, path)
+);
+CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
+
+CREATE TABLE IF NOT EXISTS grobid (
+    sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
+    status_code INT NOT NULL,
+    status TEXT CHECK (octet_length(status) >= 1),
+    fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
+    -- extracted basic biblio metadata:
+    --   title
+    --   authors[]
+    --     full/display
+    --     given_name
+    --     surname
+    --     affiliation
+    --   year
+    --   journal_issn
+    --   journal_name
+    --   refs_count
+    metadata JSONB
+);
+-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
+
+CREATE TABLE IF NOT EXISTS pdftrio (
+    sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    status_code INT NOT NULL,
+    status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1),
+    models_date DATE,
+    ensemble_score REAL,
+    bert_score REAL,
+    linear_score REAL,
+    image_score REAL
+);
+
+CREATE TABLE IF NOT EXISTS pdf_meta (
+    sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    has_page0_thumbnail BOOLEAN NOT NULL,
+    page_count INT CHECK (page_count >= 0),
+    word_count INT CHECK (word_count >= 0),
+    page0_height REAL CHECK (page0_height >= 0),
+    page0_width REAL CHECK (page0_width >= 0),
+    permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
+    pdf_created TIMESTAMP WITH TIME ZONE,
+    pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
+    metadata JSONB
+    -- maybe some analysis of available fields?
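+    -- e.g., once rows are populated, individual fields can be pulled out of
+    -- the JSONB column with something like (a sketch; the sha1 value is a
+    -- placeholder):
+    --   SELECT metadata->>'title' FROM pdf_meta WHERE sha1hex = '<sha1hex>';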
+    -- metadata JSON fields:
+    --   title
+    --   subject
+    --   author
+    --   creator
+    --   producer
+    --   CrossMarkDomains
+    --   doi
+    --   form
+    --   encrypted
+);
+
+CREATE TABLE IF NOT EXISTS html_meta (
+    sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    scope TEXT CHECK (octet_length(scope) >= 1),
+    has_teixml BOOLEAN NOT NULL,
+    has_thumbnail BOOLEAN NOT NULL,
+    word_count INT CHECK (word_count >= 0),
+    biblio JSONB,
+    resources JSONB
+    -- biblio JSON fields are similar to fatcat release schema
+    -- resources JSON object is a list of objects with keys like webcapture CDX schema
+);
+
+CREATE TABLE IF NOT EXISTS ingest_request (
+    link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
+    link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
+    ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+    base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+    ingest_request_source TEXT CHECK (octet_length(ingest_request_source) >= 1),
+    created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    release_stage TEXT CHECK (octet_length(release_stage) >= 1),
+    request JSONB,
+    -- request isn't required, but can stash extra fields there for import, e.g.:
+    --   ext_ids (source/source_id sometimes enough)
+    --   fatcat_release (if ext_ids and source/source_id not specific enough; e.g. SPN)
+    --   edit_extra
+    -- ingest type can be: pdf, xml, html
+
+    PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
+);
+CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
+
+CREATE TABLE IF NOT EXISTS ingest_file_result (
+    ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+    base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+    updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    hit BOOLEAN NOT NULL,
+    status TEXT CHECK (octet_length(status) >= 1),
+    terminal_url TEXT CHECK (octet_length(terminal_url) >= 1),
+    terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14),
+    terminal_status_code INT,
+    terminal_sha1hex TEXT CHECK (octet_length(terminal_sha1hex) = 40),
+
+    PRIMARY KEY (ingest_type, base_url)
+);
+CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
+CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
+
+CREATE TABLE IF NOT EXISTS shadow (
+    shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
+    shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
+    sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+    doi TEXT CHECK (octet_length(doi) >= 1),
+    pmid TEXT CHECK (octet_length(pmid) >= 1),
+    isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
+    PRIMARY KEY(shadow_corpus, shadow_id)
+);
+CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
+
+CREATE TABLE IF NOT EXISTS crossref (
+    doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
+    indexed TIMESTAMP WITH TIME ZONE NOT NULL,
+    record JSON NOT NULL,
+    PRIMARY KEY(doi)
+);
diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md
new file mode 100644
index 0000000..0859e79
--- /dev/null
+++ b/sql/monitoring_queries.md
@@ -0,0 +1,202 @@
+
+## fatcat-changelog pipeline
+
+Overall ingest status, past 30 days:
+
+    SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+    FROM ingest_file_result
+    LEFT JOIN ingest_request
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + +Broken domains, past 30 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + +Summary of significant domains and status, past 7 days: + + SELECT domain, status, count + FROM ( + SELECT domain, status, COUNT((domain, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + GROUP BY CUBE (domain, status) + ) t2 + WHERE count > 200 + ORDER BY domain ASC , count DESC; + +Summary of DOI prefix and status, past 7 days: + + SELECT doi_prefix, status, count + FROM ( + SELECT doi_prefix, status, COUNT((doi_prefix, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_request.link_source_id FROM '(10\.[^/]*)/.*') AS doi_prefix + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_request.link_source = 'doi' + ) t1 + WHERE t1.doi_prefix != '' + GROUP BY CUBE (doi_prefix, status) + ) t2 + WHERE count > 200 + ORDER BY doi_prefix ASC , count DESC; + + +Throughput per day, and success, for past 30 days: + + SELECT ingest_request.ingest_type, + date(ingest_request.created), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created) + ORDER BY date(ingest_request.created) DESC; + +## fatcat-ingest + +Broken domains, past 7 days: + + SELECT domain, status, 
COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '7 day'::INTERVAL + ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + +Throughput per day, and success, for past 7 days: + + SELECT ingest_request.ingest_type, + date(ingest_file_result.updated), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '7 day'::INTERVAL + ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated) + ORDER BY date(ingest_file_result.updated) DESC; + +Overall status, updated requests past 3 days: + + SELECT ingest_request.ingest_type, + ingest_file_result.status, + COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_file_result.updated >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '48 hour'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + GROUP BY ingest_request.ingest_type, ingest_file_result.status + ORDER BY COUNT(*) DESC; + +## savepapernow and fatcat-ingest recent status + +Specific recent ingests (for debugging): + + -- for record layout: \x + SELECT + ingest_file_result.status as status, + ingest_request.ingest_type as ingest_type, + ingest_request.ingest_request_source as source, + ingest_request.link_source_id as source_id, + ingest_request.base_url as base_url, + ingest_file_result.terminal_dt as dt, + ingest_file_result.terminal_status_code as status_code, + ingest_file_result.terminal_sha1hex as sha1hex, + grobid.status as grobid_status + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid + ON ingest_file_result.terminal_sha1hex = grobid.sha1hex + WHERE + ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL + -- AND ingest_request.ingest_type = 'pdf' + -- AND ingest_request.ingest_type = 'html' + AND ( + ingest_request.ingest_request_source = 'savepapernow-web' + -- OR ingest_request.ingest_request_source = 'fatcat-ingest' + ) + ORDER BY ingest_file_result.updated DESC + LIMIT 100; + diff --git a/sql/pdftrio_queries.md b/sql/pdftrio_queries.md new file mode 100644 index 0000000..06f718c --- /dev/null +++ b/sql/pdftrio_queries.md @@ -0,0 +1,65 @@ + +## Counts / Status + + SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code; + + # NOTE: I earlier deleted 
a large fraction of non-200 status codes, so + # these aren't representative + status_code | count + -------------+--------- + -4 | 16 + -2 | 26 + 200 | 1117501 + 400 | 2695 + (4 rows) + + + SELECT status, COUNT(*) FROM pdftrio GROUP BY status; + + status | count + ---------------+--------- + error | 2696 + error-connect | 26 + error-timeout | 16 + success | 1118252 + (4 rows) + + SELECT + COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count, + COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count, + COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count, + COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count + FROM pdftrio; + + + ensemble_count | linear_count | bert_count | image_count + ----------------+--------------+------------+------------- + 1120100 | 976271 | 66209 | 143829 + (1 row) + +## Histograms + + SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND ensemble_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND bert_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND linear_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND image_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + diff --git a/sql/random_queries.md b/sql/random_queries.md new file mode 100644 index 0000000..572b4f9 --- /dev/null +++ b/sql/random_queries.md @@ -0,0 +1,193 @@ + +Basic stats (2019-09-23): + + SELECT COUNT(*) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex); + => 28,023,760 + => Time: 253897.213 ms (04:13.897) + + SELECT COUNT(DISTINCT sha1hex) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex); + => 22,816,087 + => Time: 287097.944 ms (04:47.098) + + SELECT COUNT(*) FROM grobid. 
+ => 56,196,992 + + SELECT COUNT(DISTINCT sha1hex) FROM cdx; + => 64,348,277 + => Time: 572383.931 ms (09:32.384) + + SELECT COUNT(*) FROM cdx; + => 74,796,777 + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; + => Time: 189067.335 ms (03:09.067) + + mimetype | count + ------------------------+---------- + application/pdf | 51049905 + text/html | 24841846 + text/xml | 524682 + application/postscript | 81009 + (4 rows) + +Time: 189067.335 ms (03:09.067) + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY count(*) DESC; + + status_code | count + -------------+---------- + 200 | 56196992 + + compare with older sandcrawler/output-prod/2019-05-28-1920.35-statuscodecount: + + 200 49567139 + 400 3464503 + 409 691917 + 500 247028 + 503 123 + + SELECT row_to_json(cdx) FROM cdx LIMIT 5; + + SELECT row_to_json(r) FROM ( + SELECT url, datetime FROM cdx + ) r + LIMIT 5; + +More stats (2019-12-27): + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 20; + + SELECT SUM(size_bytes) FROM file_meta; + +"Last 24 hour progress": + + # "problem domains" and statuses + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.updated >= NOW() - '1 day'::INTERVAL + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + + # "what type of errors" + SELECT ingest_type, status, COUNT(*) + FROM ingest_file_result + WHERE updated >= NOW() - '1 day'::INTERVAL + GROUP BY ingest_type, status + ORDER BY COUNT DESC + LIMIT 25; + + # "throughput per day for last N days" + SELECT ingest_type, + date(updated), + COUNT(*) as total, + COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + WHERE updated >= NOW() - '1 month'::INTERVAL + GROUP BY ingest_type, date(updated) + ORDER BY date(updated) DESC; + +## Parse URLs + +One approach is to do regexes, something like: + + SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name; + +Eg: + + SELECT DISTINCT(domain), COUNT(domain) + FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 10; + +Or: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + +Can also do some quick lookups for a specific domain and protocol like: + + SELECT * + FROM ingest_file_result + WHERE terminal_url LIKE 'https://insights.ovid.com/%' + LIMIT 10; + +For a given DOI prefix: + + SELECT * + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/a%' + AND status = 'no-pdf-link' + LIMIT 10; + + SELECT status, count(*) + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/%' + GROUP BY status + ORDER BY count(*) DESC; + +## Bulk Ingest + +Show bulk ingest status on links *added* in the past week: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND 
ingest_request.link_source = 'unpaywall' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 25; + +Top *successful* domains: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 20; + +Summarize non-success domains for the same: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 20; diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh new file mode 100755 index 0000000..20fd82b --- /dev/null +++ b/sql/reingest_quarterly.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_quarterly_current.json + +cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \ + | shuf \ + | head -n120000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh new file mode 100755 index 0000000..6fb1e4b --- /dev/null +++ b/sql/reingest_spn.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_spn.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \ + > /srv/sandcrawler/tasks/reingest_spn.json + +cat /srv/sandcrawler/tasks/reingest_spn.json \ + | shuf \ + | head -n60000 \ + | jq . 
-c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh new file mode 100755 index 0000000..04ce39d --- /dev/null +++ b/sql/reingest_weekly.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_weekly_current.json + +cat /srv/sandcrawler/tasks/reingest_weekly_current.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/sandcrawler_schema.sql b/sql/sandcrawler_schema.sql new file mode 120000 index 0000000..a3756d4 --- /dev/null +++ b/sql/sandcrawler_schema.sql @@ -0,0 +1 @@ +migrations/2019-12-19-060141_init/up.sql
\ No newline at end of file diff --git a/sql/stats/2020-01-13_stats.txt b/sql/stats/2020-01-13_stats.txt new file mode 100644 index 0000000..444e448 --- /dev/null +++ b/sql/stats/2020-01-13_stats.txt @@ -0,0 +1,190 @@ + +## SQL Table Sizes + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 36 GB | 78 GB + "public"."grobid" | 38 GB | 7076 MB | 45 GB + "public"."file_meta" | 23 GB | 11 GB | 34 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."ingest_file_result" | 566 MB | 749 MB | 1314 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + "public"."ingest_request" | 363 MB | 625 MB | 988 MB + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + + total_count | total_size + -------------+----------------- + 118823340 | 140917467253923 + (1 row) + + # 118,823,340 => 118 million + # 140,917,467,253,923 => ~141 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 117185567 + | 1509149 + application/octet-stream | 87783 + text/html | 9901 + application/postscript | 3781 + application/vnd.ms-powerpoint | 1421 + text/plain | 1151 + application/xml | 427 + application/gzip | 414 + application/msword | 314 + (10 rows) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 96141851 | 110030179 + (1 row) + + # 96,141,851 + # 110,030,179 + +Top mimetypes (not unique by sha1): + + mimetype | count + ------------------------+---------- + application/pdf | 84582642 + text/html | 24841846 + text/xml | 524682 + application/postscript | 81009 + (4 rows) + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + unique_releases | total + -----------------+---------- + 13675190 | 59919772 + + # 13,675,190 + # 59,919,772 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 57382904 + 500 | 2536862 + 503 | 6 + (3 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 41699385 + | 15683279 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + + # 2,868,825 + # 2,887,834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 2816171 + pdf | arxiv | 154448 + pdf | spn | 55 + pdf | pubmed | 2 + (4 rows) + +Uncrawled requests by source: + + # TODO: verify this? 
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + + ingest_type | link_source | count + -------------+-------------+------- + (0 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-------------+----------+--------+---------- + pdf | doi | 2816171 | 289199 | 0.103 + pdf | arxiv | 154448 | 41105 | 0.266 + pdf | spn | 55 | 46 | 0.836 + pdf | pubmed | 2 | 0 | 0.000 + (4 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+---------------------+--------- + pdf | no-pdf-link | 2213720 + pdf | success | 330492 + pdf | spn-remote-error | 182157 + pdf | spn-error | 141222 + pdf | cdx-error | 83131 + pdf | link-loop | 11350 + pdf | other-mimetype | 6089 + pdf | null-body | 1980 + pdf | terminal-bad-status | 583 + pdf | wayback-error | 381 + (10 rows) + diff --git a/sql/stats/2020-01-31_supplement.txt b/sql/stats/2020-01-31_supplement.txt new file mode 100644 index 0000000..6bd43ea --- /dev/null +++ b/sql/stats/2020-01-31_supplement.txt @@ -0,0 +1,42 @@ + +How many file_meta still missing core metadata? + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + => 1,130,915 + +Great! Not many. + +And are in petabox? + + SELECT COUNT(*) + FROM file_meta + LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex + WHERE file_meta.sha256hex IS NULL + AND file_meta.sha1hex IS NOT NULL; + => 1,149,194 + +Almost all; maybe just some CDX fetch failures or something in there. So, +should run these on, eg, grobid2-vm. 
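+
+Side note: the `COPY ... TO '<path>'` command below runs on the database server
+and needs superuser (or equivalent) privileges there. A client-side equivalent
+(a sketch; the output filename is just an example) would use psql's `\copy`:
+
+    psql sandcrawler -c "\copy (SELECT row_to_json(petabox.*) FROM file_meta LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex WHERE file_meta.sha256hex IS NULL AND file_meta.sha1hex IS NOT NULL) to 'dump_grobid_petabox_todo.json'"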
+ + COPY ( + SELECT row_to_json(petabox.*) + FROM file_meta + LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex + WHERE file_meta.sha256hex IS NULL + AND file_meta.sha1hex IS NOT NULL + ) TO '/grande/snapshots/dump_grobid_petabox_todo.json'; + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file` (note: `fatcat_file` is out of date by a +couple million files): + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | count + -------------+--------- + 5072452 | 4130405 + diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt new file mode 100644 index 0000000..e7a00e8 --- /dev/null +++ b/sql/stats/2020-02-24_stats.txt @@ -0,0 +1,482 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + Size: 271.83G + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 36 GB | 78 GB + "public"."grobid_shadow" | 61 GB | 6553 MB | 68 GB + "public"."grobid" | 47 GB | 7213 MB | 54 GB + "public"."file_meta" | 26 GB | 12 GB | 38 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."ingest_file_result" | 1831 MB | 2454 MB | 4285 MB + "public"."ingest_request" | 2006 MB | 2122 MB | 4128 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + "public"."pdftrio" | 78 MB | 64 MB | 142 MB + (10 rows) + + +## File Metadata + +(skipping, no update) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +Processed or not: + + # TODO: + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + unique_releases | total + -----------------+---------- + 15,632,810 | 76,555,791 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 70656028 + 500 | 5896836 + -4 | 2295 + 503 | 111 + (4 rows) + + What version used? 
+ + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 56001631 + | 14654496 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2,868,825 | 2,887,834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 6591633 + pdf | pmc | 2030279 + pdf | arxiv | 630743 + pdf | unpaywall | 1400 + pdf | spn | 82 + pdf | pubmed | 2 + (6 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-------------+-------------------------+--------- + pdf | doi | fatcat-ingest-container | 3515873 + pdf | doi | | 2943896 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | | 629719 + pdf | doi | fatcat-changelog | 129932 + pdf | doi | fatcat-ingest | 1935 + pdf | pmc | | 1454 + pdf | unpaywall | unpaywall | 1400 + pdf | arxiv | fatcat-ingest | 998 + pdf | spn | | 64 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | spn | savepapernow-web | 18 + pdf | pubmed | | 2 + pdf | doi | savepapernow-web | 1 + (14 rows) + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; + => UPDATE 2943896 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; + => UPDATE 64 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; + => UPDATE 629719 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; + => UPDATE 1454 + + SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed'; + DELETE FROM ingest_request WHERE link_source = 'pubmed'; + => DELETE 2 + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 6591637 + pdf | pmc | 2030279 + pdf | arxiv | 630743 + pdf | unpaywall | 1400 + pdf | spn | 82 + (5 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + 
-------------+-------------+-------------------------+--------- + pdf | doi | fatcat-ingest-container | 3515873 + pdf | doi | fatcat-changelog | 3073828 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630717 + pdf | doi | fatcat-ingest | 1935 + pdf | pmc | fatcat-ingest | 1454 + pdf | unpaywall | unpaywall | 1400 + pdf | spn | savepapernow-web | 82 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 1 + (10 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + none? + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-------------+----------+---------+---------- + pdf | doi | 6591637 | 1622702 | 0.246 + pdf | pmc | 2030279 | 1241836 | 0.612 + pdf | arxiv | 630743 | 500620 | 0.794 + pdf | unpaywall | 1400 | 851 | 0.608 + pdf | spn | 82 | 62 | 0.756 + (5 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------------+--------- + pdf | success | 3366189 + pdf | no-pdf-link | 2902620 + pdf | no-capture | 1672025 + pdf | redirect-loop | 388844 + pdf | cdx-error | 272780 + pdf | terminal-bad-status | 171878 + pdf | spn-remote-error | 163843 + pdf | spn-error | 108070 + pdf | null-body | 66778 + pdf | link-loop | 43403 + pdf | skip-url-blocklist | 34705 + pdf | wrong-mimetype | 31343 + pdf | wayback-error | 13012 + pdf | spn2-cdx-lookup-failure | 6100 + pdf | gateway-timeout | 5633 + pdf | other-mimetype | 5114 + pdf | spn2-error:proxy-error | 538 + pdf | spn2-error:job-failed | 470 + pdf | petabox-error | 415 + pdf | spn2-error:browser-running-error | 136 + pdf | spn2-error | 127 + pdf | spn2-error:soft-time-limit-exceeded | 71 + pdf | bad-redirect | 39 + pdf | spn2-error:unknown | 30 + pdf | spn2-error:browsing-timeout | 25 + pdf | pending | 3 + pdf | invalid-host-resolution | 1 + (27 rows) + + +## Fatcat Files + +(skipping, no update) + +## Recent Success/Failure of Ingest by Domain + +NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a +small fraction of this is from changelog. 
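+
+The domain breakdowns below pull the hostname out of `terminal_url` with a
+POSIX regex. A quick sanity check of that pattern against a single URL (the
+URL here is just an example) looks like:
+
+    SELECT substring('https://example.com/article/123.pdf' FROM '[^/]+://([^/]*)');
+    -- => example.com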
+ + # "problem domains" and statuses + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.updated >= NOW() - '1 day'::INTERVAL + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + + domain | status | count + -------------------------+----------------+------- + linkinghub.elsevier.com | no-capture | 2579 + www.mdpi.com | wrong-mimetype | 1313 + onlinelibrary.wiley.com | no-pdf-link | 785 + americanarchivist.org | no-pdf-link | 756 + journals.sagepub.com | redirect-loop | 503 + link.springer.com | redirect-loop | 432 + iopscience.iop.org | no-capture | 392 + www.tandfonline.com | no-pdf-link | 389 + pubs.rsc.org | no-capture | 361 + www.persee.fr | no-capture | 344 + (10 rows) + + + # "what type of errors" + SELECT ingest_type, status, COUNT(*) + FROM ingest_file_result + WHERE updated >= NOW() - '1 day'::INTERVAL + GROUP BY ingest_type, status + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+------- + pdf | success | 40578 + pdf | cdx-error | 14982 + pdf | no-capture | 7747 + pdf | no-pdf-link | 7111 + pdf | redirect-loop | 3265 + pdf | wrong-mimetype | 1629 + pdf | spn2-cdx-lookup-failure | 657 + pdf | link-loop | 538 + pdf | null-body | 517 + pdf | terminal-bad-status | 400 + pdf | wayback-error | 79 + pdf | spn2-error:job-failed | 53 + pdf | gateway-timeout | 38 + pdf | spn2-error:soft-time-limit-exceeded | 7 + pdf | spn2-error | 6 + pdf | petabox-error | 5 + pdf | spn2-error:browsing-timeout | 4 + pdf | spn2-error:unknown | 2 + pdf | bad-redirect | 1 + pdf | pending | 1 + (20 rows) + + # "throughput per day for last N days" + SELECT ingest_type, + date(updated), + COUNT(*) as total, + COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + WHERE updated >= NOW() - '1 month'::INTERVAL + GROUP BY ingest_type, date(updated) + ORDER BY date(updated) DESC; + + ingest_type | date | total | success + -------------+------------+---------+--------- + pdf | 2020-02-25 | 32660 | 14322 + pdf | 2020-02-24 | 44967 | 26263 + pdf | 2020-02-23 | 58795 | 18874 + pdf | 2020-02-22 | 844249 | 272606 + pdf | 2020-02-21 | 1287378 | 433487 + pdf | 2020-02-20 | 1455943 | 492408 + pdf | 2020-02-19 | 21453 | 7529 + pdf | 2020-02-18 | 5863 | 2926 + pdf | 2020-02-17 | 3737 | 970 + pdf | 2020-02-16 | 13779 | 4862 + pdf | 2020-02-15 | 1021020 | 623020 + pdf | 2020-02-14 | 1036036 | 632830 + pdf | 2020-02-13 | 13503 | 5824 + pdf | 2020-02-12 | 20078 | 11422 + pdf | 2020-02-11 | 13499 | 6781 + pdf | 2020-02-10 | 2275 | 961 + pdf | 2020-02-09 | 3231 | 1494 + pdf | 2020-02-08 | 8967 | 4400 + pdf | 2020-02-07 | 7022 | 2430 + pdf | 2020-02-06 | 1291 | 516 + pdf | 2020-02-05 | 8586 | 6596 + pdf | 2020-02-04 | 3681 | 3593 + pdf | 2020-02-03 | 284 | 284 + pdf | 2020-02-02 | 480 | 480 + pdf | 2020-02-01 | 489 | 336 + pdf | 2020-01-31 | 1187 | 1130 + pdf | 2020-01-30 | 1613 | 1288 + pdf | 2020-01-29 | 947 | 279 + pdf | 2020-01-28 | 667 | 323 + (29 rows) + +Top "no-capture" domains (will need to re-ingest using live tool): + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + 
--------------------------+------------+-------- + linkinghub.elsevier.com | no-capture | 320065 + iopscience.iop.org | no-capture | 46858 + pubs.rsc.org | no-capture | 43331 + www.persee.fr | no-capture | 38971 + www.doiserbia.nb.rs | no-capture | 27112 + academic.oup.com | no-capture | 18877 + www.osapublishing.org | no-capture | 17113 + osf.io | no-capture | 16978 + scripts.iucr.org | no-capture | 14844 + www.degruyter.com | no-capture | 8093 + mab-online.nl | no-capture | 6603 + insights.ovid.com | no-capture | 6457 + ir.lib.uth.gr | no-capture | 3625 + www.sciencedirect.com | no-capture | 3244 + www.tandfonline.com | no-capture | 3201 + www.ccsenet.org | no-capture | 2849 + www.intechopen.com | no-capture | 2813 + primary-hospital-care.ch | no-capture | 2774 + www.nature.com | no-capture | 2484 + www.indianjournals.com | no-capture | 2432 + journals.aps.org | no-capture | 2197 + journals.sagepub.com | no-capture | 2064 + www.episodes.org | no-capture | 1805 + periodicos.uninove.br | no-capture | 1692 + escholarship.org | no-capture | 1666 + (25 rows) + +Top "no-pdf-link" domains: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status = 'no-pdf-link' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + -----------------------------+-------------+-------- + plutof.ut.ee | no-pdf-link | 685315 + www.gbif.org | no-pdf-link | 670647 + doi.pangaea.de | no-pdf-link | 301984 + www.plate-archive.org | no-pdf-link | 209218 + onlinelibrary.wiley.com | no-pdf-link | 84890 + figshare.com | no-pdf-link | 72892 + zenodo.org | no-pdf-link | 45768 + www.tandfonline.com | no-pdf-link | 43848 + data.mendeley.com | no-pdf-link | 42367 + springernature.figshare.com | no-pdf-link | 35941 + dhz.uni-passau.de | no-pdf-link | 29187 + www.frontiersin.org | no-pdf-link | 17925 + digital.ucd.ie | no-pdf-link | 16769 + mr.crossref.org | no-pdf-link | 14999 + journals.lww.com | no-pdf-link | 12122 + musewide.aip.de | no-pdf-link | 10854 + datadryad.org | no-pdf-link | 10686 + www.jstor.org | no-pdf-link | 9159 + koreascience.or.kr | no-pdf-link | 9067 + easy.dans.knaw.nl | no-pdf-link | 8264 + scielo.conicyt.cl | no-pdf-link | 8069 + www.degruyter.com | no-pdf-link | 7989 + www.kci.go.kr | no-pdf-link | 6990 + www.m-hikari.com | no-pdf-link | 6941 + cshprotocols.cshlp.org | no-pdf-link | 6553 + (25 rows) + +Top block-ish domains: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status') + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + ---------------------------------+---------------------+------- + journals.openedition.org | redirect-loop | 30395 + ieeexplore.ieee.org | redirect-loop | 28926 + www.degruyter.com | redirect-loop | 18891 + www.cairn.info | link-loop | 8919 + www.frontiersin.org | terminal-bad-status | 6786 + projecteuclid.org | link-loop | 6098 + www.mdpi.com | terminal-bad-status | 5189 + medicalforum.ch | terminal-bad-status | 4596 + jrnl.nau.edu.ua | link-loop | 4238 + www.revistas.unam.mx | link-loop | 3926 + journals.aps.org | redirect-loop | 3696 + www.ijcseonline.org | redirect-loop | 3567 + www.researchsquare.com | terminal-bad-status 
| 3453 + www.persee.fr | terminal-bad-status | 3221 + www.baltistica.lt | link-loop | 2098 + osf.io | redirect-loop | 2004 + seer.ufrgs.br | terminal-bad-status | 2002 + jtd.amegroups.com | link-loop | 1738 + www.hindawi.com | terminal-bad-status | 1613 + linkinghub.elsevier.com | redirect-loop | 1612 + www.scienceopen.com | terminal-bad-status | 1580 + atm.amegroups.com | link-loop | 1571 + scielo.conicyt.cl | terminal-bad-status | 1491 + repozytorium.ur.edu.pl | redirect-loop | 1279 + agupubs.onlinelibrary.wiley.com | link-loop | 1182 + (25 rows) + diff --git a/sql/stats/2020-05-03_stats.txt b/sql/stats/2020-05-03_stats.txt new file mode 100644 index 0000000..55f0c1e --- /dev/null +++ b/sql/stats/2020-05-03_stats.txt @@ -0,0 +1,418 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 41 GB | 82 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 59 GB | 7604 MB | 66 GB + "public"."file_meta" | 31 GB | 28 GB | 59 GB + "public"."ingest_request" | 19 GB | 20 GB | 39 GB + "public"."ingest_file_result" | 15 GB | 23 GB | 39 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (10 rows) + + Size: 383.93G + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 158059828 | 197346217653010 + (1 row) + + => 158 million, 197 terabytes + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 157805029 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6040 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1027125 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + + unique_sha1 | total + -------------+----------- + 92936564 | 111022039 + (1 row) + + => 110 million rows, 92.9 million files + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------------------------------------------------------------+----------- + application/pdf | 104178718 + warc/revisit | 5274410 + text/xml | 519042 + text/html | 295523 + application/octet-stream | 259681 + unk | 138930 + application/postscript | 81065 + application/save | 80765 + 
binary/octet-stream | 59804 + application/x-download | 27083 + text/plain | 26938 + application/download | 25125 + image/pdf | 16095 + application/force-download | 9004 + application/x-msdownload | 3711 + application | 2934 + application/x-octetstream | 2926 + multipart/form-data | 2741 + application/x-pdf | 2444 + .pdf | 2368 + application/binary | 1268 + application/pdf' | 1192 + pdf | 1113 + file/unknown | 1086 + application/unknown | 761 + file | 753 + application/blob | 670 + application/octetstream | 657 + text/pdf | 549 + 0 | 417 + ('application/pdf', | 349 + application/http;msgtype=response | 251 + application/doc | 180 + [...] (wasn't LIMIT 25) + +Processed or not: + + # TODO: + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + + unique_releases | total + -----------------+---------- + 17455441 | 92707544 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 71057023 + | 14638425 + (2 rows) + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status = 'success' GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 71057074 + | 3 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | unpaywall | 26244088 + pdf | mag | 25596658 + pdf | doi | 15652966 + pdf | pmc | 2043646 + pdf | arxiv | 721902 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 103 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | unpaywall | unpaywall | 26244088 + pdf | mag | mag-corpus | 25596658 + pdf | doi | fatcat-ingest | 8267308 + pdf | doi | fatcat-changelog | 3869772 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630719 + pdf | arxiv | fatcat-changelog | 91157 + pdf | pmc | fatcat-ingest | 10195 + pdf | pmc | fatcat-changelog | 4626 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 103 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 15 + (15 rows) + +Uncrawled requests by source: + + # TODO: verify this? 
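One way to cross-check the anti-join below (a sketch, not run against this snapshot) is to phrase the same question with NOT EXISTS; the per-source counts should match the LEFT JOIN / IS NULL version:

    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
    FROM ingest_request
    WHERE NOT EXISTS (
        SELECT 1 FROM ingest_file_result
        WHERE ingest_file_result.base_url = ingest_request.base_url
        AND ingest_file_result.ingest_type = ingest_request.ingest_type
    )
    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
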
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+------- + pdf | mag | 47 + pdf | unpaywall | 1 + (2 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | unpaywall | 26244088 | 19968092 | 0.761 + pdf | mag | 25596658 | 18712912 | 0.731 + pdf | doi | 15653166 | 2878833 | 0.184 + pdf | pmc | 2043646 | 1279529 | 0.626 + pdf | arxiv | 721902 | 592394 | 0.821 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 103 | 82 | 0.796 + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 37449502 + pdf | no-pdf-link | 10908442 + pdf | no-capture | 5643670 + pdf | redirect-loop | 4823502 + pdf | terminal-bad-status | 1715056 + pdf | link-loop | 1425072 + pdf | cdx-error | 535365 + pdf | gateway-timeout | 267654 + pdf | skip-url-blocklist | 220433 + pdf | wrong-mimetype | 189804 + pdf | spn2-cdx-lookup-failure | 103926 + pdf | spn-error | 101777 + pdf | wayback-error | 93517 + pdf | null-body | 87279 + pdf | invalid-host-resolution | 35305 + pdf | spn-remote-error | 28888 + pdf | petabox-error | 12406 + pdf | spn2-error | 2905 + pdf | spn2-error:job-failed | 2307 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 745 + pdf | spn2-error:proxy-error | 438 + pdf | spn2-error:invalid-url-syntax | 406 + pdf | spn2-error:soft-time-limit-exceeded | 405 + pdf | spn2-error:browser-running-error | 274 + (25 rows) + +Failures by domain: + + SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type as ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY ingest_type, domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | domain | status | count + -------------+---------------------------------------+---------------------+-------- + pdf | ssl.fao.org | no-pdf-link | 862277 + pdf | www.researchgate.net | redirect-loop | 749094 + pdf | www.e-periodica.ch | no-pdf-link | 747370 + pdf | 
ieeexplore.ieee.org | redirect-loop | 707482 + pdf | plutof.ut.ee | no-pdf-link | 685341 + pdf | www.gbif.org | no-pdf-link | 670905 + pdf | dlc.library.columbia.edu | no-pdf-link | 508281 + pdf | figshare.com | no-pdf-link | 400501 + pdf | onlinelibrary.wiley.com | no-pdf-link | 399187 + pdf | watermark.silverchair.com | terminal-bad-status | 357188 + pdf | www.die-bonn.de | redirect-loop | 352903 + pdf | academic.oup.com | no-pdf-link | 346828 + pdf | iopscience.iop.org | terminal-bad-status | 345147 + pdf | linkinghub.elsevier.com | no-capture | 328434 + pdf | statisticaldatasets.data-planet.com | no-pdf-link | 312206 + pdf | cyberleninka.ru | link-loop | 309525 + pdf | www.tandfonline.com | no-pdf-link | 309146 + pdf | dialnet.unirioja.es | terminal-bad-status | 307572 + pdf | doi.pangaea.de | no-pdf-link | 304924 + pdf | journals.sagepub.com | no-pdf-link | 285774 + pdf | papers.ssrn.com | link-loop | 282415 + pdf | dialnet.unirioja.es | redirect-loop | 274476 + pdf | ieeexplore.ieee.org | link-loop | 273607 + pdf | catalog.paradisec.org.au | redirect-loop | 234653 + pdf | www.plate-archive.org | no-pdf-link | 209217 + pdf | zenodo.org | no-pdf-link | 200078 + pdf | zenodo.org | no-capture | 199025 + pdf | spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 187084 + pdf | digi.ub.uni-heidelberg.de | no-pdf-link | 187039 + pdf | validate.perfdrive.com | no-pdf-link | 180191 + (30 rows) + +Success by domain: + + SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type as ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY ingest_type, domain, status + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | domain | status | count + -------------+----------------------------+---------+--------- + pdf | www.jstage.jst.go.jp | success | 2244620 + pdf | europepmc.org | success | 1284770 + pdf | link.springer.com | success | 1017998 + pdf | www.scielo.br | success | 799577 + pdf | arxiv.org | success | 592622 + pdf | downloads.hindawi.com | success | 527278 + pdf | res.mdpi.com | success | 501093 + pdf | hal.archives-ouvertes.fr | success | 447877 + pdf | digital.library.unt.edu | success | 404460 + pdf | www.cambridge.org | success | 394666 + pdf | dergipark.org.tr | success | 373706 + pdf | journals.plos.org | success | 296994 + pdf | watermark.silverchair.com | success | 275562 + pdf | www.nature.com | success | 263836 + pdf | cds.cern.ch | success | 223057 + pdf | www.pnas.org | success | 220488 + pdf | s3-eu-west-1.amazonaws.com | success | 214558 + pdf | www.jbc.org | success | 205277 + pdf | www.redalyc.org | success | 193591 + pdf | iopscience.iop.org | success | 175796 + pdf | apps.dtic.mil | success | 170589 + pdf | zenodo.org | success | 167812 + pdf | peerj.com | success | 155620 + pdf | www.biorxiv.org | success | 149337 + pdf | 210.101.116.28 | success | 145706 + pdf | www.teses.usp.br | success | 145438 + pdf | absimage.aps.org | success | 144400 + pdf | hrcak.srce.hr | success | 134669 + pdf | www.erudit.org | success | 131771 + pdf | babel.hathitrust.org | success | 130645 + (30 rows) + + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in 
`fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + => NOT RUN, fatcat_file table is way out of date + diff --git a/sql/stats/2020-07-23_stats.txt b/sql/stats/2020-07-23_stats.txt new file mode 100644 index 0000000..d1993fc --- /dev/null +++ b/sql/stats/2020-07-23_stats.txt @@ -0,0 +1,347 @@ + +Summary: + +- very many more PDFs have been grobid-ed vs. pdf_meta-ed +- about 1 million file_meta still have partial metadata (eg, no sha256) +- database size still under 0.5 TByte +- there are about a million CDX error ingest requests, and hundreds of + thousands of SPN errors which could be re-run + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 42 GB | 84 GB + "public"."ingest_request" | 34 GB | 39 GB | 73 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 61 GB | 7742 MB | 69 GB + "public"."file_meta" | 32 GB | 29 GB | 61 GB + "public"."ingest_file_result" | 24 GB | 36 GB | 60 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 8018 MB | 1966 MB | 9984 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + Size: 466.91G + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 161944425 | 204,402,677,360,189 + (1 row) + + # 161.9 mil; 204 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 161691608 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1015337 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 96537611 | 116281981 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------+----------- + application/pdf | 108706978 + warc/revisit | 5912013 + text/xml | 519042 + application/octet-stream | 307782 + text/html | 295634 + unk | 156937 + application/postscript | 81079 + application/save | 80871 
+ binary/octet-stream | 61263 + text/plain | 31495 + application/x-download | 30511 + application/download | 26716 + image/pdf | 26357 + application/force-download | 10541 + multipart/form-data | 5551 + application/x-msdownload | 3724 + application/x-octetstream | 3216 + application | 3171 + .pdf | 2728 + application/x-pdf | 2563 + application/binary | 1306 + application/pdf' | 1192 + pdf | 1180 + [...] + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + + total_files | unique_releases + -------------+----------------- + 95557413 | 18020570 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + + status_code | count + -------------+---------- + 200 | 88450610 + 500 | 7101098 + -4 | 4133 + 503 | 110 + + SELECT status, COUNT(*) FROM grobid GROUP BY status ORDER BY COUNT DESC LIMIT 10; + + status | count + ----------------+---------- + success | 73814297 + | 14638412 + error | 7101308 + error-timeout | 4133 + bad-grobid-xml | 6 + (5 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 73813427 + | 14638425 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 16589669 + pdf | pmc | 2231113 + pdf | arxiv | 794693 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 148 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8320832 + pdf | doi | fatcat-changelog | 4752956 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630750 + pdf | pmc | fatcat-ingest | 194781 + pdf | arxiv | fatcat-changelog | 163924 + pdf | pmc | fatcat-changelog | 7507 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 148 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 19 + pdf | arxiv | savepapernow-web | 2 + +Uncrawled requests by source: + + # TODO: verify this?
seems wrong + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | mag | 4097008 + pdf | oai | 15287 + pdf | unpaywall | 1 + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 5346057 | 0.104 + pdf | mag | 35015357 | 22199271 | 0.634 + pdf | unpaywall | 27653003 | 22067338 | 0.798 + pdf | doi | 16589700 | 3207661 | 0.193 + pdf | pmc | 2231113 | 1696976 | 0.761 + pdf | arxiv | 794727 | 645607 | 0.812 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 148 | 114 | 0.770 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 46465271 + pdf | no-capture | 46115869 + pdf | no-pdf-link | 13877460 + pdf | redirect-loop | 5943956 + pdf | terminal-bad-status | 1962754 + pdf | link-loop | 1630078 + pdf | cdx-error | 1014409 + pdf | gateway-timeout | 459340 + pdf | wrong-mimetype | 321774 + pdf | skip-url-blocklist | 220629 + pdf | wayback-error | 220453 + pdf | spn2-cdx-lookup-failure | 143963 + pdf | null-body | 113384 + pdf | spn-error | 101773 + pdf | invalid-host-resolution | 37367 + pdf | spn-remote-error | 28886 + pdf | petabox-error | 22997 + pdf | spn2-error | 16342 + pdf | spn2-error:job-failed | 5017 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 746 + pdf | spn2-error:soft-time-limit-exceeded | 632 + pdf | spn2-error:proxy-error | 437 + pdf | spn2-error:invalid-url-syntax | 417 + pdf | timeout | 417 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + + total_count | release_count + -------------+--------------- + 5862666 | 4728824 + (1 row) + +## PDF Meta + +Total rows: + + SELECT COUNT(*) as total_count FROM pdf_meta; + + + total_count + ------------- + 21961874 + +By status: + + SELECT status, COUNT(*) from pdf_meta GROUP BY status ORDER BY COUNT(*) DESC; + + status | count + ----------------+---------- + success | 21788507 + parse-error | 78196 + text-too-large | 60595 + 
not-pdf | 31679 + error-wayback | 2639 + bad-unicode | 251 + bad-pdf | 6 + empty-blob | 1 + (8 rows) + diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt new file mode 100644 index 0000000..3bc27b0 --- /dev/null +++ b/sql/stats/2020-09-14_stats.txt @@ -0,0 +1,340 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 44 GB | 45 GB | 89 GB + "public"."grobid" | 66 GB | 8127 MB | 74 GB + "public"."ingest_request" | 34 GB | 40 GB | 73 GB + "public"."ingest_file_result" | 28 GB | 44 GB | 72 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."file_meta" | 33 GB | 30 GB | 63 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 12 GB | 2924 MB | 15 GB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 167021210 | 221982345333674 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 166765214 + application/octet-stream | 155517 + application/xml | 42170 + text/html | 18708 + text/plain | 15990 + application/gzip | 6491 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62960 + (1 row) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 102123051 | 126550160 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ----------------------------+----------- + application/pdf | 116885565 + warc/revisit | 7951816 + text/xml | 519042 + application/octet-stream | 327639 + text/html | 295725 + unk | 172491 + application/postscript | 81095 + application/save | 80900 + binary/octet-stream | 61783 + text/plain | 33684 + image/pdf | 32856 + application/x-download | 32418 + application/download | 27672 + application/force-download | 10892 + multipart/form-data | 5750 + application/x-msdownload | 3832 + application/x-octetstream | 3516 + application | 3499 + .pdf | 3038 + application/x-pdf | 2701 + application/binary | 1322 + pdf | 1232 + file/unknown | 1199 + application/pdf' | 1192 + file | 979 + (25 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | 
unique_releases + -------------+----------------- + 101494314 | 18919012 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 93730358 + 500 | 7759103 + -4 | 4683 + 503 | 150 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 80838234 + | 12892145 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; +unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 17362763 + pdf | pmc | 2248854 + pdf | arxiv | 835400 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 197 + (9 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8399261 + pdf | doi | fatcat-changelog | 5449349 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 634665 + pdf | pmc | fatcat-ingest | 210453 + pdf | arxiv | fatcat-changelog | 200707 + pdf | pmc | fatcat-changelog | 9582 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 197 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 21 + pdf | arxiv | savepapernow-web | 2 + (17 rows) + +Uncrawled requests by source: + + # TODO: verify this? 
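Another cheap sanity check (sketch only, not run for this snapshot): pull a handful of the supposedly uncrawled rows from the query below and look each base_url up directly in ingest_file_result; a direct lookup should come back empty:

    SELECT ingest_request.ingest_type, ingest_request.link_source, ingest_request.base_url
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_request.base_url = ingest_file_result.base_url
        AND ingest_request.ingest_type = ingest_file_result.ingest_type
    WHERE ingest_file_result.base_url IS NULL
    LIMIT 10;

    -- then, for one of the URLs returned above (placeholder value):
    SELECT * FROM ingest_file_result WHERE base_url = '<base_url from above>';
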
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 170304 + pdf | oai | 15287 + pdf | unpaywall | 1 + (3 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14144314 | 0.276 + pdf | mag | 35015357 | 24811947 | 0.709 + pdf | unpaywall | 27653003 | 22302629 | 0.807 + pdf | doi | 17363369 | 3533568 | 0.204 + pdf | pmc | 2248860 | 1713197 | 0.762 + pdf | arxiv | 835400 | 685219 | 0.820 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 197 | 138 | 0.701 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 58265365 + pdf | no-pdf-link | 27216435 + pdf | no-capture | 21982611 + pdf | redirect-loop | 8457469 + pdf | terminal-bad-status | 2695023 + pdf | link-loop | 2209672 + pdf | wrong-mimetype | 767508 + pdf | gateway-timeout | 548870 + pdf | cdx-error | 391611 + pdf | skip-url-blocklist | 220661 + pdf | null-body | 182215 + pdf | wayback-error | 146869 + pdf | spn2-cdx-lookup-failure | 107229 + pdf | spn-error | 85128 + pdf | invalid-host-resolution | 37352 + pdf | petabox-error | 32490 + pdf | spn2-error | 29212 + pdf | spn-remote-error | 27927 + pdf | other-mimetype | 2305 + pdf | bad-redirect | 1524 + pdf | spn2-error:job-failed | 1521 + pdf | timeout | 842 + pdf | spn2-error:soft-time-limit-exceeded | 793 + pdf | redirects-exceeded | 748 + pdf | spn2-error:invalid-url-syntax | 417 + (25 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) + FROM ingest_file_result + WHERE hit = false + GROUP BY ingest_type, terminal_status_code + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 34064937 + pdf | | 20514531 + pdf | 301 | 7271700 + pdf | 302 | 720632 + pdf | 503 | 712697 + pdf | 400 | 444209 + pdf | 404 | 331495 + pdf | 403 | 323030 + pdf | 401 | 259327 + pdf | 500 | 236122 + pdf | 303 | 101609 + pdf | 429 | 47738 + pdf | 502 | 36183 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14790 + pdf | 999 | 8996 + pdf | 307 | 3769 + pdf | 308 | 3422 + pdf | 202 | 3228 + pdf | 520 | 2058 + pdf | 410 | 1734 + pdf | 521 | 1033 + pdf | 504 | 868 + pdf | 505 | 424 + (25 rows) + 
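A follow-up worth running on a future snapshot (a sketch using the same ingest_file_result columns as above, not run here) is to break a single terminal status code down by domain, e.g. HTTP 429 rate-limiting:

    SELECT substring(terminal_url FROM '[^/]+://([^/]*)') AS domain, COUNT(*)
    FROM ingest_file_result
    WHERE hit = false
    AND terminal_status_code = 429
    GROUP BY domain
    ORDER BY COUNT DESC
    LIMIT 25;
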
+## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 6600758 | 5213294 + (1 row) + diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt new file mode 100644 index 0000000..fca76b9 --- /dev/null +++ b/sql/stats/2021-04-07_stats.txt @@ -0,0 +1,430 @@ + +## SQL Table Sizes + + Size: 551.34G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 50 GB | 100 GB + "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB + "public"."ingest_request" | 39 GB | 45 GB | 83 GB + "public"."grobid" | 70 GB | 8613 MB | 78 GB + "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB + "public"."file_meta" | 35 GB | 31 GB | 66 GB + "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB + "public"."shadow" | 9517 MB | 10 GB | 20 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."html_meta" | 1172 MB | 10 MB | 1182 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (12 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 174200807 | 234313766162033 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 173816433 + application/octet-stream | 155534 + text/html | 115821 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 721 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297 + application/x-compress | 272 + application/zip | 131 + application/CDFV2-unknown | 99 + image/png | 88 + application/mac-binhex40 | 79 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + application/vnd.ms-excel | 28 + message/rfc822 | 26 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62271 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, 
COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 113880640 | 141793694 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 131346703 + warc/revisit | 8394443 + text/xml | 525481 + application/octet-stream | 502400 + text/html | 417579 + unk | 186703 + application/postscript | 81095 + application/save | 80915 + binary/octet-stream | 66698 + application/x-download | 35771 + text/plain | 35606 + image/pdf | 33904 + application/download | 29701 + application/force-download | 16726 + multipart/form-data | 6878 + application/x-msdownload | 3843 + application | 3724 + application/x-octetstream | 3550 + .pdf | 3138 + application/x-pdf | 2780 + application/binary | 1332 + pdf | 1247 + file/unknown | 1200 + application/pdf' | 1192 + file | 1108 + application/unknown | 978 + application/octetstream | 856 + application/blob | 673 + text/pdf | 672 + 0 | 546 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 105594307 | 19594878 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+---------- + 200 | 97714631 + 500 | 7875192 + -4 | 4772 + 503 | 520 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 84822508 + | 12892147 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 31772942 + pdf | doi | 23528817 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2277417 + pdf | arxiv | 2143549 + xml | doaj | 9442 + html | doi | 3022 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 469 + html | spn | 9 + (14 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 31772942 + pdf | doi | fatcat-changelog | 11010764 + pdf | doi | fatcat-ingest | 9002119 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767703 + pdf | arxiv | fatcat-changelog | 375818 + pdf | pmc | fatcat-ingest | 211264 + pdf | pmc | fatcat-changelog | 37328 + xml | doaj | doaj | 9442 + html | doi | fatcat-ingest | 3018 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 469 + pdf | doi 
| savepapernow-web | 74 + pdf | arxiv | fatcat-ingest-container | 26 + html | spn | savepapernow-web | 9 + html | doi | savepapernow-web | 4 + pdf | arxiv | savepapernow-web | 2 + (23 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 168462 + pdf | oai | 15286 + pdf | doaj | 2068 + html | doaj | 620 + pdf | unpaywall | 13 + (5 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14163500 | 0.277 + pdf | mag | 35015357 | 24818176 | 0.709 + pdf | unpaywall | 31772942 | 25018501 | 0.787 + pdf | doi | 23529041 | 5773728 | 0.245 + pdf | doaj | 4264610 | 2851328 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2277417 | 1736491 | 0.762 + pdf | arxiv | 2143549 | 2011378 | 0.938 + xml | doaj | 9442 | 6897 | 0.730 + html | doi | 3022 | 957 | 0.317 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 469 | 328 | 0.699 + html | spn | 9 | 2 | 0.222 + (14 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+--------------------------------+---------- + pdf | success | 66487928 + pdf | no-pdf-link | 29279677 + pdf | no-capture | 22765431 + pdf | redirect-loop | 9155767 + pdf | terminal-bad-status | 3549665 + pdf | link-loop | 2592983 + html | wrong-scope | 1088793 + pdf | wrong-mimetype | 792563 + pdf | gateway-timeout | 478181 + html | no-capture | 423917 + pdf | wayback-content-error | 355828 + pdf | cdx-error | 343862 + pdf | null-body | 328774 + pdf | forbidden | 286647 + pdf | spn2-cdx-lookup-failure | 276769 + pdf | spn2-wayback-error | 276080 + pdf | skip-url-blocklist | 265473 + html | redirect-loop | 212916 + pdf | not-found | 204367 + html | unknown-scope | 204112 + html | html-resource-no-capture | 166034 + pdf | blocked-cookie | 160336 + pdf | too-many-redirects | 152984 + html | success | 123896 + pdf | wayback-error | 114388 + html | null-body | 100296 + pdf | spn2-error:too-many-redirects | 58336 + html | wayback-content-error | 53926 + pdf | invalid-host-resolution | 37226 + pdf | petabox-error | 37177 + pdf | remote-server-error | 36439 + pdf | spn2-error | 27556 + pdf | spn2-error:proxy-error | 25486 + pdf | read-timeout | 20745 + html | wrong-mimetype | 18928 + html | terminal-bad-status | 14059 + html 
| petabox-error | 13533 + pdf | bad-redirect | 7535 + xml | success | 6897 + html | cdx-error | 6823 + pdf | spn2-error:bad-request | 4664 + pdf | spn2-error:unauthorized | 4391 + pdf | spn-remote-error | 4206 + pdf | spn2-error:service-unavailable | 2614 + pdf | spn2-error:job-failed | 2562 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1905 + html | spn2-cdx-lookup-failure | 1018 + pdf | redirects-exceeded | 1015 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36515867 + pdf | | 22909334 + pdf | 301 | 7969702 + html | 200 | 1653303 + pdf | 503 | 928507 + pdf | 403 | 823755 + pdf | 302 | 792842 + pdf | 400 | 462108 + html | | 426474 + pdf | 404 | 422163 + pdf | 401 | 270611 + pdf | 500 | 248675 + html | 301 | 211713 + pdf | 303 | 109686 + pdf | 410 | 50648 + pdf | 502 | 37663 + pdf | 429 | 31982 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14835 + html | 404 | 9573 + pdf | 999 | 9296 + pdf | 307 | 3972 + pdf | 308 | 3914 + html | 500 | 3625 + pdf | 202 | 3515 + xml | 200 | 2537 + pdf | 520 | 2072 + pdf | 206 | 1665 + pdf | 521 | 1075 + html | 302 | 1072 + pdf | 504 | 1000 + pdf | 412 | 476 + pdf | 300 | 434 + pdf | 505 | 429 + pdf | 406 | 393 + html | 403 | 382 + html | 503 | 378 + pdf | 421 | 298 + html | 303 | 268 + pdf | 508 | 195 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 408 | 50 + pdf | 204 | 34 + pdf | 416 | 29 + pdf | 501 | 29 + pdf | 530 | 27 + pdf | 507 | 21 + (50 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 8514315 | 6401104 + (1 row) diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt new file mode 100644 index 0000000..a8a9cd5 --- /dev/null +++ b/sql/stats/2021-04-08_table_sizes.txt @@ -0,0 +1,40 @@ + +## SQL Table Sizes + + Size: 467.23G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 26 GB | 76 GB + "public"."grobid" | 69 GB | 6834 MB | 75 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."ingest_request" | 39 GB | 32 GB | 70 GB + "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB + "public"."file_meta" | 32 GB | 21 GB | 53 GB + "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" 
| 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + (12 rows) + diff --git a/sql/stats/README.md b/sql/stats/README.md new file mode 100644 index 0000000..62e213c --- /dev/null +++ b/sql/stats/README.md @@ -0,0 +1,120 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + +Uncrawled requests by source: + + # TODO: verify this? 
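As a rough cross-check for the anti-join below (a sketch; note that EXCEPT de-duplicates, so totals can differ when the same base_url is requested more than once):

    SELECT ingest_type, base_url FROM ingest_request
    EXCEPT
    SELECT ingest_type, base_url FROM ingest_file_result;
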
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + diff --git a/sql/table_sizes.md b/sql/table_sizes.md new file mode 100644 index 0000000..3596b2b --- /dev/null +++ b/sql/table_sizes.md @@ -0,0 +1,11 @@ + +## September 2019 + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."cdx" | 31 GB | 27 GB | 58 GB + "public"."file_meta" | 13 GB | 6500 MB | 19 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."grobid" | 4994 MB | 6678 MB | 11 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB |
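The whole-database "Size:" figures quoted in the dated stats snapshots can be reproduced with something like the following (a sketch; run inside the target database):

    SELECT pg_size_pretty(pg_database_size(current_database()));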