aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-02-24 21:01:25 -0800
committerBryan Newbold <bnewbold@archive.org>2020-02-24 21:01:25 -0800
commit1b74e8a4dee21bd260040dad8072e4fb48456b3c (patch)
tree36a5c8f263ff7a1637754816c0d76a226e77532d
parentbe8f1d134681caaa15485246b65551a67e5bd5a5 (diff)
downloadsandcrawler-1b74e8a4dee21bd260040dad8072e4fb48456b3c.tar.gz
sandcrawler-1b74e8a4dee21bd260040dad8072e4fb48456b3c.zip
recent sandcrawler-db / ingest stats (interesting)
-rw-r--r--sql/stats/2020-02-24_stats.txt482
-rw-r--r--sql/stats/README.md6
2 files changed, 488 insertions, 0 deletions
diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt
new file mode 100644
index 0000000..e7a00e8
--- /dev/null
+++ b/sql/stats/2020-02-24_stats.txt
@@ -0,0 +1,482 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ Size: 271.83G
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 36 GB | 78 GB
+ "public"."grobid_shadow" | 61 GB | 6553 MB | 68 GB
+ "public"."grobid" | 47 GB | 7213 MB | 54 GB
+ "public"."file_meta" | 26 GB | 12 GB | 38 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."ingest_file_result" | 1831 MB | 2454 MB | 4285 MB
+ "public"."ingest_request" | 2006 MB | 2122 MB | 4128 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ "public"."pdftrio" | 78 MB | 64 MB | 142 MB
+ (10 rows)
+
+
+## File Metadata
+
+(skipping, no update)
+
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+Processed or not:
+
+ # TODO:
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+ unique_releases | total
+ -----------------+----------
+ 15,632,810 | 76,555,791
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 70656028
+ 500 | 5896836
+ -4 | 2295
+ 503 | 111
+ (4 rows)
+
+ What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 56001631
+ | 14654496
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2,868,825 | 2,887,834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 6591633
+ pdf | pmc | 2030279
+ pdf | arxiv | 630743
+ pdf | unpaywall | 1400
+ pdf | spn | 82
+ pdf | pubmed | 2
+ (6 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-------------+-------------------------+---------
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | doi | | 2943896
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | | 629719
+ pdf | doi | fatcat-changelog | 129932
+ pdf | doi | fatcat-ingest | 1935
+ pdf | pmc | | 1454
+ pdf | unpaywall | unpaywall | 1400
+ pdf | arxiv | fatcat-ingest | 998
+ pdf | spn | | 64
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | spn | savepapernow-web | 18
+ pdf | pubmed | | 2
+ pdf | doi | savepapernow-web | 1
+ (14 rows)
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
+ => UPDATE 2943896
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
+ => UPDATE 64
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
+ => UPDATE 629719
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
+ => UPDATE 1454
+
+ SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed';
+ DELETE FROM ingest_request WHERE link_source = 'pubmed';
+ => DELETE 2
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 6591637
+ pdf | pmc | 2030279
+ pdf | arxiv | 630743
+ pdf | unpaywall | 1400
+ pdf | spn | 82
+ (5 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-------------+-------------------------+---------
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | doi | fatcat-changelog | 3073828
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630717
+ pdf | doi | fatcat-ingest | 1935
+ pdf | pmc | fatcat-ingest | 1454
+ pdf | unpaywall | unpaywall | 1400
+ pdf | spn | savepapernow-web | 82
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 1
+ (10 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ none?
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-------------+----------+---------+----------
+ pdf | doi | 6591637 | 1622702 | 0.246
+ pdf | pmc | 2030279 | 1241836 | 0.612
+ pdf | arxiv | 630743 | 500620 | 0.794
+ pdf | unpaywall | 1400 | 851 | 0.608
+ pdf | spn | 82 | 62 | 0.756
+ (5 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+---------
+ pdf | success | 3366189
+ pdf | no-pdf-link | 2902620
+ pdf | no-capture | 1672025
+ pdf | redirect-loop | 388844
+ pdf | cdx-error | 272780
+ pdf | terminal-bad-status | 171878
+ pdf | spn-remote-error | 163843
+ pdf | spn-error | 108070
+ pdf | null-body | 66778
+ pdf | link-loop | 43403
+ pdf | skip-url-blocklist | 34705
+ pdf | wrong-mimetype | 31343
+ pdf | wayback-error | 13012
+ pdf | spn2-cdx-lookup-failure | 6100
+ pdf | gateway-timeout | 5633
+ pdf | other-mimetype | 5114
+ pdf | spn2-error:proxy-error | 538
+ pdf | spn2-error:job-failed | 470
+ pdf | petabox-error | 415
+ pdf | spn2-error:browser-running-error | 136
+ pdf | spn2-error | 127
+ pdf | spn2-error:soft-time-limit-exceeded | 71
+ pdf | bad-redirect | 39
+ pdf | spn2-error:unknown | 30
+ pdf | spn2-error:browsing-timeout | 25
+ pdf | pending | 3
+ pdf | invalid-host-resolution | 1
+ (27 rows)
+
+
+## Fatcat Files
+
+(skipping, no update)
+
+## Recent Success/Failure of Ingest by Domain
+
+NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a
+small fraction of this is from changelog.
+
+ # "problem domains" and statuses
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+ domain | status | count
+ -------------------------+----------------+-------
+ linkinghub.elsevier.com | no-capture | 2579
+ www.mdpi.com | wrong-mimetype | 1313
+ onlinelibrary.wiley.com | no-pdf-link | 785
+ americanarchivist.org | no-pdf-link | 756
+ journals.sagepub.com | redirect-loop | 503
+ link.springer.com | redirect-loop | 432
+ iopscience.iop.org | no-capture | 392
+ www.tandfonline.com | no-pdf-link | 389
+ pubs.rsc.org | no-capture | 361
+ www.persee.fr | no-capture | 344
+ (10 rows)
+
+
+ # "what type of errors"
+ SELECT ingest_type, status, COUNT(*)
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY ingest_type, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+-------
+ pdf | success | 40578
+ pdf | cdx-error | 14982
+ pdf | no-capture | 7747
+ pdf | no-pdf-link | 7111
+ pdf | redirect-loop | 3265
+ pdf | wrong-mimetype | 1629
+ pdf | spn2-cdx-lookup-failure | 657
+ pdf | link-loop | 538
+ pdf | null-body | 517
+ pdf | terminal-bad-status | 400
+ pdf | wayback-error | 79
+ pdf | spn2-error:job-failed | 53
+ pdf | gateway-timeout | 38
+ pdf | spn2-error:soft-time-limit-exceeded | 7
+ pdf | spn2-error | 6
+ pdf | petabox-error | 5
+ pdf | spn2-error:browsing-timeout | 4
+ pdf | spn2-error:unknown | 2
+ pdf | bad-redirect | 1
+ pdf | pending | 1
+ (20 rows)
+
+ # "throughput per day for last N days"
+ SELECT ingest_type,
+ date(updated),
+ COUNT(*) as total,
+ COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 month'::INTERVAL
+ GROUP BY ingest_type, date(updated)
+ ORDER BY date(updated) DESC;
+
+ ingest_type | date | total | success
+ -------------+------------+---------+---------
+ pdf | 2020-02-25 | 32660 | 14322
+ pdf | 2020-02-24 | 44967 | 26263
+ pdf | 2020-02-23 | 58795 | 18874
+ pdf | 2020-02-22 | 844249 | 272606
+ pdf | 2020-02-21 | 1287378 | 433487
+ pdf | 2020-02-20 | 1455943 | 492408
+ pdf | 2020-02-19 | 21453 | 7529
+ pdf | 2020-02-18 | 5863 | 2926
+ pdf | 2020-02-17 | 3737 | 970
+ pdf | 2020-02-16 | 13779 | 4862
+ pdf | 2020-02-15 | 1021020 | 623020
+ pdf | 2020-02-14 | 1036036 | 632830
+ pdf | 2020-02-13 | 13503 | 5824
+ pdf | 2020-02-12 | 20078 | 11422
+ pdf | 2020-02-11 | 13499 | 6781
+ pdf | 2020-02-10 | 2275 | 961
+ pdf | 2020-02-09 | 3231 | 1494
+ pdf | 2020-02-08 | 8967 | 4400
+ pdf | 2020-02-07 | 7022 | 2430
+ pdf | 2020-02-06 | 1291 | 516
+ pdf | 2020-02-05 | 8586 | 6596
+ pdf | 2020-02-04 | 3681 | 3593
+ pdf | 2020-02-03 | 284 | 284
+ pdf | 2020-02-02 | 480 | 480
+ pdf | 2020-02-01 | 489 | 336
+ pdf | 2020-01-31 | 1187 | 1130
+ pdf | 2020-01-30 | 1613 | 1288
+ pdf | 2020-01-29 | 947 | 279
+ pdf | 2020-01-28 | 667 | 323
+ (29 rows)
+
+Top "no-capture" domains (will need to re-ingest using live tool):
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ --------------------------+------------+--------
+ linkinghub.elsevier.com | no-capture | 320065
+ iopscience.iop.org | no-capture | 46858
+ pubs.rsc.org | no-capture | 43331
+ www.persee.fr | no-capture | 38971
+ www.doiserbia.nb.rs | no-capture | 27112
+ academic.oup.com | no-capture | 18877
+ www.osapublishing.org | no-capture | 17113
+ osf.io | no-capture | 16978
+ scripts.iucr.org | no-capture | 14844
+ www.degruyter.com | no-capture | 8093
+ mab-online.nl | no-capture | 6603
+ insights.ovid.com | no-capture | 6457
+ ir.lib.uth.gr | no-capture | 3625
+ www.sciencedirect.com | no-capture | 3244
+ www.tandfonline.com | no-capture | 3201
+ www.ccsenet.org | no-capture | 2849
+ www.intechopen.com | no-capture | 2813
+ primary-hospital-care.ch | no-capture | 2774
+ www.nature.com | no-capture | 2484
+ www.indianjournals.com | no-capture | 2432
+ journals.aps.org | no-capture | 2197
+ journals.sagepub.com | no-capture | 2064
+ www.episodes.org | no-capture | 1805
+ periodicos.uninove.br | no-capture | 1692
+ escholarship.org | no-capture | 1666
+ (25 rows)
+
+Top "no-pdf-link" domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-pdf-link'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -----------------------------+-------------+--------
+ plutof.ut.ee | no-pdf-link | 685315
+ www.gbif.org | no-pdf-link | 670647
+ doi.pangaea.de | no-pdf-link | 301984
+ www.plate-archive.org | no-pdf-link | 209218
+ onlinelibrary.wiley.com | no-pdf-link | 84890
+ figshare.com | no-pdf-link | 72892
+ zenodo.org | no-pdf-link | 45768
+ www.tandfonline.com | no-pdf-link | 43848
+ data.mendeley.com | no-pdf-link | 42367
+ springernature.figshare.com | no-pdf-link | 35941
+ dhz.uni-passau.de | no-pdf-link | 29187
+ www.frontiersin.org | no-pdf-link | 17925
+ digital.ucd.ie | no-pdf-link | 16769
+ mr.crossref.org | no-pdf-link | 14999
+ journals.lww.com | no-pdf-link | 12122
+ musewide.aip.de | no-pdf-link | 10854
+ datadryad.org | no-pdf-link | 10686
+ www.jstor.org | no-pdf-link | 9159
+ koreascience.or.kr | no-pdf-link | 9067
+ easy.dans.knaw.nl | no-pdf-link | 8264
+ scielo.conicyt.cl | no-pdf-link | 8069
+ www.degruyter.com | no-pdf-link | 7989
+ www.kci.go.kr | no-pdf-link | 6990
+ www.m-hikari.com | no-pdf-link | 6941
+ cshprotocols.cshlp.org | no-pdf-link | 6553
+ (25 rows)
+
+Top block-ish domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status')
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ ---------------------------------+---------------------+-------
+ journals.openedition.org | redirect-loop | 30395
+ ieeexplore.ieee.org | redirect-loop | 28926
+ www.degruyter.com | redirect-loop | 18891
+ www.cairn.info | link-loop | 8919
+ www.frontiersin.org | terminal-bad-status | 6786
+ projecteuclid.org | link-loop | 6098
+ www.mdpi.com | terminal-bad-status | 5189
+ medicalforum.ch | terminal-bad-status | 4596
+ jrnl.nau.edu.ua | link-loop | 4238
+ www.revistas.unam.mx | link-loop | 3926
+ journals.aps.org | redirect-loop | 3696
+ www.ijcseonline.org | redirect-loop | 3567
+ www.researchsquare.com | terminal-bad-status | 3453
+ www.persee.fr | terminal-bad-status | 3221
+ www.baltistica.lt | link-loop | 2098
+ osf.io | redirect-loop | 2004
+ seer.ufrgs.br | terminal-bad-status | 2002
+ jtd.amegroups.com | link-loop | 1738
+ www.hindawi.com | terminal-bad-status | 1613
+ linkinghub.elsevier.com | redirect-loop | 1612
+ www.scienceopen.com | terminal-bad-status | 1580
+ atm.amegroups.com | link-loop | 1571
+ scielo.conicyt.cl | terminal-bad-status | 1491
+ repozytorium.ur.edu.pl | redirect-loop | 1279
+ agupubs.onlinelibrary.wiley.com | link-loop | 1182
+ (25 rows)
+
diff --git a/sql/stats/README.md b/sql/stats/README.md
index ea61fa0..52642f6 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -41,6 +41,10 @@ Total and unique-by-sha1 counts:
SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
+
Processed or not:
# TODO:
@@ -71,6 +75,8 @@ Requests by source:
SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
Uncrawled requests by source:
# TODO: verify this?