aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-04 10:18:20 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-04 10:18:20 -0700
commitba6d3270a8c2bc9b919ca616da919a54ecb168b7 (patch)
tree8ce16477f70471aabf4d5ddfbef93e5aa8543122 /sql
parent748678bc88ea31a362ec5e896fd991b3c8dcbe58 (diff)
downloadsandcrawler-ba6d3270a8c2bc9b919ca616da919a54ecb168b7.tar.gz
sandcrawler-ba6d3270a8c2bc9b919ca616da919a54ecb168b7.zip
update sandcrawler stats for early may
Diffstat (limited to 'sql')
-rw-r--r--sql/stats/2020-05-03_stats.txt418
1 files changed, 418 insertions, 0 deletions
diff --git a/sql/stats/2020-05-03_stats.txt b/sql/stats/2020-05-03_stats.txt
new file mode 100644
index 0000000..55f0c1e
--- /dev/null
+++ b/sql/stats/2020-05-03_stats.txt
@@ -0,0 +1,418 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 41 GB | 82 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."grobid" | 59 GB | 7604 MB | 66 GB
+ "public"."file_meta" | 31 GB | 28 GB | 59 GB
+ "public"."ingest_request" | 19 GB | 20 GB | 39 GB
+ "public"."ingest_file_result" | 15 GB | 23 GB | 39 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (10 rows)
+
+ Size: 383.93G
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 158059828 | 197346217653010
+ (1 row)
+
+ => 158 million, 197 terabytes
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 157805029
+ application/octet-stream | 154348
+ application/xml | 42170
+ text/html | 18703
+ text/plain | 15989
+ application/gzip | 6484
+ | 6040
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ ---------
+ 1027125
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+
+ unique_sha1 | total
+ -------------+-----------
+ 92936564 | 111022039
+ (1 row)
+
+ => 111 million rows, 92.9 million files
+
+Mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------------------------------------------------------------+-----------
+ application/pdf | 104178718
+ warc/revisit | 5274410
+ text/xml | 519042
+ text/html | 295523
+ application/octet-stream | 259681
+ unk | 138930
+ application/postscript | 81065
+ application/save | 80765
+ binary/octet-stream | 59804
+ application/x-download | 27083
+ text/plain | 26938
+ application/download | 25125
+ image/pdf | 16095
+ application/force-download | 9004
+ application/x-msdownload | 3711
+ application | 2934
+ application/x-octetstream | 2926
+ multipart/form-data | 2741
+ application/x-pdf | 2444
+ .pdf | 2368
+ application/binary | 1268
+ application/pdf' | 1192
+ pdf | 1113
+ file/unknown | 1086
+ application/unknown | 761
+ file | 753
+ application/blob | 670
+ application/octetstream | 657
+ text/pdf | 549
+ 0 | 417
+ ('application/pdf', | 349
+ application/http;msgtype=response | 251
+ application/doc | 180
+ [...] (output truncated; the query as actually run did not use LIMIT 25)
+
+Processed or not:
+
+ # TODO:
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+
+ unique_releases | total
+ -----------------+----------
+ 17455441 | 92707544
+ (1 row)
+
+Status codes (no output was recorded for this query):
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+Which GROBID version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 71057023
+ | 14638425
+ (2 rows)
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status = 'success' GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 71057074
+ | 3
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | unpaywall | 26244088
+ pdf | mag | 25596658
+ pdf | doi | 15652966
+ pdf | pmc | 2043646
+ pdf | arxiv | 721902
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 103
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | unpaywall | unpaywall | 26244088
+ pdf | mag | mag-corpus | 25596658
+ pdf | doi | fatcat-ingest | 8267308
+ pdf | doi | fatcat-changelog | 3869772
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630719
+ pdf | arxiv | fatcat-changelog | 91157
+ pdf | pmc | fatcat-ingest | 10195
+ pdf | pmc | fatcat-changelog | 4626
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 103
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 15
+ (15 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+-------
+ pdf | mag | 47
+ pdf | unpaywall | 1
+ (2 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | unpaywall | 26244088 | 19968092 | 0.761
+ pdf | mag | 25596658 | 18712912 | 0.731
+ pdf | doi | 15653166 | 2878833 | 0.184
+ pdf | pmc | 2043646 | 1279529 | 0.626
+ pdf | arxiv | 721902 | 592394 | 0.821
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 103 | 82 | 0.796
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 37449502
+ pdf | no-pdf-link | 10908442
+ pdf | no-capture | 5643670
+ pdf | redirect-loop | 4823502
+ pdf | terminal-bad-status | 1715056
+ pdf | link-loop | 1425072
+ pdf | cdx-error | 535365
+ pdf | gateway-timeout | 267654
+ pdf | skip-url-blocklist | 220433
+ pdf | wrong-mimetype | 189804
+ pdf | spn2-cdx-lookup-failure | 103926
+ pdf | spn-error | 101777
+ pdf | wayback-error | 93517
+ pdf | null-body | 87279
+ pdf | invalid-host-resolution | 35305
+ pdf | spn-remote-error | 28888
+ pdf | petabox-error | 12406
+ pdf | spn2-error | 2905
+ pdf | spn2-error:job-failed | 2307
+ pdf | other-mimetype | 2305
+ pdf | redirects-exceeded | 745
+ pdf | spn2-error:proxy-error | 438
+ pdf | spn2-error:invalid-url-syntax | 406
+ pdf | spn2-error:soft-time-limit-exceeded | 405
+ pdf | spn2-error:browser-running-error | 274
+ (25 rows)
+
+Failures by domain:
+
+ SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type as ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY ingest_type, domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | domain | status | count
+ -------------+---------------------------------------+---------------------+--------
+ pdf | ssl.fao.org | no-pdf-link | 862277
+ pdf | www.researchgate.net | redirect-loop | 749094
+ pdf | www.e-periodica.ch | no-pdf-link | 747370
+ pdf | ieeexplore.ieee.org | redirect-loop | 707482
+ pdf | plutof.ut.ee | no-pdf-link | 685341
+ pdf | www.gbif.org | no-pdf-link | 670905
+ pdf | dlc.library.columbia.edu | no-pdf-link | 508281
+ pdf | figshare.com | no-pdf-link | 400501
+ pdf | onlinelibrary.wiley.com | no-pdf-link | 399187
+ pdf | watermark.silverchair.com | terminal-bad-status | 357188
+ pdf | www.die-bonn.de | redirect-loop | 352903
+ pdf | academic.oup.com | no-pdf-link | 346828
+ pdf | iopscience.iop.org | terminal-bad-status | 345147
+ pdf | linkinghub.elsevier.com | no-capture | 328434
+ pdf | statisticaldatasets.data-planet.com | no-pdf-link | 312206
+ pdf | cyberleninka.ru | link-loop | 309525
+ pdf | www.tandfonline.com | no-pdf-link | 309146
+ pdf | dialnet.unirioja.es | terminal-bad-status | 307572
+ pdf | doi.pangaea.de | no-pdf-link | 304924
+ pdf | journals.sagepub.com | no-pdf-link | 285774
+ pdf | papers.ssrn.com | link-loop | 282415
+ pdf | dialnet.unirioja.es | redirect-loop | 274476
+ pdf | ieeexplore.ieee.org | link-loop | 273607
+ pdf | catalog.paradisec.org.au | redirect-loop | 234653
+ pdf | www.plate-archive.org | no-pdf-link | 209217
+ pdf | zenodo.org | no-pdf-link | 200078
+ pdf | zenodo.org | no-capture | 199025
+ pdf | spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 187084
+ pdf | digi.ub.uni-heidelberg.de | no-pdf-link | 187039
+ pdf | validate.perfdrive.com | no-pdf-link | 180191
+ (30 rows)
+
+Success by domain:
+
+ SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type as ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY ingest_type, domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | domain | status | count
+ -------------+----------------------------+---------+---------
+ pdf | www.jstage.jst.go.jp | success | 2244620
+ pdf | europepmc.org | success | 1284770
+ pdf | link.springer.com | success | 1017998
+ pdf | www.scielo.br | success | 799577
+ pdf | arxiv.org | success | 592622
+ pdf | downloads.hindawi.com | success | 527278
+ pdf | res.mdpi.com | success | 501093
+ pdf | hal.archives-ouvertes.fr | success | 447877
+ pdf | digital.library.unt.edu | success | 404460
+ pdf | www.cambridge.org | success | 394666
+ pdf | dergipark.org.tr | success | 373706
+ pdf | journals.plos.org | success | 296994
+ pdf | watermark.silverchair.com | success | 275562
+ pdf | www.nature.com | success | 263836
+ pdf | cds.cern.ch | success | 223057
+ pdf | www.pnas.org | success | 220488
+ pdf | s3-eu-west-1.amazonaws.com | success | 214558
+ pdf | www.jbc.org | success | 205277
+ pdf | www.redalyc.org | success | 193591
+ pdf | iopscience.iop.org | success | 175796
+ pdf | apps.dtic.mil | success | 170589
+ pdf | zenodo.org | success | 167812
+ pdf | peerj.com | success | 155620
+ pdf | www.biorxiv.org | success | 149337
+ pdf | 210.101.116.28 | success | 145706
+ pdf | www.teses.usp.br | success | 145438
+ pdf | absimage.aps.org | success | 144400
+ pdf | hrcak.srce.hr | success | 134669
+ pdf | www.erudit.org | success | 131771
+ pdf | babel.hathitrust.org | success | 130645
+ (30 rows)
+
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but which have no row with a matching sha1hex in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ => NOT RUN, fatcat_file table is way out of date
+