aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2022-09-07 17:53:50 -0700
committerBryan Newbold <bnewbold@archive.org>2022-09-07 17:53:50 -0700
commit4589f574ae8fb29d959ff60e99a15098fee8a15a (patch)
treecaebfc418dea6cc06fef5f60cc8b7435f1099e37 /sql
parent49836dba33bfb481213c03fe4f84c4d876d2d3bd (diff)
downloadsandcrawler-4589f574ae8fb29d959ff60e99a15098fee8a15a.tar.gz
sandcrawler-4589f574ae8fb29d959ff60e99a15098fee8a15a.zip
sandcrawler SQL-based status (sept 2022)
Diffstat (limited to 'sql')
-rw-r--r--sql/stats/2022-09-06_stats.txt438
1 files changed, 438 insertions, 0 deletions
diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt
new file mode 100644
index 0000000..be2b30c
--- /dev/null
+++ b/sql/stats/2022-09-06_stats.txt
@@ -0,0 +1,438 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 62 GB | 44 GB | 106 GB
+ "public"."ingest_request" | 51 GB | 50 GB | 101 GB
+ "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB
+ "public"."file_meta" | 39 GB | 39 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7469 MB | 66 MB | 7535 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 198175106 | 282695671015403
+ (1 row)
+
+ 198 million files, 282 TBytes.
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 197021437
+ text/html | 830331
+ application/octet-stream | 186669
+ application/xml | 42170
+ application/xhtml+xml | 38207
+ text/plain | 16471
+ application/jats+xml | 10385
+ application/gzip | 6681
+ | 6032
+ application/postscript | 4916
+ image/jpeg | 4522
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 946
+ application/x-bzip2 | 891
+ image/png | 659
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382
+ application/x-compress | 274
+ video/mp4 | 218
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12800
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 137283420 | 172140506
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 157465613
+ warc/revisit | 11337336
+ text/html | 1137208
+ application/octet-stream | 950380
+ text/xml | 528965
+ unk | 253294
+ application/postscript | 81130
+ application/save | 81069
+ binary/octet-stream | 68942
+ application/x-download | 42717
+ application/download | 40628
+ image/pdf | 39904
+ text/plain | 36445
+ application/force-download | 24148
+ multipart/form-data | 10972
+ application | 5409
+ application/x-octetstream | 5192
+ application/x-msdownload | 3854
+ .pdf | 3518
+ application/x-pdf | 3061
+ application/octet | 1792
+ pdf | 1757
+ application/binary | 1399
+ file | 1373
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1087
+ application/unknown | 1005
+ 0 | 773
+ text/pdf | 729
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 129001717
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 120797098
+ 500 | 8198783
+ -4 | 5802
+ 503 | 36
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 60469462
+ 0.5.5-fatcat | 47472904
+ | 12665498
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | doi | 43852308
+ pdf | mag | 43701948
+ pdf | doaj | 6534341
+ html | doaj | 3987669
+ pdf | arxiv | 2784589
+ pdf | pmc | 2439181
+ pdf | dblp | 631716
+ html | doi | 126699
+ xml | doaj | 23066
+ pdf | cnki_covid19 | 2034
+ pdf | spn | 1026
+ pdf | wanfang_covid19 | 975
+ html | spn | 65
+ xml | spn | 2
+ xml | doi | 1
+ (17 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 24742500
+ pdf | doi | fatcat-ingest | 15592121
+ pdf | doaj | doaj | 6484737
+ html | doaj | doaj | 3987468
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 799793
+ pdf | dblp | dblp | 631716
+ pdf | pmc | fatcat-ingest | 297980
+ html | doi | fatcat-ingest | 121508
+ pdf | pmc | fatcat-changelog | 112376
+ pdf | doaj | fatcat-changelog | 47181
+ xml | doaj | doaj | 23066
+ html | doi | fatcat-changelog | 5129
+ pdf | doaj | fatcat-ingest | 2423
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1814
+ pdf | spn | savepapernow-web | 1026
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ html | doaj | fatcat-ingest | 201
+ html | spn | savepapernow-web | 65
+ html | doi | savepapernow-web | 62
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 4
+ xml | spn | savepapernow-web | 2
+ xml | doi | savepapernow-web | 1
+ (30 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 167653
+ pdf | doaj | 81517
+ pdf | oai | 15282
+ html | doaj | 1791
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (6 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 16024068 | 0.313
+ pdf | unpaywall | 43932525 | 36045446 | 0.820
+ pdf | doi | 43852308 | 14956080 | 0.341
+ pdf | mag | 43701948 | 32768484 | 0.750
+ pdf | doaj | 6534341 | 4704066 | 0.720
+ html | doaj | 3987669 | 778165 | 0.195
+ pdf | arxiv | 2784589 | 2419941 | 0.869
+ pdf | pmc | 2439181 | 1897671 | 0.778
+ pdf | dblp | 631716 | 305142 | 0.483
+ html | doi | 126699 | 75754 | 0.598
+ xml | doaj | 23066 | 10381 | 0.450
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | spn | 1026 | 778 | 0.758
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ html | spn | 65 | 13 | 0.200
+ xml | spn | 2 | 1 | 0.500
+ xml | doi | 1 | 0 | 0.000
+ (17 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------+----------
+ pdf | success | 94887295
+ pdf | no-pdf-link | 33960080
+ pdf | no-capture | 20893916
+ pdf | terminal-bad-status | 6973765
+ pdf | redirect-loop | 5775175
+ pdf | link-loop | 4095424
+ pdf | skip-url-blocklist | 4037518
+ pdf | blocked-cookie | 3508762
+ html | wrong-scope | 1783694
+ pdf | wrong-mimetype | 1379673
+ html | success | 853762
+ pdf | gateway-timeout | 635170
+ html | no-capture | 381283
+ pdf | wayback-content-error | 356694
+ pdf | cdx-error | 347700
+ pdf | null-body | 336166
+ html | unknown-scope | 321874
+ html | html-resource-no-capture | 294294
+ pdf | forbidden | 291127
+ pdf | not-found | 274343
+ pdf | too-many-redirects | 264494
+ component | wrong-mimetype | 196680
+ component | spn2-cdx-lookup-failure | 173615
+ component | spn2-backoff | 115840
+ html | terminal-bad-status | 106264
+ html | null-body | 100296
+ pdf | wayback-error | 94748
+ html | blocked-cookie | 88537
+ component | no-capture | 75278
+ pdf | empty-blob | 61157
+ pdf | bad-redirect | 58680
+ pdf | skip-wall | 57751
+ pdf | spn2-error:too-many-redirects | 52873
+ html | spn2-backoff | 50577
+ pdf | remote-server-error | 41282
+ pdf | invalid-host-resolution | 38864
+ pdf | read-timeout | 37071
+ pdf | spn2-cdx-lookup-failure | 34229
+ html | wrong-mimetype | 33643
+ pdf | spn2-backoff | 32437
+ pdf | petabox-error | 31006
+ html | wayback-content-error | 28034
+ component | spn2-error | 27044
+ pdf | spn2-error:unknown | 25810
+ component | gateway-timeout | 25215
+ pdf | body-too-large | 21721
+ html | petabox-error | 18313
+ html | empty-blob | 14393
+ html | redirect-loop | 13404
+ component | blocked-cookie | 12287
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 45052391
+ pdf | | 26117481
+ pdf | 301 | 4814786
+ html | 200 | 2684821
+ pdf | 403 | 1871088
+ pdf | 404 | 1254259
+ pdf | 302 | 898728
+ pdf | 503 | 867548
+ pdf | 401 | 851205
+ pdf | 429 | 741869
+ pdf | 400 | 624519
+ component | | 456915
+ html | | 442051
+ pdf | 500 | 283700
+ component | 200 | 197510
+ pdf | 410 | 120647
+ pdf | 303 | 107947
+ html | 404 | 80114
+ pdf | 420 | 26722
+ pdf | 502 | 19500
+ pdf | 409 | 15499
+ html | 429 | 15208
+ pdf | 509 | 15167
+ pdf | 999 | 12186
+ pdf | 202 | 11535
+ html | 301 | 10213
+ xml | | 10018
+ pdf | 307 | 8657
+ pdf | 402 | 8338
+ pdf | 412 | 8064
+ pdf | 308 | 6479
+ html | 500 | 4746
+ xml | 200 | 2668
+ pdf | 520 | 2496
+ html | 302 | 2289
+ pdf | 521 | 2257
+ html | 202 | 2177
+ pdf | 206 | 1961
+ html | 403 | 1775
+ pdf | 504 | 1187
+ pdf | 421 | 1148
+ html | 303 | 1112
+ pdf | 406 | 1109
+ pdf | 204 | 772
+ pdf | 432 | 745
+ pdf | 405 | 633
+ html | 400 | 632
+ pdf | 426 | 515
+ pdf | 508 | 503
+ pdf | 505 | 469
+ (50 rows)