aboutsummaryrefslogtreecommitdiffstats
path: root/sql/stats/2021-11-26_stats.txt
blob: 3a0e561cc272ab95589f37934e7d5a7815cba6b3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424

Date: Sat 27 Nov 2021 03:33:30 AM UTC

## SQL Table Sizes

    Size: 937.28G

    SELECT
        table_name,
        pg_size_pretty(table_size) AS table_size,
        pg_size_pretty(indexes_size) AS indexes_size,
        pg_size_pretty(total_size) AS total_size
      FROM (
          SELECT
              table_name,
              pg_table_size(table_name) AS table_size,
              pg_indexes_size(table_name) AS indexes_size,
              pg_total_relation_size(table_name) AS total_size
          FROM (
              SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
              FROM information_schema.tables
              WHERE table_schema = 'public'
          ) AS all_tables
          ORDER BY total_size DESC
      ) AS pretty_sizes;

                 table_name             | table_size | indexes_size | total_size 
    ------------------------------------+------------+--------------+------------
     "public"."crossref"                | 393 GB     | 10127 MB     | 403 GB
     "public"."ingest_request"          | 44 GB      | 41 GB        | 84 GB
     "public"."cdx"                     | 52 GB      | 28 GB        | 80 GB
     "public"."grobid"                  | 72 GB      | 6963 MB      | 79 GB
     "public"."ingest_file_result"      | 38 GB      | 40 GB        | 78 GB
     "public"."grobid_shadow"           | 67 GB      | 5455 MB      | 73 GB
     "public"."file_meta"               | 34 GB      | 21 GB        | 55 GB
     "public"."pdf_meta"                | 20 GB      | 5869 MB      | 26 GB
     "public"."grobid_refs"             | 19 GB      | 1690 MB      | 21 GB
     "public"."fatcat_file"             | 12 GB      | 6602 MB      | 18 GB
     "public"."shadow"                  | 9517 MB    | 8026 MB      | 17 GB
     "public"."html_meta"               | 1200 MB    | 8072 kB      | 1208 MB
     "public"."petabox"                 | 403 MB     | 461 MB       | 864 MB
     "public"."pdftrio"                 | 550 MB     | 297 MB       | 847 MB
     "public"."ingest_fileset_platform" | 8192 bytes | 16 kB        | 24 kB
     "public"."crossref_with_refs"      | 0 bytes    | 0 bytes      | 0 bytes
    (16 rows)


## File Metadata

Counts and total file size:

    SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;

     total_count |   total_size
    -------------+-----------------
       179761501 | 244453538203113

    # 179m files, 244 TB

Top mimetypes:

    SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;

                                     mimetype                                  |   count
    ---------------------------------------------------------------------------+-----------
     application/pdf                                                           | 179376819
     application/octet-stream                                                  |    155379
     text/html                                                                 |    116102
     application/xml                                                           |     42170
     application/xhtml+xml                                                     |     24347
     text/plain                                                                |     15990
     application/jats+xml                                                      |      6899
     application/gzip                                                          |      6491
                                                                               |      6034
     application/postscript                                                    |      4912
     application/vnd.ms-powerpoint                                             |      1672
     application/msword                                                        |       921
     application/x-bzip2                                                       |       891
     image/jpeg                                                                |       794
     image/gif                                                                 |       389
     application/vnd.openxmlformats-officedocument.wordprocessingml.document   |       303
     application/x-compress                                                    |       272
     application/zip                                                           |       131
     image/png                                                                 |       121
     application/CDFV2-unknown                                                 |        99
     application/mac-binhex40                                                  |        79
     application/vnd.openxmlformats-officedocument.spreadsheetml.sheet         |        57
     application/x-dosexec                                                     |        51
     text/x-tex                                                                |        44
     application/vnd.openxmlformats-officedocument.presentationml.presentation |        39
     text/x-php                                                                |        37
     text/rtf                                                                  |        33
     application/x-dvi                                                         |        29
     application/x-rar                                                         |        29
     video/mp4                                                                 |        29
    (30 rows)

Missing full metadata:

    SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;

     count
    -------
     62196

## CDX

Total and unique-by-sha1 counts:

    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;

     unique_sha1 |   total
    -------------+-----------
       119049962 | 149169240


mimetype counts:

    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;

              mimetype          |   count
    ----------------------------+-----------
     application/pdf            | 137271670
     warc/revisit               |   9709493
     application/octet-stream   |    590443
     text/xml                   |    525481
     text/html                  |    421030
     unk                        |    207442
     application/postscript     |     81123
     application/save           |     80988
     binary/octet-stream        |     67476
     image/pdf                  |     39419
     application/x-download     |     38278
     text/plain                 |     36159
     application/download       |     34328
     application/force-download |     19729
     multipart/form-data        |      9105
     application                |      5299
     application/x-msdownload   |      3851
     application/x-octetstream  |      3649
     .pdf                       |      3318
     application/x-pdf          |      2992
     pdf                        |      1484
     file                       |      1364
     application/binary         |      1354
     file/unknown               |      1345
     application/pdf'           |      1196
     application/octetstream    |      1029
     application/unknown        |      1000
     0                          |       764
     text/pdf                   |       704
     application/blob           |       673
    (30 rows)

## GROBID

Counts:

    SELECT COUNT(*) AS total_files FROM grobid;

     total_files
    -------------
       111236904

Status?

    SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;

     status_code |   count
    -------------+-----------
             200 | 102962304
             500 |   8269129
              -4 |      5013
             503 |       548

TODO: how many failed, by mimetype? to check if we are (or have) run non-PDF
files through by mistake

What version used?

    SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;

        grobid_version    |  count
    ----------------------+----------
     0.5.5-fatcat         | 89983404
                          | 12892161
     0.7.0-104-gbeebd9a6b |    86739

## Petabox

Counts:

    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;

     unique_sha1 |  total
    -------------+---------
         2868825 | 2887834

## Ingests

Requests by source:

    SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;

     ingest_type |   link_source   |  count   
    -------------+-----------------+----------
     pdf         | oai             | 51185088
     pdf         | mag             | 43701948
     pdf         | unpaywall       | 37802895
     pdf         | doi             | 28736398
     pdf         | doaj            |  4264610
     html        | doaj            |  2429003
     pdf         | pmc             |  2383398
     pdf         | arxiv           |  2330054
     html        | doi             |    39725
     xml         | doaj            |     9442
     pdf         | cnki_covid19    |     2034
     pdf         | wanfang_covid19 |      975
     pdf         | spn             |      689
     html        | spn             |       48
     xml         | spn             |        1
    (15 rows)

    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;

     ingest_type |   link_source   |  ingest_request_source  |  count
    -------------+-----------------+-------------------------+----------
     pdf         | oai             | metha-bulk              | 51185088
     pdf         | mag             | mag-corpus              | 43701948
     pdf         | unpaywall       | unpaywall               | 37802895
     pdf         | doi             | fatcat-changelog        | 16207728
     pdf         | doi             | fatcat-ingest           |  9012282
     pdf         | doaj            | doaj                    |  4264610
     pdf         | doi             | fatcat-ingest-container |  3515873
     html        | doaj            | doaj                    |  2429003
     pdf         | pmc             | fatcat-ingest-container |  2028825
     pdf         | arxiv           | fatcat-ingest           |  1767705
     pdf         | arxiv           | fatcat-changelog        |   562320
     pdf         | pmc             | fatcat-ingest           |   297527
     pdf         | pmc             | fatcat-changelog        |    57046
     html        | doi             | fatcat-ingest           |    37788
     xml         | doaj            | doaj                    |     9442
     pdf         | cnki_covid19    | scrape-covid19          |     2034
     html        | doi             | fatcat-changelog        |     1897
     pdf         | wanfang_covid19 | scrape-covid19          |      975
     pdf         | spn             | savepapernow-web        |      689
     pdf         | doi             | savepapernow-web        |      613
     html        | spn             | savepapernow-web        |       48
     html        | doi             | savepapernow-web        |       40
     pdf         | arxiv           | fatcat-ingest-container |       26
     pdf         | arxiv           | savepapernow-web        |        3
     xml         | spn             | savepapernow-web        |        1
    (25 rows)

Uncrawled requests by source:

    # TODO: verify this?
    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_request.base_url = ingest_file_result.base_url
        AND ingest_request.ingest_type = ingest_file_result.ingest_type
    WHERE ingest_file_result.base_url IS NULL
    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;

     ingest_type | link_source | count  
    -------------+-------------+--------
     pdf         | mag         | 169076
     pdf         | oai         |  15283
     pdf         | doaj        |   2063
     html        | doaj        |    620
     pdf         | doi         |     22
     pdf         | unpaywall   |     17


Results by source:

    SELECT
        ingest_request.ingest_type,
        ingest_request.link_source,
        COUNT(*) as attempts,
        COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, 
        ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_request.base_url = ingest_file_result.base_url
        AND ingest_request.ingest_type = ingest_file_result.ingest_type
        AND ingest_file_result.ingest_type IS NOT NULL
    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;

     ingest_type |   link_source   | attempts |   hits   | fraction
    -------------+-----------------+----------+----------+----------
     pdf         | oai             | 51185088 | 14554221 |    0.284
     pdf         | mag             | 43701948 | 32643175 |    0.747
     pdf         | unpaywall       | 37802895 | 29989257 |    0.793
     pdf         | doi             | 28736547 |  7690393 |    0.268
     pdf         | doaj            |  4264610 |  2851601 |    0.669
     html        | doaj            |  2429003 |   122937 |    0.051
     pdf         | pmc             |  2383398 |  1821071 |    0.764
     pdf         | arxiv           |  2330054 |  2159738 |    0.927
     html        | doi             |    39725 |     1235 |    0.031
     xml         | doaj            |     9442 |     6897 |    0.730
     pdf         | cnki_covid19    |     2034 |        0 |    0.000
     pdf         | wanfang_covid19 |      975 |      764 |    0.784
     pdf         | spn             |      689 |      503 |    0.730
     html        | spn             |       48 |        5 |    0.104
     xml         | spn             |        1 |        0 |    0.000

Ingest result by status:

    SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;


     ingest_type |            status             |  count   
    -------------+-------------------------------+----------
     pdf         | success                       | 78944243
     pdf         | no-pdf-link                   | 26270027
     pdf         | no-capture                    | 23267156
     pdf         | redirect-loop                 |  9837466
     pdf         | terminal-bad-status           |  4147454
     pdf         | skip-url-blocklist            |  3088907
     pdf         | link-loop                     |  2953891
     pdf         | blocked-cookie                |  1855541
     html        | wrong-scope                   |  1106171
     pdf         | wrong-mimetype                |   859941
     pdf         | gateway-timeout               |   729771
     pdf         | spn2-cdx-lookup-failure       |   584856
     html        | no-capture                    |   423917
     pdf         | forbidden                     |   390804
     pdf         | cdx-error                     |   363091
     pdf         | wayback-content-error         |   354894
     pdf         | null-body                     |   341698
     pdf         | too-many-redirects            |   307096
     pdf         | not-found                     |   294592
     html        | redirect-loop                 |   213032
     html        | unknown-scope                 |   207923
     pdf         | spn2-error                    |   192046
     html        | html-resource-no-capture      |   166119
     html        | success                       |   124177
     pdf         | wayback-error                 |   105385
     html        | null-body                     |   100296
     pdf         | spn2-wayback-error            |    73176
     pdf         | remote-server-error           |    60908
     pdf         | spn2-error:too-many-redirects |    58076
     pdf         | skip-wall                     |    57744
     html        | wayback-content-error         |    53928
     pdf         | read-timeout                  |    42465
     pdf         | invalid-host-resolution       |    37221
     pdf         | petabox-error                 |    28765
     pdf         | spn2-error:unknown            |    23885
     html        | wrong-mimetype                |    18930
     pdf         | bad-redirect                  |    14708
     html        | terminal-bad-status           |    14070
     html        | petabox-error                 |    13770
     html        | spn2-cdx-lookup-failure       |    13002
     pdf         | spn2-error:job-failed         |     9721
     html        | cdx-error                     |     7167
     xml         | success                       |     6897
     pdf         | spn2-error:bad-request        |     4433
     pdf         | spn-remote-error              |     4206
     pdf         | body-too-large                |     3019
     xml         | null-body                     |     2353
     pdf         | other-mimetype                |     2304
     pdf         | error                         |     1900
     pdf         | spn2-error:proxy-error        |     1850
    (50 rows)

Failed ingest by terminal status code:

    SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;

     ingest_type | terminal_status_code |  count
    -------------+----------------------+----------
     pdf         |                  200 | 36821458
     pdf         |                      | 26058729
     pdf         |                  301 |  8466302
     html        |                  200 |  1676730
     pdf         |                  503 |  1028504
     pdf         |                  302 |   949465
     pdf         |                  403 |   936737
     pdf         |                  404 |   687661
     pdf         |                  400 |   507303
     html        |                      |   439356
     pdf         |                  401 |   288994
     pdf         |                  500 |   263775
     html        |                  301 |   211796
     pdf         |                  303 |   130719
     pdf         |                  410 |    66495
     pdf         |                  502 |    41760
     pdf         |                  429 |    35266
     pdf         |                  420 |    26722
     pdf         |                  409 |    15204
     pdf         |                  509 |    15113
     pdf         |                  999 |    11409
     html        |                  404 |     9578
     pdf         |                  307 |     8404
     pdf         |                  308 |     5514
     pdf         |                  202 |     4724
     html        |                  500 |     3628
     xml         |                  200 |     2537
     pdf         |                  520 |     2199
     pdf         |                  206 |     1694
     html        |                  302 |     1138
     pdf         |                  504 |     1124
     pdf         |                  521 |     1085
     pdf         |                  412 |      921
     pdf         |                  421 |      714
     pdf         |                  300 |      461
     pdf         |                  505 |      436
     pdf         |                  406 |      427
     pdf         |                  508 |      408
     html        |                  403 |      382
     html        |                  503 |      378
     html        |                  303 |      268
     pdf         |                  204 |      252
     pdf         |                  226 |      166
     pdf         |                  402 |       70
     html        |                  502 |       68
     pdf         |                  523 |       55
     pdf         |                  408 |       53
     pdf         |                  432 |       45
     pdf         |                  530 |       31
     pdf         |                  416 |       31
    (50 rows)