aboutsummaryrefslogtreecommitdiffstats
path: root/sql/stats/2020-02-24_stats.txt
blob: e7a00e85f4f790a28acd80b1a0ad1bbb80908de7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482

## SQL Table Sizes

    SELECT
        table_name,
        pg_size_pretty(table_size) AS table_size,
        pg_size_pretty(indexes_size) AS indexes_size,
        pg_size_pretty(total_size) AS total_size
      FROM (
          SELECT
              table_name,
              pg_table_size(table_name) AS table_size,
              pg_indexes_size(table_name) AS indexes_size,
              pg_total_relation_size(table_name) AS total_size
          FROM (
              SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
              FROM information_schema.tables
              WHERE table_schema = 'public'
          ) AS all_tables
          ORDER BY total_size DESC
      ) AS pretty_sizes;


    Size:  271.83G

              table_name           | table_size | indexes_size | total_size 
    -------------------------------+------------+--------------+------------
     "public"."cdx"                | 42 GB      | 36 GB        | 78 GB
     "public"."grobid_shadow"      | 61 GB      | 6553 MB      | 68 GB
     "public"."grobid"             | 47 GB      | 7213 MB      | 54 GB
     "public"."file_meta"          | 26 GB      | 12 GB        | 38 GB
     "public"."shadow"             | 8303 MB    | 9216 MB      | 17 GB
     "public"."fatcat_file"        | 5206 MB    | 2094 MB      | 7300 MB
     "public"."ingest_file_result" | 1831 MB    | 2454 MB      | 4285 MB
     "public"."ingest_request"     | 2006 MB    | 2122 MB      | 4128 MB
     "public"."petabox"            | 403 MB     | 594 MB       | 997 MB
     "public"."pdftrio"            | 78 MB      | 64 MB        | 142 MB
    (10 rows)


## File Metadata

(skipping, no update)


## CDX

Total and unique-by-sha1 counts:

    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;

Processed or not:

    # TODO:

## GROBID

Counts:

    SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total  FROM grobid;

     unique_releases |  total
    -----------------+----------
          15,632,810 | 76,555,791
    (1 row)

Status?

    SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;

     status_code |  count   
    -------------+----------
             200 | 70656028
             500 |  5896836
              -4 |     2295
             503 |      111
    (4 rows)
 
 What version used?
 
    SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;

     grobid_version |  count
    ----------------+----------
     0.5.5-fatcat   | 56001631
                    | 14654496
    (2 rows)

## Petabox

Counts:

    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;

     unique_sha1 |  total
    -------------+---------
       2,868,825 | 2,887,834
    (1 row)

## Ingests

Requests by source:

    SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;

     ingest_type | link_source |  count  
    -------------+-------------+---------
     pdf         | doi         | 6591633
     pdf         | pmc         | 2030279
     pdf         | arxiv       |  630743
     pdf         | unpaywall   |    1400
     pdf         | spn         |      82
     pdf         | pubmed      |       2
    (6 rows)

    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;

     ingest_type | link_source |  ingest_request_source  |  count
    -------------+-------------+-------------------------+---------
     pdf         | doi         | fatcat-ingest-container | 3515873
     pdf         | doi         |                         | 2943896
     pdf         | pmc         | fatcat-ingest-container | 2028825
     pdf         | arxiv       |                         |  629719
     pdf         | doi         | fatcat-changelog        |  129932
     pdf         | doi         | fatcat-ingest           |    1935
     pdf         | pmc         |                         |    1454
     pdf         | unpaywall   | unpaywall               |    1400
     pdf         | arxiv       | fatcat-ingest           |     998
     pdf         | spn         |                         |      64
     pdf         | arxiv       | fatcat-ingest-container |      26
     pdf         | spn         | savepapernow-web        |      18
     pdf         | pubmed      |                         |       2
     pdf         | doi         | savepapernow-web        |       1
    (14 rows)

    SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
    UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
    => UPDATE 2943896

    SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
    UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
    => UPDATE 64

    SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
    UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
    => UPDATE 629719

    SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
    UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
    => UPDATE 1454

    SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed';
    DELETE FROM ingest_request WHERE link_source = 'pubmed';
    => DELETE 2

    SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;

         ingest_type | link_source |  count  
    -------------+-------------+---------
     pdf         | doi         | 6591637
     pdf         | pmc         | 2030279
     pdf         | arxiv       |  630743
     pdf         | unpaywall   |    1400
     pdf         | spn         |      82
    (5 rows)

    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;

     ingest_type | link_source |  ingest_request_source  |  count  
    -------------+-------------+-------------------------+---------
     pdf         | doi         | fatcat-ingest-container | 3515873
     pdf         | doi         | fatcat-changelog        | 3073828
     pdf         | pmc         | fatcat-ingest-container | 2028825
     pdf         | arxiv       | fatcat-ingest           |  630717
     pdf         | doi         | fatcat-ingest           |    1935
     pdf         | pmc         | fatcat-ingest           |    1454
     pdf         | unpaywall   | unpaywall               |    1400
     pdf         | spn         | savepapernow-web        |      82
     pdf         | arxiv       | fatcat-ingest-container |      26
     pdf         | doi         | savepapernow-web        |       1
    (10 rows)

Uncrawled requests by source:

    # TODO: verify this?
    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_request.base_url = ingest_file_result.base_url
        AND ingest_request.ingest_type = ingest_file_result.ingest_type
    WHERE ingest_file_result.base_url IS NULL
    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;

    none?

Results by source:

    SELECT
        ingest_request.ingest_type,
        ingest_request.link_source,
        COUNT(*) as attempts,
        COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, 
        ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_request.base_url = ingest_file_result.base_url
        AND ingest_request.ingest_type = ingest_file_result.ingest_type
        AND ingest_file_result.ingest_type IS NOT NULL
    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;


     ingest_type | link_source | attempts |  hits   | fraction 
    -------------+-------------+----------+---------+----------
     pdf         | doi         |  6591637 | 1622702 |    0.246
     pdf         | pmc         |  2030279 | 1241836 |    0.612
     pdf         | arxiv       |   630743 |  500620 |    0.794
     pdf         | unpaywall   |     1400 |     851 |    0.608
     pdf         | spn         |       82 |      62 |    0.756
    (5 rows)

Ingest result by status:

    SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;

     ingest_type |               status                |  count  
    -------------+-------------------------------------+---------
     pdf         | success                             | 3366189
     pdf         | no-pdf-link                         | 2902620
     pdf         | no-capture                          | 1672025
     pdf         | redirect-loop                       |  388844
     pdf         | cdx-error                           |  272780
     pdf         | terminal-bad-status                 |  171878
     pdf         | spn-remote-error                    |  163843
     pdf         | spn-error                           |  108070
     pdf         | null-body                           |   66778
     pdf         | link-loop                           |   43403
     pdf         | skip-url-blocklist                  |   34705
     pdf         | wrong-mimetype                      |   31343
     pdf         | wayback-error                       |   13012
     pdf         | spn2-cdx-lookup-failure             |    6100
     pdf         | gateway-timeout                     |    5633
     pdf         | other-mimetype                      |    5114
     pdf         | spn2-error:proxy-error              |     538
     pdf         | spn2-error:job-failed               |     470
     pdf         | petabox-error                       |     415
     pdf         | spn2-error:browser-running-error    |     136
     pdf         | spn2-error                          |     127
     pdf         | spn2-error:soft-time-limit-exceeded |      71
     pdf         | bad-redirect                        |      39
     pdf         | spn2-error:unknown                  |      30
     pdf         | spn2-error:browsing-timeout         |      25
     pdf         | pending                             |       3
     pdf         | invalid-host-resolution             |       1
    (27 rows)


## Fatcat Files

(skipping, no update)

## Recent Success/Failure of Ingest by Domain

NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a
small fraction of this is from changelog.

    # "problem domains" and statuses
    SELECT domain, status, COUNT((domain, status))     
    FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
    WHERE t1.domain != ''
        AND t1.status != 'success'
        AND t1.updated >= NOW() - '1 day'::INTERVAL    
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 10;

             domain          |     status     | count
    -------------------------+----------------+-------
     linkinghub.elsevier.com | no-capture     |  2579
     www.mdpi.com            | wrong-mimetype |  1313
     onlinelibrary.wiley.com | no-pdf-link    |   785
     americanarchivist.org   | no-pdf-link    |   756
     journals.sagepub.com    | redirect-loop  |   503
     link.springer.com       | redirect-loop  |   432
     iopscience.iop.org      | no-capture     |   392
     www.tandfonline.com     | no-pdf-link    |   389
     pubs.rsc.org            | no-capture     |   361
     www.persee.fr           | no-capture     |   344
    (10 rows)


    # "what type of errors"
    SELECT ingest_type, status, COUNT(*)
    FROM ingest_file_result
    WHERE updated >= NOW() - '1 day'::INTERVAL
    GROUP BY ingest_type, status
    ORDER BY COUNT DESC
    LIMIT 25;    

     ingest_type |               status                | count 
    -------------+-------------------------------------+-------
     pdf         | success                             | 40578
     pdf         | cdx-error                           | 14982
     pdf         | no-capture                          |  7747
     pdf         | no-pdf-link                         |  7111
     pdf         | redirect-loop                       |  3265
     pdf         | wrong-mimetype                      |  1629
     pdf         | spn2-cdx-lookup-failure             |   657
     pdf         | link-loop                           |   538
     pdf         | null-body                           |   517
     pdf         | terminal-bad-status                 |   400
     pdf         | wayback-error                       |    79
     pdf         | spn2-error:job-failed               |    53
     pdf         | gateway-timeout                     |    38
     pdf         | spn2-error:soft-time-limit-exceeded |     7
     pdf         | spn2-error                          |     6
     pdf         | petabox-error                       |     5
     pdf         | spn2-error:browsing-timeout         |     4
     pdf         | spn2-error:unknown                  |     2
     pdf         | bad-redirect                        |     1
     pdf         | pending                             |     1
    (20 rows)

    # "throughput per day for last N days"
    SELECT ingest_type,
           date(updated),
           COUNT(*) as total,
           COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success
    FROM ingest_file_result
    WHERE updated >= NOW() - '1 month'::INTERVAL
    GROUP BY ingest_type, date(updated)
    ORDER BY date(updated) DESC;

     ingest_type |    date    |  total  | success 
    -------------+------------+---------+---------
     pdf         | 2020-02-25 |   32660 |   14322
     pdf         | 2020-02-24 |   44967 |   26263
     pdf         | 2020-02-23 |   58795 |   18874
     pdf         | 2020-02-22 |  844249 |  272606
     pdf         | 2020-02-21 | 1287378 |  433487
     pdf         | 2020-02-20 | 1455943 |  492408
     pdf         | 2020-02-19 |   21453 |    7529
     pdf         | 2020-02-18 |    5863 |    2926
     pdf         | 2020-02-17 |    3737 |     970
     pdf         | 2020-02-16 |   13779 |    4862
     pdf         | 2020-02-15 | 1021020 |  623020
     pdf         | 2020-02-14 | 1036036 |  632830
     pdf         | 2020-02-13 |   13503 |    5824
     pdf         | 2020-02-12 |   20078 |   11422
     pdf         | 2020-02-11 |   13499 |    6781
     pdf         | 2020-02-10 |    2275 |     961
     pdf         | 2020-02-09 |    3231 |    1494
     pdf         | 2020-02-08 |    8967 |    4400
     pdf         | 2020-02-07 |    7022 |    2430
     pdf         | 2020-02-06 |    1291 |     516
     pdf         | 2020-02-05 |    8586 |    6596
     pdf         | 2020-02-04 |    3681 |    3593
     pdf         | 2020-02-03 |     284 |     284
     pdf         | 2020-02-02 |     480 |     480
     pdf         | 2020-02-01 |     489 |     336
     pdf         | 2020-01-31 |    1187 |    1130
     pdf         | 2020-01-30 |    1613 |    1288
     pdf         | 2020-01-29 |     947 |     279
     pdf         | 2020-01-28 |     667 |     323
    (29 rows)

Top "no-capture" domains (will need to re-ingest using live tool):

    SELECT domain, status, COUNT((domain, status))     
    FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
    WHERE t1.domain != ''
        AND t1.status = 'no-capture'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 25;

              domain          |   status   | count  
    --------------------------+------------+--------
     linkinghub.elsevier.com  | no-capture | 320065
     iopscience.iop.org       | no-capture |  46858
     pubs.rsc.org             | no-capture |  43331
     www.persee.fr            | no-capture |  38971
     www.doiserbia.nb.rs      | no-capture |  27112
     academic.oup.com         | no-capture |  18877
     www.osapublishing.org    | no-capture |  17113
     osf.io                   | no-capture |  16978
     scripts.iucr.org         | no-capture |  14844
     www.degruyter.com        | no-capture |   8093
     mab-online.nl            | no-capture |   6603
     insights.ovid.com        | no-capture |   6457
     ir.lib.uth.gr            | no-capture |   3625
     www.sciencedirect.com    | no-capture |   3244
     www.tandfonline.com      | no-capture |   3201
     www.ccsenet.org          | no-capture |   2849
     www.intechopen.com       | no-capture |   2813
     primary-hospital-care.ch | no-capture |   2774
     www.nature.com           | no-capture |   2484
     www.indianjournals.com   | no-capture |   2432
     journals.aps.org         | no-capture |   2197
     journals.sagepub.com     | no-capture |   2064
     www.episodes.org         | no-capture |   1805
     periodicos.uninove.br    | no-capture |   1692
     escholarship.org         | no-capture |   1666
    (25 rows)

Top "no-pdf-link" domains:

    SELECT domain, status, COUNT((domain, status))     
    FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
    WHERE t1.domain != ''
        AND t1.status = 'no-pdf-link'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 25;

               domain            |   status    | count  
    -----------------------------+-------------+--------
     plutof.ut.ee                | no-pdf-link | 685315
     www.gbif.org                | no-pdf-link | 670647
     doi.pangaea.de              | no-pdf-link | 301984
     www.plate-archive.org       | no-pdf-link | 209218
     onlinelibrary.wiley.com     | no-pdf-link |  84890
     figshare.com                | no-pdf-link |  72892
     zenodo.org                  | no-pdf-link |  45768
     www.tandfonline.com         | no-pdf-link |  43848
     data.mendeley.com           | no-pdf-link |  42367
     springernature.figshare.com | no-pdf-link |  35941
     dhz.uni-passau.de           | no-pdf-link |  29187
     www.frontiersin.org         | no-pdf-link |  17925
     digital.ucd.ie              | no-pdf-link |  16769
     mr.crossref.org             | no-pdf-link |  14999
     journals.lww.com            | no-pdf-link |  12122
     musewide.aip.de             | no-pdf-link |  10854
     datadryad.org               | no-pdf-link |  10686
     www.jstor.org               | no-pdf-link |   9159
     koreascience.or.kr          | no-pdf-link |   9067
     easy.dans.knaw.nl           | no-pdf-link |   8264
     scielo.conicyt.cl           | no-pdf-link |   8069
     www.degruyter.com           | no-pdf-link |   7989
     www.kci.go.kr               | no-pdf-link |   6990
     www.m-hikari.com            | no-pdf-link |   6941
     cshprotocols.cshlp.org      | no-pdf-link |   6553
    (25 rows)

Top block-ish domains:

    SELECT domain, status, COUNT((domain, status))     
    FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
    WHERE t1.domain != ''
        AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status')
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 25;

                 domain              |       status        | count 
    ---------------------------------+---------------------+-------
     journals.openedition.org        | redirect-loop       | 30395
     ieeexplore.ieee.org             | redirect-loop       | 28926
     www.degruyter.com               | redirect-loop       | 18891
     www.cairn.info                  | link-loop           |  8919
     www.frontiersin.org             | terminal-bad-status |  6786
     projecteuclid.org               | link-loop           |  6098
     www.mdpi.com                    | terminal-bad-status |  5189
     medicalforum.ch                 | terminal-bad-status |  4596
     jrnl.nau.edu.ua                 | link-loop           |  4238
     www.revistas.unam.mx            | link-loop           |  3926
     journals.aps.org                | redirect-loop       |  3696
     www.ijcseonline.org             | redirect-loop       |  3567
     www.researchsquare.com          | terminal-bad-status |  3453
     www.persee.fr                   | terminal-bad-status |  3221
     www.baltistica.lt               | link-loop           |  2098
     osf.io                          | redirect-loop       |  2004
     seer.ufrgs.br                   | terminal-bad-status |  2002
     jtd.amegroups.com               | link-loop           |  1738
     www.hindawi.com                 | terminal-bad-status |  1613
     linkinghub.elsevier.com         | redirect-loop       |  1612
     www.scienceopen.com             | terminal-bad-status |  1580
     atm.amegroups.com               | link-loop           |  1571
     scielo.conicyt.cl               | terminal-bad-status |  1491
     repozytorium.ur.edu.pl          | redirect-loop       |  1279
     agupubs.onlinelibrary.wiley.com | link-loop           |  1182
    (25 rows)