aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2020-10_daily.md
blob: d2bb50bb5153a7b78252be31f10bbcd25e082c10 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193

Quick notes on how daily ingest is going, circa September/October 2020.


    -- Per-day totals and success counts for 'pdf' ingest requests that came
    -- from the 'fatcat-changelog' source and were created in the last month.
    -- Each counted row is one (result, request) pair matched on
    -- (ingest_type, base_url).
    SELECT
        req.ingest_type,
        date(req.created),
        COUNT(*) AS total,
        COUNT(*) FILTER (WHERE res.status = 'success') AS success
    FROM ingest_file_result AS res
    -- The WHERE clause filters on request columns, so results without a
    -- matching request never appear in the output.
    INNER JOIN ingest_request AS req
        ON req.ingest_type = res.ingest_type
        AND req.base_url = res.base_url
    WHERE req.ingest_request_source = 'fatcat-changelog'
        AND req.ingest_type = 'pdf'
        AND req.created >= NOW() - INTERVAL '1 month'
    GROUP BY req.ingest_type, date(req.created)
    ORDER BY date(req.created) DESC;

     ingest_type |    date    | total | success
    -------------+------------+-------+---------
     pdf         | 2020-10-10 |  6145 |    1368
     pdf         | 2020-10-09 | 28453 |    6461
     pdf         | 2020-10-08 | 15105 |    3803
     pdf         | 2020-10-07 | 34213 |   10813
     pdf         | 2020-10-06 | 22263 |    8565
     pdf         | 2020-10-05 |  7910 |    3200
     pdf         | 2020-10-04 | 10865 |    4579
     pdf         | 2020-10-03 | 27745 |   10818
     pdf         | 2020-10-02 | 34320 |   13523
     pdf         | 2020-10-01 | 32548 |   13252
     pdf         | 2020-09-30 | 34798 |   14113
     pdf         | 2020-09-29 | 22463 |    8328
     pdf         | 2020-09-28 |  4117 |    1278
     pdf         | 2020-09-27 |  5894 |    1732
     pdf         | 2020-09-26 | 34949 |   13901
     pdf         | 2020-09-25 | 33680 |   10605
     pdf         | 2020-09-24 | 15125 |    5785
     pdf         | 2020-09-23 | 20866 |    6584
     pdf         | 2020-09-22 | 20949 |    7167
     pdf         | 2020-09-21 | 22483 |    7308
     pdf         | 2020-09-20 | 45644 |   16981
     pdf         | 2020-09-19 | 95571 |   31991
     pdf         | 2020-09-18 | 50849 |   15875
     pdf         | 2020-09-17 | 20121 |    3158
     pdf         | 2020-09-16 | 39184 |   12150
     pdf         | 2020-09-15 | 16986 |    7705
    (26 rows)


    -- Top 20 result statuses for 'pdf' ingest requests from the
    -- 'fatcat-changelog' source created in the last 30 days, most common first.
    SELECT
        res.ingest_type,
        res.status,
        COUNT(*)
    FROM ingest_file_result AS res
    -- The WHERE clause filters on request columns, so results without a
    -- matching request never appear in the output.
    INNER JOIN ingest_request AS req
        ON req.ingest_type = res.ingest_type
        AND req.base_url = res.base_url
    WHERE req.ingest_request_source = 'fatcat-changelog'
        AND req.ingest_type = 'pdf'
        AND req.created >= NOW() - INTERVAL '30 day'
    GROUP BY res.ingest_type, res.status
    ORDER BY COUNT(*) DESC
    LIMIT 20;

     ingest_type |               status                | count
    -------------+-------------------------------------+--------
     pdf         | success                             | 241047
     pdf         | no-pdf-link                         | 143084
     pdf         | spn2-cdx-lookup-failure             | 108311
     pdf         | gateway-timeout                     |  97250
     pdf         | cdx-error                           |  61820
     pdf         | link-loop                           |  31350
     pdf         | wayback-error                       |   9139
     pdf         | spn2-error:job-failed               |   4240
     pdf         | spn2-error                          |   3893
     pdf         | wrong-mimetype                      |   1010
     pdf         | no-capture                          |    851
     pdf         | null-body                           |    605
     pdf         | redirect-loop                       |    261
     pdf         | spn2-error:soft-time-limit-exceeded |    126
     pdf         | terminal-bad-status                 |    120
     pdf         | petabox-error                       |    105
     pdf         | timeout                             |     29
     pdf         | spn2-error:no-status                |      2
     pdf         | spn2-error:invalid-server-response  |      2
     pdf         | bad-gzip-encoding                   |      1
    (20 rows)

    -- Top 25 (domain, status) pairs among non-success results updated in the
    -- last 30 days, for 'pdf' requests from the 'fatcat-changelog' source.
    WITH recent_results AS (
        SELECT
            res.status,
            -- Host part of the terminal URL: the text between "scheme://"
            -- and the next "/".
            substring(res.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result AS res
        INNER JOIN ingest_request AS req
            ON req.ingest_type = res.ingest_type
            AND req.base_url = res.base_url
        WHERE
            -- Alternative window, by request creation time instead:
            --   req.created >= NOW() - INTERVAL '3 day'
            res.updated >= NOW() - INTERVAL '30 day'
            AND req.ingest_type = 'pdf'
            AND req.ingest_request_source = 'fatcat-changelog'
    )
    SELECT
        domain,
        status,
        COUNT(*)
    FROM recent_results
    WHERE domain != ''
        AND status != 'success'
    GROUP BY domain, status
    ORDER BY COUNT(*) DESC
    LIMIT 25;


                domain            |         status          | count
    ------------------------------+-------------------------+-------
     zenodo.org                   | no-pdf-link             | 52767
     www.degruyter.com            | link-loop               | 17666
     www.degruyter.com            | spn2-cdx-lookup-failure | 17597
     ieeexplore.ieee.org          | gateway-timeout         | 15290
     www.sciencedirect.com        | no-pdf-link             | 14043
     apps.crossref.org            | no-pdf-link             | 11531
     figshare.com                 | no-pdf-link             |  8966
     tandf.figshare.com           | no-pdf-link             |  7276
     zenodo.org                   | no-capture              |  7191
     springernature.figshare.com  | no-pdf-link             |  6485
     www.taylorfrancis.com        | link-loop               |  6266
     www.persee.fr                | terminal-bad-status     |  6031
     journals.openedition.org     | gateway-timeout         |  5639
     www.cairn.info               | link-loop               |  5618
     archaeologydataservice.ac.uk | no-pdf-link             |  5359
     www.taylorfrancis.com        | spn2-cdx-lookup-failure |  4748
     www.e-periodica.ch           | no-pdf-link             |  4722
     osf.io                       | no-capture              |  4247
     cancerres.aacrjournals.org   | no-pdf-link             |  4136
     dlc.library.columbia.edu     | no-pdf-link             |  4085
     www.egms.de                  | no-pdf-link             |  3304
     journals.lww.com             | no-pdf-link             |  3218
     journals.plos.org            | no-pdf-link             |  3005
     linkinghub.elsevier.com      | gateway-timeout         |  2833
     www.egms.de                  | redirect-loop           |  2606
    (25 rows)

    -- Top 25 domains by number of successful results updated in the last
    -- 30 days, for 'pdf' requests from the 'fatcat-changelog' source.
    WITH recent_results AS (
        SELECT
            res.status,
            -- Host part of the terminal URL: the text between "scheme://"
            -- and the next "/".
            substring(res.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result AS res
        INNER JOIN ingest_request AS req
            ON req.ingest_type = res.ingest_type
            AND req.base_url = res.base_url
        WHERE
            -- Alternative window, by request creation time instead:
            --   req.created >= NOW() - INTERVAL '3 day'
            res.updated >= NOW() - INTERVAL '30 day'
            AND req.ingest_type = 'pdf'
            AND req.ingest_request_source = 'fatcat-changelog'
    )
    SELECT
        domain,
        status,
        COUNT(*)
    FROM recent_results
    WHERE domain != ''
        AND status = 'success'
    GROUP BY domain, status
    ORDER BY COUNT(*) DESC
    LIMIT 25;

                    domain                | status  | count
    --------------------------------------+---------+-------
     zenodo.org                           | success | 55549
     arxiv.org                            | success | 24450
     s3-eu-west-1.amazonaws.com           | success | 18156
     res.mdpi.com                         | success | 13493
     www.degruyter.com                    | success | 12009
     journals.openedition.org             | success | 11235
     www.jstage.jst.go.jp                 | success |  9460
     peer.asee.org                        | success |  9416
     www.e-periodica.ch                   | success |  8105
     ir.canterbury.ac.nz                  | success |  6381
     europepmc.org                        | success |  5670
     www.repository.cam.ac.uk             | success |  4858
     assets.researchsquare.com            | success |  4765
     fjfsdata01prod.blob.core.windows.net | success |  4130
     tidsskrift.dk                        | success |  3964
     research-journal.org                 | success |  3127
     ieeexplore.ieee.org                  | success |  2947
     dergipark.org.tr                     | success |  2892
     watermark.silverchair.com            | success |  2315
     journals.plos.org                    | success |  2304
     journal.fi                           | success |  1996
     publications.rwth-aachen.de          | success |  1954
     www.brazilianjournals.com            | success |  1637
     article.sciencepublishinggroup.com   | success |  1589
     revistas.upr.edu                     | success |  1467
    (25 rows)

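Casual take-aways:
- wonder what `apps.crossref.org` is (11,531 `no-pdf-link` results)
- is sciencedirect crawling broken? (14,043 `no-pdf-link` results for `www.sciencedirect.com`, and it is absent from the top-25 success domains)
- figshare might be broken, or just has very little success (three figshare domains among the top failures, none in the top-25 success domains)
- seems like a lot of `journals.plos.org` failures (3,005 `no-pdf-link` vs 2,304 successes)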