1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
|
Quick notes on how daily ingest is going, circa September/October 2020.
-- Per-day totals of 'pdf' ingest results for requests from the
-- 'fatcat-changelog' source created within the last month, newest day first.
--   total   = every joined result row for that request-creation day
--   success = the subset of those rows whose status is 'success'
SELECT
    req.ingest_type,
    date(req.created),
    COUNT(*) AS total,
    COUNT(*) FILTER (WHERE res.status = 'success') AS success
FROM ingest_file_result AS res
-- Every WHERE predicate below tests a req.* column, so results without a
-- matching request are discarded either way; an inner join states that
-- directly and returns the same rows.
INNER JOIN ingest_request AS req
    ON res.ingest_type = req.ingest_type
    AND res.base_url = req.base_url
WHERE req.ingest_request_source = 'fatcat-changelog'
    AND req.ingest_type = 'pdf'
    AND req.created >= NOW() - INTERVAL '1 month'
-- res.ingest_type always equals req.ingest_type (join condition), so
-- grouping by the request-side column alone yields the same groups.
GROUP BY
    req.ingest_type,
    date(req.created)
ORDER BY date(req.created) DESC;
ingest_type | date | total | success
-------------+------------+-------+---------
pdf | 2020-10-10 | 6145 | 1368
pdf | 2020-10-09 | 28453 | 6461
pdf | 2020-10-08 | 15105 | 3803
pdf | 2020-10-07 | 34213 | 10813
pdf | 2020-10-06 | 22263 | 8565
pdf | 2020-10-05 | 7910 | 3200
pdf | 2020-10-04 | 10865 | 4579
pdf | 2020-10-03 | 27745 | 10818
pdf | 2020-10-02 | 34320 | 13523
pdf | 2020-10-01 | 32548 | 13252
pdf | 2020-09-30 | 34798 | 14113
pdf | 2020-09-29 | 22463 | 8328
pdf | 2020-09-28 | 4117 | 1278
pdf | 2020-09-27 | 5894 | 1732
pdf | 2020-09-26 | 34949 | 13901
pdf | 2020-09-25 | 33680 | 10605
pdf | 2020-09-24 | 15125 | 5785
pdf | 2020-09-23 | 20866 | 6584
pdf | 2020-09-22 | 20949 | 7167
pdf | 2020-09-21 | 22483 | 7308
pdf | 2020-09-20 | 45644 | 16981
pdf | 2020-09-19 | 95571 | 31991
pdf | 2020-09-18 | 50849 | 15875
pdf | 2020-09-17 | 20121 | 3158
pdf | 2020-09-16 | 39184 | 12150
pdf | 2020-09-15 | 16986 | 7705
(26 rows)
-- The 20 most frequent result statuses for 'pdf' ingest requests from the
-- 'fatcat-changelog' source created within the last 30 days.
SELECT
    res.ingest_type,
    res.status,
    COUNT(*) AS count
FROM ingest_file_result AS res
-- The WHERE predicates on req.* already drop results that have no matching
-- request, so this is an inner join in effect.
INNER JOIN ingest_request AS req
    ON res.ingest_type = req.ingest_type
    AND res.base_url = req.base_url
WHERE req.ingest_request_source = 'fatcat-changelog'
    AND req.ingest_type = 'pdf'
    AND req.created >= NOW() - INTERVAL '30 day'
GROUP BY
    res.ingest_type,
    res.status
-- Sorts on the aggregate output column named explicitly above.
ORDER BY count DESC
LIMIT 20;
ingest_type | status | count
-------------+-------------------------------------+--------
pdf | success | 241047
pdf | no-pdf-link | 143084
pdf | spn2-cdx-lookup-failure | 108311
pdf | gateway-timeout | 97250
pdf | cdx-error | 61820
pdf | link-loop | 31350
pdf | wayback-error | 9139
pdf | spn2-error:job-failed | 4240
pdf | spn2-error | 3893
pdf | wrong-mimetype | 1010
pdf | no-capture | 851
pdf | null-body | 605
pdf | redirect-loop | 261
pdf | spn2-error:soft-time-limit-exceeded | 126
pdf | terminal-bad-status | 120
pdf | petabox-error | 105
pdf | timeout | 29
pdf | spn2-error:no-status | 2
pdf | spn2-error:invalid-server-response | 2
pdf | bad-gzip-encoding | 1
(20 rows)
-- The 25 most frequent (terminal domain, status) pairs among results whose
-- status is anything other than 'success', for 'pdf' ingest requests from the
-- 'fatcat-changelog' source, restricted to results updated in the last 30 days.
SELECT
    domain,
    status,
    -- Both grouped columns are non-NULL after the outer filter, so a plain
    -- row count equals the count over the (domain, status) pair.
    COUNT(*) AS count
FROM (
    SELECT
        res.status,
        -- Host part of the terminal URL: the text between '://' and the
        -- next '/'. NULL when the URL does not match the pattern.
        substring(res.terminal_url FROM '[^/]+://([^/]*)') AS domain
    FROM ingest_file_result AS res
    INNER JOIN ingest_request AS req
        ON res.ingest_type = req.ingest_type
        AND res.base_url = req.base_url
    WHERE req.ingest_request_source = 'fatcat-changelog'
        AND req.ingest_type = 'pdf'
        -- The window is on when the result was last updated. The disabled
        -- alternative was a window on req.created over the last '3 day'.
        AND res.updated >= NOW() - INTERVAL '30 day'
) AS terminal
-- Drops rows with an empty or NULL domain, and all successful results.
WHERE terminal.domain <> ''
    AND terminal.status <> 'success'
GROUP BY
    domain,
    status
ORDER BY count DESC
LIMIT 25;
domain | status | count
------------------------------+-------------------------+-------
zenodo.org | no-pdf-link | 52767
www.degruyter.com | link-loop | 17666
www.degruyter.com | spn2-cdx-lookup-failure | 17597
ieeexplore.ieee.org | gateway-timeout | 15290
www.sciencedirect.com | no-pdf-link | 14043
apps.crossref.org | no-pdf-link | 11531
figshare.com | no-pdf-link | 8966
tandf.figshare.com | no-pdf-link | 7276
zenodo.org | no-capture | 7191
springernature.figshare.com | no-pdf-link | 6485
www.taylorfrancis.com | link-loop | 6266
www.persee.fr | terminal-bad-status | 6031
journals.openedition.org | gateway-timeout | 5639
www.cairn.info | link-loop | 5618
archaeologydataservice.ac.uk | no-pdf-link | 5359
www.taylorfrancis.com | spn2-cdx-lookup-failure | 4748
www.e-periodica.ch | no-pdf-link | 4722
osf.io | no-capture | 4247
cancerres.aacrjournals.org | no-pdf-link | 4136
dlc.library.columbia.edu | no-pdf-link | 4085
www.egms.de | no-pdf-link | 3304
journals.lww.com | no-pdf-link | 3218
journals.plos.org | no-pdf-link | 3005
linkinghub.elsevier.com | gateway-timeout | 2833
www.egms.de | redirect-loop | 2606
(25 rows)
-- The 25 terminal domains with the most 'success' results, for 'pdf' ingest
-- requests from the 'fatcat-changelog' source, restricted to results updated
-- in the last 30 days.
SELECT
    domain,
    status,
    -- Both grouped columns are non-NULL after the outer filter, so a plain
    -- row count equals the count over the (domain, status) pair.
    COUNT(*) AS count
FROM (
    SELECT
        res.status,
        -- Host part of the terminal URL: the text between '://' and the
        -- next '/'. NULL when the URL does not match the pattern.
        substring(res.terminal_url FROM '[^/]+://([^/]*)') AS domain
    FROM ingest_file_result AS res
    INNER JOIN ingest_request AS req
        ON res.ingest_type = req.ingest_type
        AND res.base_url = req.base_url
    WHERE req.ingest_request_source = 'fatcat-changelog'
        AND req.ingest_type = 'pdf'
        -- The window is on when the result was last updated. The disabled
        -- alternative was a window on req.created over the last '3 day'.
        AND res.updated >= NOW() - INTERVAL '30 day'
) AS terminal
-- Drops rows with an empty or NULL domain; keeps successful results only.
WHERE terminal.domain <> ''
    AND terminal.status = 'success'
GROUP BY
    domain,
    status
ORDER BY count DESC
LIMIT 25;
domain | status | count
--------------------------------------+---------+-------
zenodo.org | success | 55549
arxiv.org | success | 24450
s3-eu-west-1.amazonaws.com | success | 18156
res.mdpi.com | success | 13493
www.degruyter.com | success | 12009
journals.openedition.org | success | 11235
www.jstage.jst.go.jp | success | 9460
peer.asee.org | success | 9416
www.e-periodica.ch | success | 8105
ir.canterbury.ac.nz | success | 6381
europepmc.org | success | 5670
www.repository.cam.ac.uk | success | 4858
assets.researchsquare.com | success | 4765
fjfsdata01prod.blob.core.windows.net | success | 4130
tidsskrift.dk | success | 3964
research-journal.org | success | 3127
ieeexplore.ieee.org | success | 2947
dergipark.org.tr | success | 2892
watermark.silverchair.com | success | 2315
journals.plos.org | success | 2304
journal.fi | success | 1996
publications.rwth-aachen.de | success | 1954
www.brazilianjournals.com | success | 1637
article.sciencepublishinggroup.com | success | 1589
revistas.upr.edu | success | 1467
(25 rows)
Casual take-aways:
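- wonder what `apps.crossref.org` is (11,531 `no-pdf-link` results)
- sciencedirect crawling broken? (`www.sciencedirect.com`: 14,043 `no-pdf-link` results)
- figshare might be broken? or just very little success (`no-pdf-link` counts: 8,966 for `figshare.com`, 7,276 for `tandf.figshare.com`, 6,485 for `springernature.figshare.com`)
- seems like a lot of `journals.plos.org` failures (3,005 `no-pdf-link` vs. 2,304 `success`)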
|