This is just a load and bulk ingest; a separate 'TARGETED' heritrix bulk crawl
will follow, covering the JALC and DOAJ URLs.

    export SNAPSHOT=2022-07-20

## Transform and Load

    # on sandcrawler-vm
    mkdir -p /srv/sandcrawler/tasks/doaj
    cd /srv/sandcrawler/tasks/doaj
    wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"

    # in pipenv, in python directory
    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
    # 9.72M 0:36:28 [4.44k/s]

    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
    # 9.72M 0:17:04 [9.49k/s]
    # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
    # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})
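
For reference, a minimal sketch of the kind of mapping `doaj2ingestrequest.py`
performs: each DOAJ article record becomes one ingest request per fulltext
link, with the ingest type guessed from the link's content type. The field
names and content-type mapping here are assumptions for illustration, not the
script's actual logic.

    import json
    import sys

    # Assumed mapping from DOAJ fulltext link content types to sandcrawler
    # ingest types; the real doaj2ingestrequest.py may differ.
    CONTENT_TYPE_MAP = {"pdf": "pdf", "html": "html", "xml": "xml"}

    def article_to_requests(article):
        """One DOAJ article record -> zero or more ingest requests (hypothetical layout)."""
        bibjson = article.get("bibjson", {})
        requests = []
        for link in bibjson.get("link", []):
            if link.get("type") != "fulltext" or not link.get("url"):
                continue
            content_type = (link.get("content_type") or "").lower()
            requests.append({
                "base_url": link["url"],
                "ingest_type": CONTENT_TYPE_MAP.get(content_type, "pdf"),
                "link_source": "doaj",
                "link_source_id": article.get("id"),
                "ingest_request_source": "doaj",
            })
        return requests

    if __name__ == "__main__":
        for line in sys.stdin:
            if not line.strip():
                continue
            for req in article_to_requests(json.loads(line)):
                print(json.dumps(req))
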
Stats after this load:

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'doaj'
    GROUP BY ingest_request.ingest_type, status
    -- next time include ingest_type in sort
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |          status          |  count
    -------------+--------------------------+---------
     pdf         | success                  | 3165539
     pdf         |                          | 2078874
     html        |                          | 1547698
     html        | wrong-scope              | 1114332
     pdf         | no-pdf-link              |  517261
     html        | success                  |  388376
     html        | unknown-scope            |  242044
     pdf         | no-capture               |  179030
     pdf         | terminal-bad-status      |  174741
     html        | no-capture               |  155323
     pdf         | null-body                |  129267
     pdf         | redirect-loop            |  127136
     html        | html-resource-no-capture |  117275
     html        | null-body                |  100296
     pdf         | blocked-cookie           |   71093
     html        | redirect-loop            |    65519
     html        | terminal-bad-status      |   64856
     html        | blocked-cookie           |   64095
     html        | spn2-backoff             |   55173
     pdf         | link-loop                |   27440
     html        | wrong-mimetype           |   26016
     html        | wayback-content-error    |   20109
     xml         |                          |   13624
     pdf         | wrong-mimetype           |    8411
     xml         | success                  |    6899
     html        | petabox-error            |    6199
     html        | wayback-error            |    5269
     html        | spn2-cdx-lookup-failure  |    4635
     html        | spn2-recent-capture      |    4527
     xml         | null-body                |    2353
    (30 rows)

## Bulk Ingest

    COPY (
        SELECT row_to_json(t1.*)
        FROM (
            SELECT ingest_request.*, ingest_file_result as result
            FROM ingest_request
            LEFT JOIN ingest_file_result
                ON ingest_file_result.base_url = ingest_request.base_url
                AND ingest_file_result.ingest_type = ingest_request.ingest_type
            WHERE
                ingest_request.link_source = 'doaj'
                -- AND (ingest_request.ingest_type = 'pdf'
                --     OR ingest_request.ingest_type = 'xml')
                AND (
                    ingest_file_result.status IS NULL
                    OR ingest_file_result.status = 'no-capture'
                )
                AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
                AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
                AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
                AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
                AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
                AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
                AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
                AND ingest_request.base_url NOT LIKE '%://archive.org/%'
                AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
                AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
        ) t1
    ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
    # COPY 3962331

Transform:

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
    # 3.96M 0:01:47 [36.7k/s]
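
Roughly what that transform does (a sketch, not the actual
`ingestrequest_row2json.py`): each `row_to_json` line carries the full
`ingest_request` row plus the joined `result` column, which gets dropped
before re-emitting a plain ingest request; the `shuf` then randomizes order so
any single domain's URLs are spread across the bulk queue.

    import json
    import sys

    def row_to_request(row):
        """Strip the joined result column and emit the bare ingest request (field names assumed)."""
        row.pop("result", None)   # nested ingest_file_result from the LEFT JOIN
        return row

    if __name__ == "__main__":
        for line in sys.stdin:
            if line.strip():
                print(json.dumps(row_to_request(json.loads(line))))
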
Top domains:

    cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20

     789988 www.mdpi.com
     318142 www.frontiersin.org
     226316 link.springer.com
     204429 www.scielo.br
     201175 www.sciencedirect.com
      72852 ieeexplore.ieee.org
      68983 dx.doi.org
      33286 www.dovepress.com
      26020 elifesciences.org
      23838 www.cetjournal.it
      21102 mab-online.nl
      20242 www.revistas.usp.br
      16564 periodicos.uem.br
      15710 journals.openedition.org
      14514 dergipark.org.tr
      14072 apcz.umk.pl
      13924 ojs.minions.amsterdam
      13717 bmgn-lchr.nl
      13512 ojstest.minions.amsterdam
      10440 journals.asm.org

Bulk ingest:

    cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # Done
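
The same push, sketched in Python with confluent-kafka and applying the same
filters as the shell pipeline (skip `dx.doi.org` URLs and any line containing
a literal backslash). Broker and topic are copied from the kafkacat
invocation; the library choice is an assumption, not necessarily what
sandcrawler itself uses.

    import json
    from confluent_kafka import Producer

    producer = Producer({"bootstrap.servers": "wbgrp-svc350.us.archive.org"})
    topic = "sandcrawler-prod.ingest-file-requests-bulk"

    with open("/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.requests.json") as f:
        for line in f:
            # same exclusions as the rg filters above
            if "dx.doi.org" in line or "\\" in line:
                continue
            msg = json.dumps(json.loads(line))   # compact re-serialize, like `jq . -c`
            producer.produce(topic, value=msg.encode("utf-8"))
            producer.poll(0)   # serve delivery callbacks so the local queue drains

    producer.flush()
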
## Stats Again

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'doaj'
    GROUP BY ingest_request.ingest_type, status
    -- ORDER BY ingest_request.ingest_type, COUNT DESC
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |          status          |  count
    -------------+--------------------------+---------
     pdf         | success                  | 4704006
     html        | wrong-scope              | 1761227
     html        | success                  |  778165
     pdf         | no-pdf-link              |  759805
     html        | no-capture               |  382080
     html        | unknown-scope            |  313391
     html        | html-resource-no-capture |  292953
     pdf         | no-capture               |  290311
     pdf         | terminal-bad-status      |  271776
     pdf         | null-body                |  129267
     pdf         | blocked-cookie           |  108491
     html        | terminal-bad-status      |  103014
     html        | null-body                |  100296
     html        | blocked-cookie           |   88533
     pdf         |                          |   81517
     pdf         | skip-url-blocklist       |   76443
     html        | spn2-backoff             |   50615
     pdf         | link-loop                |   45516
     html        | wrong-mimetype           |   33525
     html        | wayback-content-error    |   25535
     pdf         | empty-blob               |   21431
     pdf         | redirect-loop            |   19795
     html        | petabox-error            |   18291
     html        | empty-blob               |   14391
     pdf         | wrong-mimetype           |   14084
     html        | redirect-loop            |   12856
     xml         | success                  |   10381
     xml         | no-capture               |   10008
     html        | skip-url-blocklist       |    3294
     html        | cdx-error                |    3275
    (30 rows)

Pretty good success rate for PDFs. That is a lot of `no-capture`! And why are
there 81k PDF requests with no attempt at all? Maybe a filter, or bogus URLs.

Over 1.5M new PDF successes over this crawl iteration period, nice.
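
To chase the ~81k no-attempt PDF requests, something like the following would
pull a random sample of those base URLs to eyeball whether they are
blocklisted hosts or just bogus URLs (a hypothetical psycopg2 snippet; the
connection string is a placeholder, not the real sandcrawler-db DSN):

    import psycopg2

    # Placeholder DSN; point this at the sandcrawler-db instance.
    conn = psycopg2.connect("dbname=sandcrawler")
    cur = conn.cursor()
    cur.execute("""
        SELECT ingest_request.base_url
        FROM ingest_request
        LEFT JOIN ingest_file_result
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.link_source = 'doaj'
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_file_result.status IS NULL
        ORDER BY random()
        LIMIT 50
    """)
    for (url,) in cur.fetchall():
        print(url)
    conn.close()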