blob: b749557b8d5d43c2e4282c6d9d318f9e32a593bd (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
|
## re-ingest some broken
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_file_result.ingest_type = 'pdf'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND ingest_file_result.hit = false
AND ingest_file_result.status like 'spn2-%'
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_file_result.ingest_type = 'pdf'
AND ingest_file_result.hit = false
AND ingest_file_result.status like 'cdx-error'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_file_result.ingest_type = 'pdf'
AND ingest_file_result.hit = false
AND ingest_file_result.status like 'cdx-error'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source != 'fatcat-changelog'
AND ingest_request.ingest_request_source != 'fatcat-ingest')
) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_file_result.ingest_type = 'pdf'
AND ingest_file_result.hit = false
AND ingest_file_result.status like 'wayback-error'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_file_result.ingest_type = 'pdf'
AND ingest_file_result.hit = false
AND ingest_file_result.status like 'gateway-timeout'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_file_result.ingest_type = 'pdf'
AND ingest_file_result.hit = false
AND ingest_file_result.status like 'petabox-error'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json';
Transform:
./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
Push to kafka (shuffled):
cat reingest_spn2-error_current.json reingest_cdx-error_current.json reingest_wayback-error_current.json reingest_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
cat reingest_gateway-timeout.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 0
cat reingest_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
Push to kafka (not shuffled):
cat reingest_spn2-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
cat reingest_cdx-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
cat reingest_cdx-error_bulk_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
cat reingest_wayback-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
cat reingest_gateway-timeout.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
cat reingest_petabox-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
## just recent fatcat-ingest
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_file_result.ingest_type = 'pdf'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
-- AND ingest_file_result.updated > NOW() - '24 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '7 day'::INTERVAL
AND ingest_file_result.hit = false
AND (ingest_file_result.status like 'spn2-%'
OR ingest_file_result.status like 'cdx-error'
OR ingest_file_result.status like 'gateway-timeout'
OR ingest_file_result.status like 'wayback-error'
)
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
AND ingest_request.ingest_request_source = 'fatcat-ingest'
) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json';
# note: shuf
./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
## specific domains
protocols.io:
SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
FROM ingest_file_result
LEFT JOIN ingest_request
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_request.base_url LIKE '%10.17504/protocols.io%'
GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
ORDER BY COUNT DESC
LIMIT 20;
biorxiv/medrxiv:
SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
FROM ingest_file_result
LEFT JOIN ingest_request
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.ingest_type = 'pdf'
AND ingest_request.base_url LIKE '%10.1101/20%'
GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
ORDER BY COUNT DESC
LIMIT 20;
|