blob: b442d690562839703577480e95e57a57b230c248 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
Want to ensure seedlists from Wanfang and CNKI are captured in wayback.
Wanfang URLs seem normal. Let's just submit them in a single queue via SPNv2.
They are heterogeneous after redirect.
CNKI URLs are trickier. The PDF URLs definitely can't be crawled directly... but the
info URLs probably can, and the crawl can then continue on to the PDF? At least some seem to capture OK.
Need scope and identifiers for ingest requests. Let's do:
cnki_covid19 / <ident>
wanfang_covid19 / <ident>
Source: scrape-covid19
## Commands
# in sandcrawler pipenv
# Turn the scraped CNKI metadata into ingest requests (piped to the script, which is given `-` as its input argument).
cat ~/code/covid19.fatcat.wiki/extra/scrape/cnki_metadata.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json
# Same for Wanfang; the glob feeds every wanfang*.2020-04-14.json file into one request file.
cat ~/code/covid19.fatcat.wiki/extra/scrape/wanfang*.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/wanfang_ingest_request.2020-04-14.json
# Shuffle, compact to one JSON object per line (jq -c), and produce (-P) to the
# ingest-file-requests topic on a fixed partition (-p).
# NOTE(review): the request files above are written under ~/code/covid19.fatcat.wiki/extra/scrape/,
# but are read from /tmp/ here -- presumably they were copied over first; confirm.
cat /tmp/wanfang_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4
cat /tmp/cnki_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 8
## Status
-- Count ingest results per status for the 'scrape-covid19' PDF ingest requests.
-- The WHERE clause filters on ingest_request columns, so result rows without a
-- matching request can never survive; the join is written as INNER JOIN to say
-- so explicitly (a LEFT JOIN here returns exactly the same rows).
SELECT
    ingest_request.ingest_type,
    ingest_file_result.status,
    COUNT(*)
FROM ingest_file_result
INNER JOIN ingest_request
    ON ingest_file_result.ingest_type = ingest_request.ingest_type
    AND ingest_file_result.base_url = ingest_request.base_url
WHERE
    ingest_request.ingest_type = 'pdf'
    AND ingest_request.ingest_request_source = 'scrape-covid19'
GROUP BY
    ingest_request.ingest_type,
    ingest_file_result.status
-- Largest buckets first; status breaks ties so repeated runs print in the same order.
ORDER BY
    COUNT(*) DESC,
    ingest_file_result.status;
2020-04-15:
ingest_type | status | count
-------------+-------------------------------------+-------
pdf | spn2-cdx-lookup-failure | 1588
pdf | success | 671
pdf | gateway-timeout | 507
pdf | no-pdf-link | 181
pdf | wayback-error | 30
pdf | spn2-error:job-failed | 20
pdf | spn2-error | 7
pdf | spn2-error:soft-time-limit-exceeded | 3
pdf | spn2-error:pending | 2
(9 rows)
## Re-Try
-- Dump, one JSON row per line, the 'scrape-covid19' PDF ingest requests whose
-- latest result was a failure worth retrying (hit = false, excluding the
-- 'no-pdf-link' and 'link-loop' statuses, which are filtered out as not retryable).
-- The filters on ingest_file_result discard unmatched requests, so this is an
-- INNER JOIN; the ingest_type match lives in the ON clause, mirroring the
-- status query's join keys.
COPY (
    SELECT row_to_json(ingest_request.*)
    FROM ingest_request
    INNER JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'scrape-covid19'
        AND ingest_file_result.hit = false
        AND ingest_file_result.status NOT IN ('no-pdf-link', 'link-loop')
) TO '/grande/snapshots/reingest_covid19.rows.json';
# Post-process the rows dumped by COPY with ingestrequest_row2json.py
# (presumably converting them back into ingest request JSON; verify against the script) and shuffle.
./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_covid19.rows.json | shuf > reingest_covid19.json
# Produce (-P) the retry requests to the ingest-file-requests topic on partition 9 (-p).
# The file was already shuffled when it was written, so this second shuf is redundant but harmless.
cat reingest_covid19.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 9
|