author    Bryan Newbold <bnewbold@archive.org>    2020-04-15 12:42:42 -0700
committer Bryan Newbold <bnewbold@archive.org>    2020-04-15 13:39:28 -0700
commit    ed27bf6fb7afda158812c0822498bc7408572b8e (patch)
tree      a7a98ac1e9546b0360e4fdf58dfa4f0e4e626745 /notes/ingest
parent    d11879e1c75a8fb1882dbb23533a458619185a9a (diff)
COVID-19 chinese paper ingest
Diffstat (limited to 'notes/ingest')
-rw-r--r--    notes/ingest/2020-04-13_covid19.md    73
1 file changed, 73 insertions(+), 0 deletions(-)
diff --git a/notes/ingest/2020-04-13_covid19.md b/notes/ingest/2020-04-13_covid19.md
new file mode 100644
index 0000000..b442d69
--- /dev/null
+++ b/notes/ingest/2020-04-13_covid19.md
@@ -0,0 +1,73 @@
+
+Want to ensure seedlists from Wanfang and CNKI are captured in wayback.
+
+Wanfang URLs seem normal. Let's just submit them all in a single queue via
+SPNv2. They are heterogeneous after redirects.
+
+CNKI URLs are trickier. The PDF URLs definitely can't be crawled directly...
+but the "info" pages probably can, and then we can crawl on to the PDF? At
+least some seem to capture OK.
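+
+To spot-check a single CNKI info page (including whether outlink capture
+follows through to the PDF), a manual SPNv2 request along these lines should
+work (SPNv2 API credentials in $IA_ACCESS/$IA_SECRET assumed; the URL is a
+placeholder):
+
+    curl -s -X POST "https://web.archive.org/save" \
+        -H "Accept: application/json" \
+        -H "Authorization: LOW $IA_ACCESS:$IA_SECRET" \
+        --data-urlencode "url=<cnki-info-url>" \
+        --data-urlencode "capture_outlinks=1"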
+
+Need scope and identifiers for ingest requests. Let's do:
+
+ cnki_covid19 / <ident>
+ wanfang_covid19 / <ident>
+
+Source: scrape-covid19
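+
+For reference, a single request in this scheme would look roughly like the
+following (field names per the usual sandcrawler ingest request schema; the
+identifier and URL values are made up):
+
+    {
+        "ingest_type": "pdf",
+        "base_url": "http://www.wanfangdata.com.cn/details/detail.do?id=example123",
+        "link_source": "wanfang_covid19",
+        "link_source_id": "example123",
+        "ingest_request_source": "scrape-covid19"
+    }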
+
+## Commands
+
+ # in sandcrawler pipenv
+ cat ~/code/covid19.fatcat.wiki/extra/scrape/cnki_metadata.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json
+ cat ~/code/covid19.fatcat.wiki/extra/scrape/wanfang*.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/wanfang_ingest_request.2020-04-14.json
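+
+A quick sanity check on the generated files is cheap before pushing anything
+(a sketch; line counts should roughly match the scrape output, and each record
+should parse as JSON):
+
+    wc -l ~/code/covid19.fatcat.wiki/extra/scrape/*_ingest_request.2020-04-14.json
+    head -n1 ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json | jq .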
+
+
+ cat /tmp/wanfang_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4
+ cat /tmp/cnki_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 8
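+
+To confirm the requests actually landed on the topic, consume the last few
+messages back off one of the partitions (standard kafkacat consumer flags;
+adjust partition as needed):
+
+    kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4 -o -5 -e | jq .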
+
+## Status
+
+ SELECT ingest_request.ingest_type,
+ ingest_file_result.status,
+ COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'scrape-covid19'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.status
+ ORDER BY COUNT(*) DESC;
+
+2020-04-15:
+
+ ingest_type | status | count
+ -------------+-------------------------------------+-------
+ pdf | spn2-cdx-lookup-failure | 1588
+ pdf | success | 671
+ pdf | gateway-timeout | 507
+ pdf | no-pdf-link | 181
+ pdf | wayback-error | 30
+ pdf | spn2-error:job-failed | 20
+ pdf | spn2-error | 7
+ pdf | spn2-error:soft-time-limit-exceeded | 3
+ pdf | spn2-error:pending | 2
+ (9 rows)
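+
+To dig into a specific failure mode (e.g. the no-pdf-link rows above), the
+same join can pull sample URLs; a sketch, with the terminal_url column name
+assumed:
+
+    SELECT ingest_file_result.base_url, ingest_file_result.terminal_url
+    FROM ingest_file_result
+    LEFT JOIN ingest_request
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE ingest_request.ingest_request_source = 'scrape-covid19'
+        AND ingest_file_result.status = 'no-pdf-link'
+    LIMIT 10;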
+
+## Re-Try
+
+Most of the failures above (spn2-cdx-lookup-failure, gateway-timeout, the
+spn2-error variants) look transient, so re-queue everything that wasn't a hit
+except the permanent-looking no-pdf-link and link-loop statuses:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'scrape-covid19'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status != 'no-pdf-link'
+ AND ingest_file_result.status != 'link-loop'
+ ) TO '/grande/snapshots/reingest_covid19.rows.json';
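+
+Note that server-side COPY ... TO needs write access on the database host; a
+client-side equivalent (same query as above; database name assumed) would be
+psql's \copy:
+
+    psql sandcrawler -c "\copy (SELECT ...) to 'reingest_covid19.rows.json'"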
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_covid19.rows.json | shuf > reingest_covid19.json
+
+ cat reingest_covid19.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 9
+