From 2469d483326e5e81e774e46fd100888710b9bbc3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 11 Oct 2020 21:43:23 -0700 Subject: start 2020-10 ingest notes --- notes/ingest/2020-10_unpaywall.md | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 notes/ingest/2020-10_unpaywall.md (limited to 'notes') diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md new file mode 100644 index 0000000..d474176 --- /dev/null +++ b/notes/ingest/2020-10_unpaywall.md @@ -0,0 +1,42 @@ + +New snapshot released 2020-10-09. Want to do a mostly straight-forward +load/ingest/crawl. + +Proposed changes this time around: + +- have bulk ingest store missing URLs in a new sandcrawler-db for `no-capture` + status, and to include those URLs in heritrix3 crawl +- tweak heritrix3 config for additional PDF URL extraction patterns, + particularly to improve OJS yield + + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/unpaywall/unpaywall_snapshot_2020-10-09T153852.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json + => 28.3M 3:19:03 [2.37k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => 28.3M 1:11:29 [ 6.6k/s] + => Worker: Counter({'total': 28298500, 'insert-requests': 4119939, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 28298500, 'pushed': 28298500}) + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2020-10-09' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json'; + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json + + -- cgit v1.2.3