From 5dd8785d710cf7d067afdc691069bfa74406e06a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 May 2020 14:47:17 -0700 Subject: ingests: normalize file names; commit updates --- notes/ingest/2020-04-07_unpaywall.md | 63 ------------------------------------ 1 file changed, 63 deletions(-) delete mode 100644 notes/ingest/2020-04-07_unpaywall.md (limited to 'notes/ingest/2020-04-07_unpaywall.md') diff --git a/notes/ingest/2020-04-07_unpaywall.md b/notes/ingest/2020-04-07_unpaywall.md deleted file mode 100644 index e30d482..0000000 --- a/notes/ingest/2020-04-07_unpaywall.md +++ /dev/null @@ -1,63 +0,0 @@ - -A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but -not released for more than a month). - -Primary goal is: - -- generate ingest requests for only *new* URLs -- bulk ingest these new URLs -- crawl any no-capture URLs from that batch -- re-bulk-ingest the no-capture batch -- analytics on failed ingests. eg, any particular domains that are failing to crawl - -This ingest pipeline was started on 2020-04-07 by bnewbold. - -## Transform and Load - - # in sandcrawler pipenv on aitio - zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json - => 24.7M 5:17:03 [ 1.3k/s] - - cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request - - => 24.7M - => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0}) - -## Dump new URLs and Bulk Ingest - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND date(ingest_request.created) > '2020-04-01' - AND ingest_file_result.status IS NULL - ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json'; - => 3696189 - - cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Dump no-capture - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND date(ingest_request.created) > '2020-04-01' - AND ingest_file_result.status = 'no-capture' - AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' - AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' - AND ingest_request.base_url NOT LIKE '%ahajournals.org%' - AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' - AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' - AND ingest_request.base_url NOT LIKE '%academic.oup.com%' - AND ingest_request.base_url NOT LIKE '%tandfonline.com%' - ) TO '/grande/snapshots/unpaywall_nocapture_2020-04-XX.rows.json'; -- cgit v1.2.3