From 4ee8e9364a99d02e22f295bdcf80aafce1ffc03f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 4 Jan 2021 18:29:38 -0800 Subject: late-2020 OA DOI crawl ingest notes --- notes/ingest/2020-09_oa_doi.md | 49 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) (limited to 'notes') diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md index 2b4cb57..f5c853d 100644 --- a/notes/ingest/2020-09_oa_doi.md +++ b/notes/ingest/2020-09_oa_doi.md @@ -196,8 +196,6 @@ Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 -TODO below here... - Resuming progress on this in early December 2020. Filtered requests to re-crawl: @@ -306,4 +304,49 @@ TODO: infer `publisher_type` and platform from DOI prefix in more cases ## Re-Ingest -TODO after crawl completion +Crawl has completed. Starting this bulk ingest on 2020-12-31; roughly 2.3 +million requests. Note these are all `pdf` requests, but crawl was done in an +HTML-friendly way, so should be able to do domain/journal-specific HTML ingests +in the future. + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Stats, for this ingest period (fuzzy; will have some daily ingest stuff): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-12-28' + AND ingest_request.created <= '2020-12-09' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -----------------------+-------- + no-pdf-link | 962714 + success | 539305 + no-capture | 306590 + redirect-loop | 192149 + link-loop | 184797 + terminal-bad-status | 141721 + wrong-mimetype | 10362 + null-body | 10277 + skip-url-blocklist | 1985 + wayback-content-error | 1300 + cdx-error | 869 + petabox-error | 160 + bad-redirect | 72 + wayback-error | 46 + bad-gzip-encoding | 7 + timeout | 1 + max-hops-exceeded | 1 + (17 rows) + -- cgit v1.2.3