From ffd6cd86bb8a4756d123decaa5f2ef03428f208f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 2 Sep 2021 16:31:23 -0700 Subject: MAG post-crawl stats (5m+ new PDFs crawled successfully) --- notes/ingest/2021-08_mag.md | 124 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) (limited to 'notes') diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md index 5bab4f0..5f92196 100644 --- a/notes/ingest/2021-08_mag.md +++ b/notes/ingest/2021-08_mag.md @@ -274,3 +274,127 @@ And actually dump seedlist(s): 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + +## Post-Crawl Bulk Re-Ingest + +Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by +hash, URL agnostic). + +Enqueue for bulk re-ingest: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => Thu 19 Aug 2021 09:10:59 PM UTC + + +## Post-Ingest Stats + +Just the new stuff (compare against above for delta): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 7748241 89.2% + no-capture | 429688 4.9% + redirect-loop | 172831 2.0% + terminal-bad-status | 94029 1.1% + no-pdf-link | 86437 1.0% + blocked-cookie | 67903 0.8% + link-loop | 50622 + wrong-mimetype | 21064 + null-body | 6650 + cdx-error | 3313 + wayback-error | 2630 + gateway-timeout | 399 + petabox-error | 268 + wayback-content-error | 170 + not-found | 130 + read-timeout | 128 + | 109 + invalid-host-resolution | 63 + bad-redirect | 39 + spn2-error | 20 + (20 rows) + +New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397 + +Overall success of new batch: 
7748241. / 8686315 = 89.2% + +And combined (old and new) status again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 31990062 + redirect-loop | 1704717 + no-capture | 1263462 + link-loop | 1218280 + blocked-cookie | 1213838 + no-pdf-link | 1096664 + terminal-bad-status | 960070 + gateway-timeout | 202190 + wrong-mimetype | 86557 + invalid-host-resolution | 37262 + null-body | 15443 + wayback-error | 12839 + cdx-error | 4047 + spn2-error | 1731 + spn2-error:job-failed | 962 + petabox-error | 463 + wayback-content-error | 379 + spn2-error:invalid-url-syntax | 336 + spn2-error:soft-time-limit-exceeded | 203 + | 175 + (20 rows) + +New success total: 31990062 - 26123975 = 5,866,087 + +A full 1,263,462 no-capture that could be attempted... though many of those may +be excluded for a specific reason. -- cgit v1.2.3