diff options
Diffstat (limited to 'notes/ingest/2021-09-03_patch_crawl.md')
-rw-r--r-- | notes/ingest/2021-09-03_patch_crawl.md | 196 |
1 file changed, 193 insertions, 3 deletions
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md index f63e524..d36f427 100644 --- a/notes/ingest/2021-09-03_patch_crawl.md +++ b/notes/ingest/2021-09-03_patch_crawl.md @@ -482,7 +482,197 @@ Note that this is just seedlists, not full ingest requests. Then run the actual patch crawl! -## Ingest Requests for Bulk Retry +## Ingest Requests for Bulk Retry (2022-01-06) + +Crawl has just about completed, so running another round of bulk ingest +requests, slightly updated to allow `https://doi.org/10*` in terminal URL: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.updated <= '2022-01-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + ) + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 
'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND 
ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json'; + => 4,488,193 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json + => DONE + + cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => TIMEDOUT + => (probably due to re-assignment) + => DONE + +## Stats Again (just OAI-PMH) + +OAI-PMH query: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + +On 2022-02-08: + + status | count + -----------------------+---------- + success | 13505143 + no-pdf-link | 8741007 + no-capture | 4429986 + redirect-loop | 1566611 + terminal-bad-status | 816162 + link-loop | 459006 + wrong-mimetype | 448983 + null-body | 71871 + cdx-error | 19055 + | 15275 + petabox-error | 11713 + blocked-cookie | 11664 + wayback-error | 8745 + skip-url-blocklist | 7828 + max-hops-exceeded | 2031 + wayback-content-error | 338 + body-too-large | 280 + spn2-error:job-failed | 191 + bad-redirect | 134 + redirects-exceeded | 120 + (20 rows) + + +On 2022-02-28, after bulk ingest completed: + + status | count + -----------------------+---------- + success | 14668123 + no-pdf-link | 8822460 + no-capture | 2987565 + redirect-loop | 1629015 + terminal-bad-status | 917851 + wrong-mimetype | 466512 + link-loop | 460941 + null-body | 71457 + cdx-error | 19636 + petabox-error | 16198 + | 15275 + 
blocked-cookie | 11885 + wayback-error | 8779 + skip-url-blocklist | 7838 + empty-blob | 5906 + max-hops-exceeded | 5563 + wayback-content-error | 355 + body-too-large | 329 + spn2-error:job-failed | 191 + bad-redirect | 137 + (20 rows) + + +Comparing to a couple months ago: + + 14668123-13258356 = +1,409,767 success + 8822460-8685519 = + 136,941 no-pdf-link + 2987565-4765663 = -1,778,098 no-capture + 917851-803373 = + 114,478 terminal-bad-status -TODO: for each of the link sources mentioned at top, do a separate query by -source to re-ingest. |