aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2021-09-03_patch_crawl.md
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest/2021-09-03_patch_crawl.md')
-rw-r--r--notes/ingest/2021-09-03_patch_crawl.md92
1 files changed, 89 insertions, 3 deletions
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md
index f63e524..ad69311 100644
--- a/notes/ingest/2021-09-03_patch_crawl.md
+++ b/notes/ingest/2021-09-03_patch_crawl.md
@@ -482,7 +482,93 @@ Note that this is just seedlists, not full ingest requests.
Then run the actual patch crawl!
-## Ingest Requests for Bulk Retry
+## Ingest Requests for Bulk Retry (2022-01-06)
-TODO: for each of the link sources mentioned at top, do a separate query by
-source to re-ingest.
+Crawl has just about completed, so running another round of bulk ingest
+requests, slightly updated to allow `https://doi.org/10*` in terminal URL:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.updated <= '2022-01-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json';
+ => 4,488,193
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Stats Again
+
+Re-run the "progress check" stuff from above