From 4aa53c730f5fe9e0a5249d10d0b6ac0ff3c0db7c Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 2 Mar 2022 17:09:44 -0800
Subject: 2022 patch crawl bulk ingest notes

---
 notes/ingest/2022-01-06_patch_crawl.md | 106 +++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

(limited to 'notes/ingest')

diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md
index bc1d4d5..941519f 100644
--- a/notes/ingest/2022-01-06_patch_crawl.md
+++ b/notes/ingest/2022-01-06_patch_crawl.md
@@ -290,3 +290,109 @@ TODO: cleanup ingest request table in sandcrawler-db:
     => Done
 
 Copied to crawler svc206 and added to frontier.
+
+
+## Bulk Ingest Requests (2022-02-28)
+
+Note that we are skipping OAI-PMH here, because we just did a separate ingest
+for those.
+
+This is going to dump many duplicate lines (same `base_url`, multiple
+requests), but that is fine. Expected roughly 7 million rows; actual: 3,053,219.
+
+    COPY (  -- export the selected ingest_request rows, one JSON object per row, to the file named at the end
+        -- alternate projection (disabled): SELECT ingest_file_result.terminal_url
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result  -- NOTE(review): the WHERE filters on ingest_file_result reject unmatched rows, so this acts as an inner join
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            (
+                ingest_request.ingest_type = 'pdf'
+                OR ingest_request.ingest_type = 'html'
+            )
+            AND ingest_file_result.updated <= '2022-02-08'  -- NOTE(review): if `updated` is a timestamp, the cutoff is 2022-02-08 00:00 -- confirm
+            AND (  -- result statuses selected for re-ingest
+                ingest_file_result.status = 'no-capture'
+                OR ingest_file_result.status = 'cdx-error'
+                OR ingest_file_result.status = 'wayback-error'
+                OR ingest_file_result.status = 'wayback-content-error'
+                OR ingest_file_result.status = 'petabox-error'
+                OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+                OR ingest_file_result.status = 'gateway-timeout'
+                OR (
+                    ingest_file_result.status = 'terminal-bad-status'
+                    AND (  -- only these terminal status codes (presumably HTTP status codes)
+                        ingest_file_result.terminal_status_code = 429
+                        OR ingest_file_result.terminal_status_code = 500
+                        OR ingest_file_result.terminal_status_code = 502
+                        OR ingest_file_result.terminal_status_code = 503
+                    )
+                )
+            )
+            AND (
+                -- ingest_request.link_source = 'oai'  (disabled: OAI-PMH was ingested separately)
+                ingest_request.link_source = 'doi'
+                OR ingest_request.link_source = 'arxiv'
+                OR ingest_request.link_source = 'doaj'
+                OR ingest_request.link_source = 'unpaywall'
+                OR ingest_request.link_source = 'pmc'
+            )
+            -- NOTE(review): link_source 'oai' is not selected above, so these 'oai:' prefix filters are presumably no-ops here
+            AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+            -- NOTE(review): NOT LIKE is NULL when terminal_url is NULL, so results without a terminal_url are excluded too
+            AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+            AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+            AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+            -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+            -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+            -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+            AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+            AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+            AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+            AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+            -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+            AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+            -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+            -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+    ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json';  -- NOTE(review): COPY ... TO a file path writes on the database server (PostgreSQL) -- confirm
+    # COPY 3053219
+
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json  # pv -l reports progress as a line count; shuf randomizes line order before writing
+    => DONE
+
+    cat /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1  # rg -v drops every line containing a backslash; jq -c re-emits each record as compact JSON
+    => DONE
+
-- 
cgit v1.2.3