more patch crawling

author: Bryan Newbold <bnewbold@archive.org> 2022-02-08 17:49:39 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2022-02-08 17:49:50 -0800
commit: 3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd (patch)
tree: 077afcd3c48553dbc65760db047b2e81ba080a73 /notes
parent: 067c97a59a4a8728add7b9e561082a5403be52e5 (diff)
download: sandcrawler-3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd.tar.gz
sandcrawler-3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd.zip
2 files changed, 209 insertions, 9 deletions
diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md
index ffd6669..bc1d4d5 100644
--- a/notes/ingest/2022-01-06_patch_crawl.md
+++ b/notes/ingest/2022-01-06_patch_crawl.md
@@ -21,7 +21,7 @@ TODO: html-resource-no-capture (from error message? or do SPN requests separatel
 
 Dump terminal URLs (will do ingest requests later, using similar command):
 
-    COPY (  
+    COPY (
         SELECT ingest_file_result.terminal_url
         -- SELECT row_to_json(ingest_request.*)
         FROM ingest_request
@@ -154,3 +154,139 @@ TODO: filter out archive.org/www.archive.org
 TODO: cleanup ingest request table in sandcrawler-db:
 - remove filtered OAI-PMH prefixes
 - remove any invalid `base_url` (?)
+
+## More Seedlist (2022-02-08)
+
+    COPY (
+        SELECT ingest_file_result.terminal_url
+        -- SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            (
+                ingest_request.ingest_type = 'pdf'
+                OR ingest_request.ingest_type = 'html'
+            )
+            AND ingest_file_result.updated >= '2022-01-12'
+            AND (
+                ingest_file_result.status = 'no-capture'
+                OR ingest_file_result.status = 'cdx-error'
+                OR ingest_file_result.status = 'wayback-error'
+                OR ingest_file_result.status = 'wayback-content-error'
+                OR ingest_file_result.status = 'petabox-error'
+                OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+                OR ingest_file_result.status = 'gateway-timeout'
+                OR (
+                    ingest_file_result.status = 'terminal-bad-status'
+                    AND (
+                        ingest_file_result.terminal_status_code = 429
+                        OR ingest_file_result.terminal_status_code = 500
+                        OR ingest_file_result.terminal_status_code = 502
+                        OR ingest_file_result.terminal_status_code = 503
+                    )
+                )
+            )
+            AND (
+                ingest_request.link_source = 'oai'
+                OR ingest_request.link_source = 'doi'
+                OR ingest_request.link_source = 'arxiv'
+                OR ingest_request.link_source = 'doaj'
+                OR ingest_request.link_source = 'unpaywall'
+                OR ingest_request.link_source = 'pmc'
+            )
+
+            AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+            AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+            AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+            AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+            AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+            -- AND ingest_file_result.terminal_url NOT LIKE '%europepmc.org%'
+            -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+            -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+            AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+            AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+            AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+            AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+            -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+            AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+            AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+            -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+            -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+            AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+    -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json';
+    ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt';
+    => COPY 444764
+
+    cat patch_terminal_url.2022-02-08.txt \
+        | rg -v www.archive.org \
+        | rg '://' \
+        | rg -v '://10\.' \
+        | rg -v '://172\.' \
+        | rg -i '^http' \
+        | sort -u -S 4G \
+        | pv -l \
+        > patch_terminal_url.2022-02-08.uniq.txt
+    => 426k 0:00:04 [ 103k/s]
+
+    cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25
+      60123 www.degruyter.com
+      59314 arxiv.org
+      43674 zenodo.org
+      17771 doi.org
+       9501 linkinghub.elsevier.com
+       9379 www.mdpi.com
+       5691 opendata.uni-halle.de
+       5578 scholarlypublishingcollective.org
+       5451 era.library.ualberta.ca
+       4982 www.cairn.info
+       4306 www.taylorfrancis.com
+       4189 papers.ssrn.com
+       4157 apps.crossref.org
+       4089 www.sciencedirect.com
+       4033 mdpi-res.com
+       3763 dlc.mpg.de
+       3408 osf.io
+       2603 www.frontiersin.org
+       2594 watermark.silverchair.com
+       2569 journals.lww.com
+       1787 underline.io
+       1680 archiviostorico.fondazione1563.it
+       1658 www.jstage.jst.go.jp
+       1611 cyberleninka.ru
+       1535 www.schoeningh.de
+
+    cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule
+    => Done
+
+Copied to crawler svc206 and added to frontier.
diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md
index 6f3b2c8..09a3b46 100644
--- a/notes/ingest/2022-01-13_doi_crawl.md
+++ b/notes/ingest/2022-01-13_doi_crawl.md
@@ -1,6 +1,8 @@
 
 Could roll this in to current patch crawl instead of starting a new crawl from scratch.
 
+This file is misnamed; these are mostly non-DOI-specific small updates.
+
 ## KBART "almost complete" experimentation
 
 Random 10 releases:
@@ -133,15 +135,12 @@ many of these are likely to crawl successfully.
         | pv -l \
         | gzip \
         > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz
-    # Expecting 8255693 release objects in search queries
-
-## Seeds: not daily, but OA DOI
+    # re-running 2022-02-08 after this VM was upgraded
+    # Expecting 8321448 release objects in search queries
+    # TODO: in-progress
 
-There are a bunch of things we are no longer attempting daily, but should do
-heritrix crawls of periodically.
-
-TODO: maybe in daily crawling, should check container coverage and see if most URLs are bright, and if so do ingest? hrm
-TODO: What are they? zenodo.org?
+This is large enough that it will probably be a bulk ingest, and then probably
+a follow-up crawl.
 
 ## Seeds: HTML and XML links from HTML biblio
 
@@ -152,6 +151,71 @@ TODO: What are they? zenodo.org?
         | gzip \
         > ingest_file_result_fulltext_urls.2022-01-13.json.gz
 
+    # was this dump cut off at some point? the gzip stream ends unexpectedly (truncated)
+
+    zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz | wc -l
+    # gzip: ingest_file_result_fulltext_urls.2022-01-13.json.gz: unexpected end of file
+    # 2,538,433
+
+Prepare seedlists (to include in heritrix patch crawl):
+
+    zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+        | jq .html_biblio.xml_fulltext_url -r \
+        | rg '://' \
+        | sort -u -S 4G \
+        | pv -l \
+        | gzip \
+        > ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz
+    # 1.24M 0:01:35 [12.9k/s]
+
+    zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+        | jq .html_biblio.html_fulltext_url -r \
+        | rg '://' \
+        | sort -u -S 4G \
+        | pv -l \
+        | gzip \
+        > ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz
+    # 549k 0:01:27 [6.31k/s]
+
+    zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+        | cut -f3 -d/ \
+        | sort -S 4G \
+        | uniq -c \
+        | sort -nr \
+        | head -n20
+
+     534005 dlc.library.columbia.edu
+     355319 www.degruyter.com
+     196421 zenodo.org
+     101450 serval.unil.ch
+     100631 biblio.ugent.be
+      47986 digi.ub.uni-heidelberg.de
+      39187 www.emerald.com
+      33195 www.cairn.info
+      25703 boris.unibe.ch
+      19516 journals.openedition.org
+      15911 academic.oup.com
+      11091 repository.dl.itc.u-tokyo.ac.jp
+       9847 oxfordworldsclassics.com
+       9698 www.thieme-connect.de
+       9552 www.idunn.no
+       9265 www.zora.uzh.ch
+       8030 www.scielo.br
+       6543 www.hanspub.org
+       6229 asmedigitalcollection.asme.org
+       5651 brill.com
+
+    zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+        | awk '{print "F+ " $1}' \
+        > ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+    wc -l ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+    1785901 ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+Added to `JOURNALS-PATCH-CRAWL-2022-01`
+
 ## Seeds: most doi.org terminal non-success
 
 Unless it is a 404, should retry.
+
+TODO: generate this list
author	Bryan Newbold <bnewbold@archive.org>	2022-02-08 17:49:39 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2022-02-08 17:49:50 -0800
commit	3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd (patch)
tree	077afcd3c48553dbc65760db047b2e81ba080a73 /notes
parent	067c97a59a4a8728add7b9e561082a5403be52e5 (diff)
download	sandcrawler-3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd.tar.gz sandcrawler-3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd.zip