aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2022-04-20 16:32:43 -0700
committerBryan Newbold <bnewbold@archive.org>2022-04-20 16:32:43 -0700
commit47a064c2cc10874aed3a3de7160c92d51039a2a8 (patch)
treecd4098b6e4746027a3fa8476ec9c43819ec396e4
parentb64f740e36c2a2bd78cba913c1a3aa9bd807d2d7 (diff)
downloadsandcrawler-47a064c2cc10874aed3a3de7160c92d51039a2a8.tar.gz
sandcrawler-47a064c2cc10874aed3a3de7160c92d51039a2a8.zip
start notes on unpaywall and targeted/patch crawls
-rw-r--r--notes/ingest/2022-04_targeted.md142
-rw-r--r--notes/ingest/2022-04_unpaywall.md135
2 files changed, 277 insertions, 0 deletions
diff --git a/notes/ingest/2022-04_targeted.md b/notes/ingest/2022-04_targeted.md
new file mode 100644
index 0000000..89fe40a
--- /dev/null
+++ b/notes/ingest/2022-04_targeted.md
@@ -0,0 +1,142 @@
+
+Want to do a crawl similar to recent "patch" crawls, where we run heritrix
+crawls to "fill in" missing (`no-capture`) and failed daily ingests (aka,
+those requests coming from fatcat-changelog).
+
+ export PATCHDATE=2022-04-20
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-04
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-04-20.rows.json';
+ # COPY 4842749
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v www.archive.org \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ # 4.75M 0:01:44 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 1515829 www.jstage.jst.go.jp
+ 1052953 doi.org
+ 241704 arxiv.org
+ 219543 www.sciencedirect.com
+ 178562 www.persee.fr
+ 84947 zenodo.org
+ 67397 www.mdpi.com
+ 65775 journals.lww.com
+ 58216 opg.optica.org
+ 50673 osf.io
+ 45776 www.degruyter.com
+ 36664 www.indianjournals.com
+ 35287 pubs.rsc.org
+ 33495 www.bmj.com
+ 33320 www.research-collection.ethz.ch
+ 29728 www.e-periodica.ch
+ 28338 iopscience.iop.org
+ 26364 www.cambridge.org
+ 23840 onlinelibrary.wiley.com
+ 23641 platform.almanhal.com
+ 22660 brill.com
+ 20288 www.osapublishing.org
+ 18561 cgscholar.com
+ 18539 doi.nrct.go.th
+ 15677 www.frontiersin.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+TODO: starting with the "quarterly retry" script/query might make more sense?
+TODO: are there any cases where we do a bulk ingest request, fail, and `terminal_url` is not set?
+
+## Bulk Ingest Requests (post-crawl)
+
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md
new file mode 100644
index 0000000..600b2d6
--- /dev/null
+++ b/notes/ingest/2022-04_unpaywall.md
@@ -0,0 +1,135 @@
+
+New unpaywall snapshot from `2022-03-09`.
+
+This will probably be the last unpaywall crawl? Will switch to openalex in the
+future, because we can automate that ingest process, and run it on our own
+schedule.
+
+## Download and Archive
+
+ wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz'
+ # 2022-04-09 22:31:43 (98.9 KB/s) - ‘unpaywall_snapshot_2022-03-09T083001.jsonl.gz’ saved [29470830470/29470830470]
+
+ export SNAPSHOT=2022-03-09
+ ia upload unpaywall_snapshot_$SNAPSHOT unpaywall_snapshot_$SNAPSHOT*.jsonl.gz -m title:"Unpaywall Metadata Snapshot ($SNAPSHOT)" -m collection:ia_biblio_metadata -m creator:creator -m date:$SNAPSHOT
+
+ # if needed
+ scp unpaywall_snapshot_$SNAPSHOT*.jsonl.gz wbgrp-svc506.us.archive.org:/srv/sandcrawler/tasks
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv shell
+
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT*.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json
+ # 34.9M 3:02:32 [3.19k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ # 34.9M 5:23:15 [1.80k/s]
+ # Worker: Counter({'total': 34908779, 'insert-requests': 6129630, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 34908779, 'pushed': 34908779})
+
+So about 6.1M new ingest request rows.
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- take "all time" instead of just this recent capture
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2022-03-09.rows.json';
+ => COPY 6025671
+
+ # transform
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json
+ # 6.03M 0:03:26 [29.1k/s]
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+## Dump Seedlist
+
+Dump rows for crawling:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ -- AND date(ingest_request.created) > '2022-04-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json';
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.*.txt
+
+Then run crawl (see `journal-crawls` git repo), including frontier generation.