diff options
Diffstat (limited to 'notes')
27 files changed, 4882 insertions, 5 deletions
diff --git a/notes/dryad_datasets.md b/notes/dryad_datasets.md new file mode 100644 index 0000000..5c727b1 --- /dev/null +++ b/notes/dryad_datasets.md @@ -0,0 +1,17 @@ + +api docs: https://datadryad.org/api/v2/docs + +current search queries return 38,000 hits (December 2020) + +example with multiple versions: + https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0 + https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0 + https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions + + +how to handle versions? DOI doesn't get incremented. + +on archive.org, could have separate item for each version, or sub-directories within item, one for each version + +in fatcat, could have a release for each version, but only one with +the DOI; or could have a separate fileset for each version diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md new file mode 100644 index 0000000..5223651 --- /dev/null +++ b/notes/examples/2021-11-12_broken_grobid_xml.md @@ -0,0 +1,83 @@ + +Find all the PDFs from web which resulted in `bad-grobid-xml` status code (among others): + + sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100; + + sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata + ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------ + d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"} + 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"} + 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | 
bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf + FIXED + f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"} + https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf + FIXED + c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"} + https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf + FIXED (and good) + 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"} + https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf + FIXED + metadata quality mixed, but complex document (?) 
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"} + https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23 + FIXED + 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"} + https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf + FIXED + 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"} + not found + acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"} + https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf + BROKEN: not well-formed (invalid token): line 60, column 45 + <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note> + 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"} + not found + c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"} + https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308 + BROKEN: not well-formed (invalid token): line 58, column 45 + <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, & Bian, 2020).</note> + 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): 
line 1824, column 45"} + not found + 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"} + not found + f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"} + https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649 + FIXED, good + f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf + FIXED + 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"} + https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf + FIXED + 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1 + FIXED + (21 rows) + +Some other errors from other queries: + + d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"} + https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf + FIXED: with 0.7.0+ + + 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"} + 
https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf + still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500 + BAD PDF ("no pages" in evince) + + d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"} + https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf + FIXED + + 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t + https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf + FIXED + +In summary, there are still a small number of `bad-grobid-xml` cases, and still +many "very large PDF" cases. But we should probably broadly retry everything, +especially the 503 errors (from when GROBID is simply down/unavailable). + +The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations, +which I have submitted a patch/PR for. 
diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt new file mode 100644 index 0000000..3a04750 --- /dev/null +++ b/notes/examples/dataset_examples.txt @@ -0,0 +1,52 @@ + +### ArchiveOrg: CAT dataset + +<https://archive.org/details/CAT_DATASET> + +`release_36vy7s5gtba67fmyxlmijpsaui` + +### + +<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635> + +doi:10.1371/journal.pone.0120448 + +Single .rar file + +### Dataverse + +<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B> + +Single excel file + +### Dataverse + +<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1> + +doi:10.7910/DVN/CLSFKX + +Multiple files; multiple versions? + +API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1> + + .data.id + .data.latestVersion.datasetPersistentId + .data.latestVersion.versionNumber, .versionMinorNumber + .data.latestVersion.files[] + .dataFile + .contentType (mimetype) + .filename + .filesize (int, bytes) + .md5 + .persistentId + .description + .label (filename?) + .version + +Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> + +Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3) + +Dataverse refs: +- 'doi' and 'hdl' are the two persistentId styles +- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt new file mode 100644 index 0000000..540dc9f --- /dev/null +++ b/notes/examples/html_test_journals.txt @@ -0,0 +1,153 @@ + +Good examples of journals to run HTML fulltext extraction on. 
+ +## Live Web + +d-lib magazine + live web + no longer active + http://www.dlib.org/back.html + +NLM technical bulletin + https://www.nlm.nih.gov/pubs/techbull/back_issues.html + +Genders + https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html + +firstmondays + live web; now OJS + +outhistory.org + +http://journal.sjdm.org/ + +http://whoosh.org/ + + +## Vanished (but wayback coverage) + +ohmylittledata + issn:2551-1289 + vanished + blog format + http://web.archive.org/web/20180421061156/https://ohmylittledata.com/ + +exquisit corpse + https://web.archive.org/web/20080521052400/http://corpse.org:80/ + +Journal of Mundane Behavior + https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya + ISSN: 1529-3041 + + defunct since ~2010 + simple HTML articles + references + http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm + http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm + +War Crimes + + PDF articles (not HTML) + http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/ + + +## DOAJ Test Articles (HTML) + + zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt + => 2,184,954 + + cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25 + 254817 link.springer.com + 145159 www.scielo.br + 78044 journal.frontiersin.org + 77394 www.frontiersin.org + 40849 www.dovepress.com + 19024 dergipark.org.tr + 18758 periodicos.ufsc.br + 16346 www.revistas.usp.br + 15872 revistas.unal.edu.co + 15527 revistas.ucm.es + 13669 revistas.usal.es + 12640 dergipark.gov.tr + 12111 journals.rudn.ru + 11839 www.scielosp.org + 11277 www.karger.com + 10827 www.journals.vu.lt + 10318 + 9854 peerj.com + 9100 ojs.unud.ac.id + 8581 jurnal.ugm.ac.id + 8261 riviste.unimi.it + 8012 journals.uran.ua + 7454 revistas.pucp.edu.pe + 7264 journals.vgtu.lt + 7200 
publicaciones.banrepcultural.org + + cat html_fulltext_urls.txt \ + | rg -v link.springer.com \ + | rg -v scielo \ + | rg -v dergipark.gov.tr \ + | rg -v frontiersin.org \ + > html_fulltext_urls.filtered.txt + => 1,579,257 + + zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt + => 560k + + cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25 + 40849 www.dovepress.com + 10570 journals.rudn.ru + 10494 dergipark.org.tr + 10233 revistas.unal.edu.co + 9981 dergipark.gov.tr + 9428 revistas.usal.es + 8292 revistas.ucm.es + 7200 publicaciones.banrepcultural.org + 6953 revistas.pucp.edu.pe + 6000 www.scielosp.org + 5962 www.scielo.br + 5621 www.richtmann.org + 5123 scielo.sld.cu + 5067 ojs.unud.ac.id + 4838 periodicos.ufsc.br + 4736 revistasonlinepre.inap.es + 4486 journal.fi + 4221 www.seer.ufu.br + 3553 revistas.uam.es + 3492 revistas.pucsp.br + 3060 www.scielo.org.co + 2991 scielo.isciii.es + 2802 seer.ufrgs.br + 2692 revistas.unc.edu.ar + 2685 srl.si + + cat html_fulltext_urls.no_doi.txt \ + | rg -v link.springer.com \ + | rg -v scielo \ + | rg -v dergipark.gov.tr \ + | rg -v frontiersin.org \ + > html_fulltext_urls.no_doi.filtered.txt + => 518,608 + + zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20 + https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795 + https://journal.umy.ac.id/index.php/st/article/view/3297 + https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442 + http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf + http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440 + https://journal.fi/inf/article/view/59430 + http://journal.uii.ac.id/index.php/Eksakta/article/view/2429 + 
https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS + https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157 + http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce + http://revistas.pucp.edu.pe/index.php/themis/article/view/11862 + http://journal.bdfish.org/index.php/fisheries/article/view/91 + https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567 + https://www.lithosphere.ru/jour/article/view/779 + https://journals.hioa.no/index.php/seminar/article/view/2412 + http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197 + https://www.kmuj.kmu.edu.pk/article/view/15698 + http://forodeeducacion.com/ojs/index.php/fde/article/view/82 + https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941 + http://grbs.library.duke.edu/article/view/3361 + diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md new file mode 100644 index 0000000..b69132c --- /dev/null +++ b/notes/examples/random_datasets.md @@ -0,0 +1,19 @@ + +Possible external datasets to ingest (which are not entire platforms): + +- https://research.google/tools/datasets/ +- https://openslr.org/index.html +- https://www.kaggle.com/datasets?sort=votes&tasks=true +- https://archive.ics.uci.edu/ml/datasets.php + +Existing archive.org datasets to ingest: + +- https://archive.org/details/allthemusicllc-datasets + +Papers on archive.org to ingest: + +- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=> +- <https://archive.org/details/biorxiv> +- <https://archive.org/details/philosophicaltransactions?tab=collection> +- <https://archive.org/search.php?query=doi%3A%2A> +- <https://archive.org/details/folkscanomy_academic> diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md index fded7b3..ac808dd 100644 --- a/notes/ingest/2021-09-02_oai_pmh_patch.md +++ 
b/notes/ingest/2021-09-02_oai_pmh_patch.md @@ -1506,8 +1506,8 @@ possible to detect these at ingest time, or earlier at OAI-PMH harvest/transform time and filter them out. It may be worthwhile to attempt ingest of multiple existing captures -(timestamps) in the ingest pipeline. Eg, isntead of chosing a single "best" -capture, if therea are multiple HTTP 200 status captures, try ingest with each +(timestamps) in the ingest pipeline. Eg, instead of chosing a single "best" +capture, if there are multiple HTTP 200 status captures, try ingest with each (or at least a couple). This is because repository software gets upgraded, so old "no-capture" or "not found" or "link loop" type captures may work when recrawled. diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md index f63e524..d36f427 100644 --- a/notes/ingest/2021-09-03_patch_crawl.md +++ b/notes/ingest/2021-09-03_patch_crawl.md @@ -482,7 +482,197 @@ Note that this is just seedlists, not full ingest requests. Then run the actual patch crawl! 
-## Ingest Requests for Bulk Retry +## Ingest Requests for Bulk Retry (2022-01-06) + +Crawl has just about completed, so running another round of bulk ingest +requests, slightly updated to allow `https://doi.org/10*` in terminal URL: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.updated <= '2022-01-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + ) + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 
'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + AND ingest_file_result.terminal_url NOT LIKE 
'%figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json'; + => 4,488,193 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json + => DONE + + cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => TIMEDOUT + => (probably due to re-assignment) + => DONE + +## Stats Again (just OAI-PMH) + +OAI-PMH query: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 
'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + +On 2022-02-08: + + status | count + -----------------------+---------- + success | 13505143 + no-pdf-link | 8741007 + no-capture | 4429986 + redirect-loop | 1566611 + terminal-bad-status | 816162 + link-loop | 459006 + wrong-mimetype | 448983 + null-body | 71871 + cdx-error | 19055 + | 15275 + petabox-error | 11713 + blocked-cookie | 11664 + wayback-error | 8745 + skip-url-blocklist | 7828 + max-hops-exceeded | 2031 + wayback-content-error | 338 + body-too-large | 280 + spn2-error:job-failed | 191 + bad-redirect | 134 + redirects-exceeded | 120 + (20 rows) + + +On 2022-02-28, after bulk ingest completed: + + status | count + -----------------------+---------- + success | 14668123 + no-pdf-link | 8822460 + no-capture | 2987565 + redirect-loop | 1629015 + terminal-bad-status | 917851 + wrong-mimetype | 466512 + link-loop | 460941 + null-body | 71457 + cdx-error | 19636 + petabox-error | 16198 + | 15275 + blocked-cookie | 11885 + wayback-error | 8779 + skip-url-blocklist | 7838 + empty-blob | 5906 + max-hops-exceeded | 5563 + wayback-content-error | 355 + body-too-large | 329 + spn2-error:job-failed | 191 + bad-redirect | 137 + (20 rows) + + +Comparing to a couple months ago: + + 14668123-13258356 = +1,409,767 success + 
8822460-8685519 = + 136,941 no-pdf-link + 2987565-4765663 = -1,778,098 no-capture + 917851-803373 = + 114,478 terminal-bad-status -TODO: for each of the link sources mentioned at top, do a separate query by -source to re-ingest. diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md new file mode 100644 index 0000000..786c3b2 --- /dev/null +++ b/notes/ingest/2021-12-13_datasets.md @@ -0,0 +1,504 @@ + +First round of production dataset ingest. Aiming to get one or two small +repositories entirely covered, and a few thousand datasets from all supported +platforms. + +Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up +to a TByte of content locally (on spinning disk). For successful output, will +run through fatcat import; for a subset of unsuccessful, will start a small +heritrix crawl. + + +## Ingest Generation + +Summary: + + wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json + 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json + 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json + 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json + 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json + 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json + +All the below ingest requests were combined into a single large file: + + cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz + # 24.7k 0:00:00 [91.9k/s] + +### Figshare + +- sample 10k datasets (not other types) +- want only "versioned" DOIs; use regex on DOI to ensure + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \ + | rg '10\.6084/m9\.figshare\.\d+.v\d+' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json + # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000}) + +### Zenodo + +- has DOIs (of 
course) +- want only "versioned" DOIs? how to skip? +- sample 10k + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \ + | rg '10\.5281/zenodo' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json + +### Goettingen Research Online + +- <https://data.goettingen-research-online.de/> +- Dataverse instance, not harvard-hosted +- ~1,400 datasets, ~10,500 files +- has DOIs +- `doi_prefix:10.25625`, then filter to only one slash + + ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \ + | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \ + | shuf \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json + # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s] + +### Harvard Dataverse + +- main harvard dataverse instance, many "sub-dataverses" +- ~137,000 datasets, ~1,400,000 files +- 10k sample + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \ + | rg '10\.7910/dvn/[a-z0-9]{6}' \ + | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json + # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s] + +Note that this was fewer than expected, but moving on anyways. + +### archive.org + +A couple hand-filtered items. 
+ +"CAT" dataset +- item: <https://archive.org/details/CAT_DATASET> +- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui` + +"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing" +- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62 +- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper) + + + { + "ingest_type": "dataset", + "ingest_request_source": "savepapernow", + "base_url": "https://archive.org/details/CAT_DATASET", + "release_stage": "published", + "fatcat": { + "release_ident": "36vy7s5gtba67fmyxlmijpsaui", + "work_ident": "ycqtbhnfmzamheq2amztiwbsri" + }, + "ext_ids": {}, + "link_source": "spn", + "link_source_id": "36vy7s5gtba67fmyxlmijpsaui" + } + { + "ingest_type": "dataset", + "ingest_request_source": "savepapernow", + "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62", + "release_stage": "published", + "fatcat": { + "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu", + "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu" + }, + "ext_ids": {}, + "link_source": "spn", + "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu" + } + + # paste and then Ctrl-D: + cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json + + +## Ingest Command + +On `wbgrp-svc263`. + +In the current version of tool, `skip_cleanup_local_files=True` by default, so +files will stick around. + +Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output. 
+ + + # first a small sample + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | head -n5 \ + | pv -l \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json + + # ok, run the whole batch through + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | pv -l \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json + +Got an error: + + internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`? + +Did a hot patch to try to have the uploads happen under a session, with config from ENV, but didn't work: + + AttributeError: 'ArchiveSession' object has no attribute 'upload' + +Going to hack with config in homedir for now. + +Extract URLs for crawling: + + cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \ + | rg '"no-capture"' \ + | rg -v '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request.base_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt + + cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \ + | rg '"no-capture"' \ + | rg '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | rg '"web-' \ + | jq .manifest[].terminal_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt + +### Exceptions Encountered + + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process + internetarchive.upload + [...] 
+ ConnectionResetError: [Errno 104] Connection reset by peer + urllib3.exceptions.ProtocolError + requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5') + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process + archive_result = strategy_helper.process(dataset_meta) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process + r.raise_for_status() + File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status + raise HTTPError(http_error_msg, response=self) + requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201 + +download sometimes just slowly time out, like after a day or more + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process + archive_result = strategy_helper.process(dataset_meta) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process + file_meta = gen_file_metadata_path(local_path, allow_empty=True) + File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path + mimetype = magic.Magic(mime=True).from_file(path) + File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 
111, in from_file + with _real_open(filename): + FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz' + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process + dataset_meta = platform_helper.process_request(request, resource, html_biblio) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request + obj_latest = obj["data"]["latestVersion"] + KeyError: 'latestVersion' + +Fixed the above, trying again: + + git log | head -n1 + # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c + + Wed Dec 15 21:57:42 UTC 2021 + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json + +Zenodo seems really slow, let's try filtering those out: + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | rg -v 10.5281 \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json + # 3.76k 15:12:53 [68.7m/s] + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | rg -v 10.5281 \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json + +## Fatcat Import + + wc -l ingest_dataset_combined_results*.json + 126 ingest_dataset_combined_results2.json + 153 
ingest_dataset_combined_results3.json + 275 ingest_dataset_combined_results4.json + 3762 ingest_dataset_combined_results5.json + 7736 ingest_dataset_combined_results6.json + 182 ingest_dataset_combined_results.json + 5 ingest_dataset_combined_results.ramp.json + 12239 total + + cat ingest_dataset_combined_results*.json \ + | rg '^\{' \ + | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \ + | sort \ + | uniq --check-chars 26 \ + | cut -f2 \ + | rg -v '\\\\' \ + | pv -l \ + > uniq_ingest_dataset_combined_results.json + # 9.48k 0:00:06 [1.54k/s] + + cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr + 7941 no-capture + 374 platform-404 + 369 terminal-bad-status + 348 success-file + 172 success + 79 platform-scope + 77 error-platform-download + 47 empty-manifest + 27 platform-restricted + 20 too-many-files + 12 redirect-loop + 6 error-archiveorg-upload + 3 too-large-size + 3 mismatch + 1 no-platform-match + + cat uniq_ingest_dataset_combined_results.json \ + | rg '"success' \ + | jq 'select(.status == "success") | .' -c \ + > uniq_ingest_dataset_combined_results.success.json + + cat uniq_ingest_dataset_combined_results.json \ + | rg '"success' \ + | jq 'select(.status == "success-file") | .' -c \ + > uniq_ingest_dataset_combined_results.success-file.json + +On fatcat QA instance: + + git log | head -n1 + # commit cca680e2cc4768a4d45e199f6256a433b25b4075 + + head /tmp/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /tmp/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0}) + +Need to update fatcat file worker to support single-file filesets... was that the plan? 
+ + head /tmp/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0}) + +Trying again 2022-03-23: + + git log | head -n1 + # commit 134cb050988be2c545af89e0a67c4998307bb819 + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0}) + + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0}) + + head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0}) + +Fixed a small logic error in insert path. 
+ + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0}) + +archive.org datasets are *not* getting uploaded with the correct path. path +directory prefixes are getting clobbered. + +## Summary + +As follow-up, it may be worth doing another manual round of ingest requests. +After that, would be good to fill in "glue" code so that this can be done with +kafka workers, and do re-tries/dumps using sandcrawler SQL database. Then can +start scaling up more ingest, using ingest tool, "bulk mode" processing, +heritrix crawls from `no-capture` dumps, etc, similar to bulk file ingest +process. + +For scaling, let's do a "full" ingest request generation of all datasets, and +crawl the base URL with heritrix, in fast/direct mode. Expect this to be tens +of millions of mostly DOIs (doi.org URLs), should crawl quickly. + +Then, do bulk downloading with ingest worker, perhaps on misc-vm or aitio. +uploading large datasets to archive.org, but not doing SPN web requests. Feed +the resulting huge file seedlist into a heritrix crawl to download web files. + +Will need to add support for more specific platforms. + + +### Huge Bulk Ingest Prep + +On prod instance: + + ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz + # Expecting 11264787 release objects in search queries + # TIMEOUT ERROR + # 6.07M 19:13:02 [87.7 /s] (partial) + +As follow-up, should do a full batch (not partial). For now search index is too +unreliable (read timeouts). + + zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \ + | jq .base_url -r \ + | sort -u \ + | shuf \ + | awk '{print "F+ " $1}' \ + > ingest_dataset_bulk.2022-01-05.partial.schedule + +## Retries (2022-01-12) + +This is after having done a bunch of crawling. 
+ + cat ingest_dataset_combined_results6.json \ + | rg '"no-capture"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request -c \ + | pv -l \ + > ingest_dataset_retry.json + => 6.51k 0:00:01 [3.55k/s] + + cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json + +## Retries (2022-02) + +Finally got things to complete end to end for this batch! + + cat ingest_dataset_retry_results5.json | jq .status -r | sort | uniq -c | sort -nr + 3220 terminal-bad-status + 2120 no-capture + 380 empty-manifest + 264 success-file + 251 success + 126 success-existing + 39 mismatch + 28 error-platform-download + 24 too-many-files + 20 platform-scope + 13 platform-restricted + 13 mismatch-size + 6 too-large-size + 3 transfer-encoding-error + 2 no-platform-match + 2 error-archiveorg-upload + 1 redirect-loop + 1 empty-blob + +Some more URLs to crawl: + + cat ingest_dataset_retry_results5.json \ + | rg '"no-capture"' \ + | rg -v '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request.base_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.base_url.txt + # 1.00 + # just a single DOI that failed to crawl, for whatever reason + + cat ingest_dataset_retry_results5.json \ + | rg '"no-capture"' \ + | rg '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | rg '"web-' \ + | jq .manifest[].terminal_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt + +These are ready to crawl, in the existing dataset crawl. + + cat /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt \ + | sort -u \ + | shuf \ + | awk '{print "F+ " $1}' \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule + +## Running Uploads Again + +Looks like the temporary download files got wiped on `wbgrp-svc263`. 
This is a +big bummer! Will need to download many of these over again. + + # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316 + # skip_cleanup_local_files=True is still default + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json + + # filter out zenodo, very slow: + # rg -v 10.5281 \ diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md new file mode 100644 index 0000000..941519f --- /dev/null +++ b/notes/ingest/2022-01-06_patch_crawl.md @@ -0,0 +1,398 @@ + +Starting another paper fulltext patch crawl, targeting recent OA content which +has failed to ingest, and platforms (arxiv, etc). + +Specifically: + +- "daily" changelog ingest requests from all time, which failed with various status codes +- pdf no-capture +- SPN errors +- terminal-bad-status with 5xx, 429 +- gateway-timeout +- html no-capture +- html-resource-no-capture + +Most of these are dumped in a single complex query (below). + +TODO: html-resource-no-capture (from error message? or do SPN requests separately?)
+ + +## Initial 'no-capture' Seedlist + +Dump terminal URLs (will do ingest requests later, using similar command): + + COPY ( + SELECT ingest_file_result.terminal_url + -- SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'oai' + OR ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id 
NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND 
ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json'; + ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt'; + => COPY 6389683 + +TODO: filter out archive.org/www.archive.org + + cat patch_terminal_url.2022-01-12.txt \ + | rg -v www.archive.org \ + | rg '://' \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | rg -i '^http' \ + | sort -u -S 4G \ + | pv -l \ + > patch_terminal_url.2022-01-12.uniq.txt + => 5.73M 0:00:47 [ 120k/s] + + # note: tweaks and re-ran the above after inspecting this output + cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25 + 799045 doi.org + 317557 linkinghub.elsevier.com + 211091 arxiv.org + 204334 iopscience.iop.org + 139758 dialnet.unirioja.es + 130331 www.scielo.br + 124626 www.persee.fr + 85764 digitalrepository.unm.edu + 83913 www.mdpi.com + 79662 www.degruyter.com + 75703 www.e-periodica.ch + 72206 dx.doi.org + 69068 escholarship.org + 67848 idus.us.es + 57907 zenodo.org + 56624 ir.opt.ac.cn + 54983 projecteuclid.org + 52226 rep.bntu.by + 48376 osf.io + 48009 pubs.rsc.org + 46947 publikationen.ub.uni-frankfurt.de + 45564 www.research-collection.ethz.ch + 45153 dk.um.si + 43313 www.ssoar.info + 40543 scholarworks.umt.edu + +TODO: cleanup ingest request table in sandcrawler-db: +- remove filtered OAI-PMH prefixes +- remove any invalid `base_url` (?) 
+ +## More Seedlist (2022-02-08) + + COPY ( + SELECT ingest_file_result.terminal_url + -- SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'oai' + OR ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' 
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json'; + ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt'; + => COPY 444764 + + cat patch_terminal_url.2022-02-08.txt \ + | rg -v www.archive.org \ + | rg '://' \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | rg -i '^http' \ + | sort -u -S 4G \ + | pv -l \ + > patch_terminal_url.2022-02-08.uniq.txt + => 426k 0:00:04 [ 103k/s] + + cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25 + 60123 www.degruyter.com + 59314 arxiv.org + 43674 zenodo.org + 17771 doi.org + 9501 linkinghub.elsevier.com + 9379 www.mdpi.com + 5691 opendata.uni-halle.de + 5578 scholarlypublishingcollective.org + 5451 era.library.ualberta.ca + 4982 www.cairn.info + 4306 www.taylorfrancis.com + 4189 papers.ssrn.com + 4157 apps.crossref.org + 4089 www.sciencedirect.com + 4033 mdpi-res.com + 3763 dlc.mpg.de + 3408 osf.io + 2603 www.frontiersin.org + 2594 watermark.silverchair.com + 2569 journals.lww.com + 1787 underline.io + 1680 archiviostorico.fondazione1563.it + 1658 www.jstage.jst.go.jp + 1611 cyberleninka.ru + 1535 www.schoeningh.de + + cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule + => Done + +Copied to crawler svc206 and added to frontier. + + +## Bulk Ingest Requests (2022-02-28) + +Note that we are skipping OAI-PMH here, because we just did a separate ingest +for those. + +This is going to dump many duplicate lines (same `base_url`, multiple +requests), but that is fine. Expecting something like 7 million rows. 
+ + COPY ( + -- SELECT ingest_file_result.terminal_url + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ingest_file_result.updated <= '2022-02-08' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + -- ingest_request.link_source = 'oai' + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json'; + # COPY 3053219 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json + => DONE + + cat /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md new file mode 100644 index 0000000..a6f08dd --- /dev/null +++ b/notes/ingest/2022-01-13_doi_crawl.md @@ -0,0 +1,248 @@ + +Could roll this in to current patch crawl instead of starting a new crawl from scratch. + +This file is misnamed; these are mostly non-DOI-specific small updates. + +## KBART "almost complete" experimentation + +Random 10 releases: + + cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}' + https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone + https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed + https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works + https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern) + https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. 
force re-crawl worked for one copy + https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success + https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref + https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success + https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success + https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed + +Try some more! + + https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success + https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success? + https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry + https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site + https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI + https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success + https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success + https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken + https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub) + https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success + + +## Seeds: fixed OJS URLs + +Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like: + +- `no-pdf-link` with terminal URL like `/article/view/` +- `redirect-loop` with terminal URL like `/article/view/` + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 
'no-pdf-link' + AND ( + ingest_file_result.terminal_url LIKE '%/article/view/%' + OR ingest_file_result.terminal_url LIKE '%/article/download/%' + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json'; + => COPY 326577 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json + cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Done/running. + + COPY ( + SELECT ingest_file_result.terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ( + ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'link-loop' + ) + AND ( + ingest_file_result.terminal_url LIKE '%/article/view/%' + OR ingest_file_result.terminal_url LIKE '%/article/download/%' + ) + ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt'; + => COPY 342415 + + cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule + +Done/seeded. 
+ +## Seeds: scitemed.com + +Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article` + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_file_result.terminal_url LIKE '%/article/view/%' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json'; + # SKIPPED + +Actually there are very few of these. + +## Seeds: non-OA paper DOIs + +There are many DOIs out there which are likely to be from small publishers, on +the web, and would ingest just fine (eg, in OJS). + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count + 30,938,106 + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count + 6,664,347 + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count + 8,258,111 + +Do the 8 million first, then maybe try the 30.9 million later? Do sampling to +see how many are actually accessible? From experience with KBART generation, +many of these are likely to crawl successfully. 
+ + ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz + # re-running 2022-02-08 after this VM was upgraded + # Expecting 8321448 release objects in search queries + # DONE + +This is large enough that it will probably be a bulk ingest, and then probably +a follow-up crawl. + +## Seeds: HTML and XML links from HTML biblio + + kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \ + | pv -l \ + | rg '"(html|xml)_fulltext_url"' \ + | rg '"no-pdf-link"' \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.json.gz + + # cut this off at some point? gzip is terminated weird + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz | wc -l + # gzip: ingest_file_result_fulltext_urls.2022-01-13.json.gz: unexpected end of file + # 2,538,433 + +Prepare seedlists (to include in heritrix patch crawl): + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \ + | jq .html_biblio.xml_fulltext_url -r \ + | rg '://' \ + | sort -u -S 4G \ + | pv -l \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz + # 1.24M 0:01:35 [12.9k/s] + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \ + | jq .html_biblio.html_fulltext_url -r \ + | rg '://' \ + | sort -u -S 4G \ + | pv -l \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz + # 549k 0:01:27 [6.31k/s] + + zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \ + | cut -f3 -d/ \ + | sort -S 4G \ + | uniq -c \ + | sort -nr \ + | head -n20 + + 534005 dlc.library.columbia.edu + 355319 www.degruyter.com + 196421 zenodo.org + 101450 serval.unil.ch + 100631 biblio.ugent.be + 47986 digi.ub.uni-heidelberg.de + 39187 www.emerald.com + 33195 www.cairn.info + 25703 boris.unibe.ch + 
19516 journals.openedition.org + 15911 academic.oup.com + 11091 repository.dl.itc.u-tokyo.ac.jp + 9847 oxfordworldsclassics.com + 9698 www.thieme-connect.de + 9552 www.idunn.no + 9265 www.zora.uzh.ch + 8030 www.scielo.br + 6543 www.hanspub.org + 6229 asmedigitalcollection.asme.org + 5651 brill.com + + zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \ + | awk '{print "F+ " $1}' \ + > ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + + wc -l ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + 1785901 ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + +Added to `JOURNALS-PATCH-CRAWL-2022-01` + +## Seeds: most doi.org terminal non-success + +Unless it is a 404, should retry. + +TODO: generate this list + +## Non-OA DOI Bulk Ingest + +Had previously run: + + cat ingest_nonoa_doi.json.gz \ + | rg -v "doi.org/10.2139/" \ + | rg -v "doi.org/10.1021/" \ + | rg -v "doi.org/10.1121/" \ + | rg -v "doi.org/10.1515/" \ + | rg -v "doi.org/10.1093/" \ + | rg -v "europepmc.org" \ + | pv -l \ + | gzip \ + > nonoa_doi.filtered.ingests.json.gz + # 7.35M 0:01:13 [99.8k/s] + +Starting a bulk ingest of these on 2022-03-18, which is *before* the crawl has +entirely finished, but after almost all queues (domains) have been done for +several days. + + zcat nonoa_doi.filtered.ingests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Looks like many jstage `no-capture` status; these are still (slowly) crawling. 
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md new file mode 100644 index 0000000..9722459 --- /dev/null +++ b/notes/ingest/2022-03_doaj.md @@ -0,0 +1,278 @@ + +plan: +- usual setup and dump ingest requests +- filter ingest requests to targetted ccTLDs, and add those to crawl first + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz' + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz + # 9.08M 0:37:38 [4.02k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373}) + + +## Check Pre-Crawl Status + +2022-03-09, before the above load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 2919808 + html | wrong-scope | 1098998 + pdf | no-pdf-link | 481532 + pdf | redirect-loop | 429006 + html | success | 342501 + html | unknown-scope | 225390 + html | redirect-loop | 223927 + html | html-resource-no-capture | 187762 + html | no-capture | 185418 + pdf | no-capture | 171273 + pdf | 
null-body | 129028 + html | null-body | 100296 + pdf | terminal-bad-status | 91551 + pdf | link-loop | 25447 + html | wrong-mimetype | 22640 + html | wayback-content-error | 19028 + html | terminal-bad-status | 13327 + pdf | wrong-mimetype | 7688 + xml | success | 6897 + html | petabox-error | 5529 + pdf | wayback-error | 2706 + xml | null-body | 2353 + pdf | | 2063 + pdf | wayback-content-error | 1349 + html | cdx-error | 1169 + pdf | cdx-error | 1130 + pdf | petabox-error | 679 + html | | 620 + pdf | empty-blob | 562 + html | blocked-cookie | 545 + (30 rows) + +After the above load: + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3036457 + pdf | | 1623208 + html | | 1208412 + html | wrong-scope | 1108132 + pdf | no-pdf-link | 485703 + pdf | redirect-loop | 436085 + html | success | 342594 + html | unknown-scope | 225412 + html | redirect-loop | 223927 + html | html-resource-no-capture | 187999 + html | no-capture | 187310 + pdf | no-capture | 172033 + pdf | null-body | 129266 + html | null-body | 100296 + pdf | terminal-bad-status | 91799 + pdf | link-loop | 26933 + html | wrong-mimetype | 22643 + html | wayback-content-error | 19028 + html | terminal-bad-status | 13327 + xml | | 11196 + pdf | wrong-mimetype | 7929 + xml | success | 6897 + html | petabox-error | 5530 + pdf | wayback-error | 2707 + xml | null-body | 2353 + pdf | wayback-content-error | 1353 + pdf | cdx-error | 1177 + html | cdx-error | 1172 + pdf | petabox-error | 771 + pdf | empty-blob | 562 + (30 rows) + +Dump ingest requests for crawling (or bulk ingest first?): + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR 
ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json'; + => COPY 353819 + +Not that many! Guess the filters are important? 
+ + SELECT COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ); + => 3202164 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json + => 353k 0:00:16 [21.0k/s] + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Dump seeds again (for crawling): + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE 
'%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json'; + # COPY 350661 + +And stats again: + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3037059 + pdf | | 1623208 + html | | 1208412 + html | wrong-scope | 1108476 + pdf | no-pdf-link | 485705 + pdf | redirect-loop | 436850 + html | success | 342762 + html | unknown-scope | 225412 + html | redirect-loop | 224683 + html | html-resource-no-capture | 188058 + html | no-capture | 185734 + pdf | no-capture | 170452 + pdf | null-body | 129266 + html | null-body | 100296 + pdf | terminal-bad-status | 91875 + pdf | link-loop | 26933 + html | wrong-mimetype | 22643 + html | wayback-content-error | 19042 + html | terminal-bad-status | 13333 + xml | | 11196 + pdf | wrong-mimetype | 7929 + xml | success | 6898 + html | petabox-error | 5535 + pdf | wayback-error | 2711 + xml | null-body | 2353 + pdf | wayback-content-error | 1353 + pdf | cdx-error | 1177 + html | cdx-error | 1172 + pdf | petabox-error | 772 + html | blocked-cookie | 769 + (30 rows) + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json + +Create seedlist: + 
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \ + | jq -r .base_url \ + | sort -u -S 4G \ + > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt + +Sent off and added to `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl, will +re-ingest when that completes (a week or two?). + + +## Bulk Ingest + +After `TARGETED-ARTICLE-CRAWL-2022-03` wrap-up. + + # 2022-03-22 + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md new file mode 100644 index 0000000..d2a8d71 --- /dev/null +++ b/notes/ingest/2022-03_oaipmh.md @@ -0,0 +1,40 @@ + +Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl. + +Note that Martin excluded many Indonesian endpoints, will need to follow-up on +those. + +## Prep + +Fetch metadata snapshot: + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst + +Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large): + + zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \ + | rg -v 'oai:kb.dk:' \ + | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \ + | rg -v 'oai:hispana.mcu.es:' \ + | rg -v 'oai:bnf.fr:' \ + | rg -v 'oai:ukm.si:' \ + | rg -v 'oai:biodiversitylibrary.org:' \ + | rg -v 'oai:hsp.org:' \ + | rg -v 'oai:repec:' \ + | rg -v 'oai:n/a:' \ + | rg -v 'oai:quod.lib.umich.edu:' \ + | rg -v 'oai:americanae.aecid.es:' \ + | rg -v 'oai:www.irgrid.ac.cn:' \ + | rg -v 'oai:espace.library.uq.edu:' \ + | rg -v 'oai:edoc.mpg.de:' \ + | rg -v 'oai:bibliotecadigital.jcyl.es:' \ + | rg -v 'oai:repository.erciyes.edu.tr:' \ + | rg -v 'oai:krm.or.kr:' \ + | ./scripts/oai2ingestrequest.py - \ + | 
pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz + +These failed to transform in the expected way; a change in JSON schema from last time? diff --git a/notes/ingest/2022-04_targeted.md b/notes/ingest/2022-04_targeted.md new file mode 100644 index 0000000..23fd35f --- /dev/null +++ b/notes/ingest/2022-04_targeted.md @@ -0,0 +1,144 @@ + +Want to do a crawl similar to recent "patch" crawls, where we run heritrix +crawls to "fill in" missing (`no-capture`) and failed daily ingests (aka, +those requests coming from fatcat-changelog). + + export PATCHDATE=2022-04-20 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-04 + +## Seedlist Query + +Terminal URLs dump: + + COPY ( + SELECT row_to_json(t) FROM ( + SELECT ingest_file_result.terminal_url, ingest_request.* + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + -- AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status LIKE 'spn2-%' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'dblp' + OR 
ingest_request.link_source = 'pmc' + -- OR ingest_request.link_source = 'unpaywall' + -- OR ingest_request.link_source = 'oai' + ) + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) t + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-04-20.rows.json'; + # COPY 
4842749 + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \ + | rg -v "\\\\" \ + | jq -r .terminal_url \ + | rg '://' \ + | rg -i '^http' \ + | rg -v www.archive.org \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt + # 4.75M 0:01:44 [45.4k/s] + + # check top domains + cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25 + 1515829 www.jstage.jst.go.jp + 1052953 doi.org + 241704 arxiv.org + 219543 www.sciencedirect.com + 178562 www.persee.fr + 84947 zenodo.org + 67397 www.mdpi.com + 65775 journals.lww.com + 58216 opg.optica.org + 50673 osf.io + 45776 www.degruyter.com + 36664 www.indianjournals.com + 35287 pubs.rsc.org + 33495 www.bmj.com + 33320 www.research-collection.ethz.ch + 29728 www.e-periodica.ch + 28338 iopscience.iop.org + 26364 www.cambridge.org + 23840 onlinelibrary.wiley.com + 23641 platform.almanhal.com + 22660 brill.com + 20288 www.osapublishing.org + 18561 cgscholar.com + 18539 doi.nrct.go.th + 15677 www.frontiersin.org + + cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule + + scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + +TODO: starting with the "quarterly retry" script/query might make more sense? +TODO: are there any cases where we do a bulk ingest request, fail, and `terminal_url` is not set? 
+ +## Bulk Ingest Requests (post-crawl) + + cd /srv/sandcrawler/src/python + sudo su sandcrawler + pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json + => 4.84M 0:03:14 [24.9k/s] + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => started 2022-05-11 diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md new file mode 100644 index 0000000..bc78998 --- /dev/null +++ b/notes/ingest/2022-04_unpaywall.md @@ -0,0 +1,278 @@ + +New unpaywall snapshot from `2022-03-09`. + +This will probably be the last unpaywall crawl? Will switch to openalex in the +future, because we can automate that ingest process, and run it on our own +schedule. + + export SNAPSHOT=2022-03-09 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=UNPAYWALL-CRAWL-2022-04 + +## Download and Archive + + wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz' + # 2022-04-09 22:31:43 (98.9 KB/s) - ‘unpaywall_snapshot_2022-03-09T083001.jsonl.gz’ saved [29470830470/29470830470] + + export SNAPSHOT=2022-03-09 + ia upload unpaywall_snapshot_$SNAPSHOT unpaywall_snapshot_$SNAPSHOT*.jsonl.gz -m title:"Unpaywall Metadata Snapshot ($SNAPSHOT)" -m collection:ia_biblio_metadata -m creator:creator -m date:$SNAPSHOT + + # if needed + scp unpaywall_snapshot_$SNAPSHOT*.jsonl.gz wbgrp-svc506.us.archive.org:/srv/sandcrawler/tasks + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + cd /srv/sandcrawler/src/python + sudo su sandcrawler + pipenv shell + + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT*.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > 
/srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json + # 34.9M 3:02:32 [3.19k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + # 34.9M 5:23:15 [1.80k/s] + # Worker: Counter({'total': 34908779, 'insert-requests': 6129630, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 34908779, 'pushed': 34908779}) + +So about 6.1M new ingest request rows. + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- take "all time" instead of just this recent capture + -- AND date(ingest_request.created) > '2021-01-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2022-03-09.rows.json'; + => COPY 6025671 + + # transform + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json + # 6.03M 0:03:26 [29.1k/s] + + # enqueue for bulk processing + cat /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2022-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 3330232 + success | 2455102 + redirect-loop | 197117 + terminal-bad-status | 82618 + no-pdf-link | 33046 + blocked-cookie | 16078 + link-loop | 6745 + wrong-mimetype | 3416 + wayback-error | 1385 + empty-blob | 1142 + cdx-error | 820 + body-too-large | 292 + bad-gzip-encoding | 281 + wayback-content-error | 267 + | 253 + petabox-error | 215 + skip-url-blocklist | 185 + null-body | 179 + spn2-cdx-lookup-failure | 89 + gateway-timeout | 73 + (20 rows) + +After prior "TARGETED" crawl and bulk ingest finished: + + status | count + -------------------------+--------- + no-capture | 3330055 + success | 2455279 + redirect-loop | 197117 + terminal-bad-status | 82618 + no-pdf-link | 33046 + blocked-cookie | 16079 + link-loop | 6745 + wrong-mimetype | 3416 + wayback-error | 1385 + empty-blob | 1142 + cdx-error | 820 + body-too-large | 292 + bad-gzip-encoding | 281 + wayback-content-error | 267 + | 253 + petabox-error | 215 + skip-url-blocklist | 185 + null-body | 179 + spn2-cdx-lookup-failure | 89 + gateway-timeout | 73 + (20 rows) + +Almost no change, which makes sense because of the `ingest_request.created` +filter. 
+ + +## Dump Seedlist + +Dump rows for crawling: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + -- AND date(ingest_request.created) > '2022-04-01' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status LIKE 'spn2-%' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO 
'/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json'; + => before ingest and arxiv.org DOI exclusion: COPY 3309091 + => COPY 3308914 + + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json + => 3.31M 0:02:22 [23.2k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT* + 15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt + 3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json + 3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt + 3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt + +Inject seedlist into crawler: + + scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + +Top domains? 
+ + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c | sort -nr | head -n20 + 158497 www.scielo.br + 144732 onlinelibrary.wiley.com + 129349 www.researchsquare.com + 94923 hal.archives-ouvertes.fr + 69293 openresearchlibrary.org + 64584 www.cell.com + 60033 link.springer.com + 50528 www.degruyter.com + 49737 projecteuclid.org + 45841 www.jstage.jst.go.jp + 44819 www.mdpi.com + 44325 ieeexplore.ieee.org + 38091 dr.lib.iastate.edu + 31030 www.nature.com + 30300 discovery.ucl.ac.uk + 27692 ntrs.nasa.gov + 24215 orca.cardiff.ac.uk + 23653 www.frontiersin.org + 23474 pure.rug.nl + 22660 www.sciencedirect.com + + +## Post-Crawl bulk ingest + + # enqueue for bulk processing + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # done: 2022-07-06 + +## Post-Crawl, Post-Ingest Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2022-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 4784948 => +2,329,669 ~77% + redirect-loop | 485270 => + 288,153 ~10% + no-capture | 317598 => -3,012,457 + terminal-bad-status | 267853 => + 185,235 ~ 6% + no-pdf-link | 118303 => + 85,257 + blocked-cookie | 111373 => + 95,294 + skip-url-blocklist | 19368 + link-loop | 9091 + wrong-mimetype | 7163 + cdx-error | 2516 + empty-blob | 1961 + wayback-error | 1922 + body-too-large | 509 + petabox-error | 416 + wayback-content-error | 341 + bad-gzip-encoding | 281 + | 253 + null-body | 179 + spn2-cdx-lookup-failure | 89 
+ gateway-timeout | 73 + (20 rows) + +Groovy! diff --git a/notes/ingest/2022-07-15_ingest_fixes.md b/notes/ingest/2022-07-15_ingest_fixes.md new file mode 100644 index 0000000..ec31a7d --- /dev/null +++ b/notes/ingest/2022-07-15_ingest_fixes.md @@ -0,0 +1,831 @@ + +## HTML `html-resource-no-capture` Fixes + +Tracing down some `html-resource-no-capture` issues. Eg, `javascript:` resources causing errors. + +SQL query: + + select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' limit 100; + select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' order by random() limit 100; + + select count(*) from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture'; + => 210,528 + +http://agroengineering.it/index.php/jae/article/view/568/609 +- old capture, from `20171017204935` +- missing .css file; seems like an actual case of missing content? +- TODO: re-crawl/re-ingest when CDX is old + +https://www.karger.com/Article/FullText/484130 +- missing: https://www.karger.com/WebMaterial/ShowThumbnail/895999?imgType=2 +- resource is live +- this was from DOI-LANDING crawl, no resources captured +- TODO: re-crawl + +https://www.mdpi.com/1996-1073/13/21/5563/htm +- missing: https://www.mdpi.com/1996-1073/13/21/5563/htm +- common crawl capture; no/few resources? 
+- TODO: re-crawl + +http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-736X2013000500011&lng=en&tlng=en +- missing: http://www.scielo.br/img/revistas/pvb/v33n5/a11tab01.jpg + not on live web +- old (2013) wide crawl +- TODO: re-crawl + +http://g3journal.org/lookup/doi/10.1534/g3.116.027730 +- missing: http://www.g3journal.org/sites/default/files/highwire/ggg/6/8/2553/embed/mml-math-4.gif +- old 2018 landing crawl (no resources) +- TODO: re-crawl + +https://www.frontiersin.org/articles/10.3389/fimmu.2020.576134/full +- "error_message": "revisit record missing URI and/or DT: warc:abc.net.au-news-20220328-130654/IA-FOC-abc.net.au-news-20220618135308-00003.warc.gz offset:768320762" +- specific URL: https://www.frontiersin.org/areas/articles/js/app?v=uC9Es8wJ9fbTy8Rj4KipiyIXvhx7XEVhCTHvIrM4ShA1 +- archiveteam crawl +- seems like a weird corner case. look at more 'frontiersin' articles, and re-crawl this page + +https://www.frontiersin.org/articles/10.3389/fonc.2020.01386/full +- WORKING + +https://doi.org/10.4000/trajectoires.2317 +- redirect: https://journals.openedition.org/trajectoires/2317 +- missing: "https://journals.openedition.org/trajectoires/Ce fichier n'existe pas" (note spaces) +- FIXED + +http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S1413-81232002000200008&lng=en&tlng=en +- WORKING + +https://f1000research.com/articles/9-571/v2 +- petabox-error on 'https://www.recaptcha.net/recaptcha/api.js' +- added recaptcha.net to blocklist +- still needs a re-crawl +- SPN capture, from 2020, but images were missing? 
+- re-capture has images (though JS still wonky)
+- TODO: re-crawl with SPN2
+
+http://bio.biologists.org/content/4/9/1163
+- DOI LANDING crawl, no sub-resources
+- TODO: recrawl
+
+http://err.ersjournals.com/content/26/145/170039.full
+- missing: http://err.ersjournals.com/sites/default/files/highwire/errev/26/145/170039/embed/graphic-5.gif
+  on live web
+- 2017 targeted heritrix crawl
+- TODO: recrawl
+
+http://www.dovepress.com/synthesis-characterization-and-antimicrobial-activity-of-an-ampicillin-peer-reviewed-article-IJN
+- missing: https://www.dovepress.com/cr_data/article_fulltext/s61000/61143/img/IJN-61143-F02-Thumb.jpg
+- recent archiveteam crawl
+- TODO: recrawl
+
+http://journals.ed.ac.uk/lithicstudies/article/view/1444
+- missing: http://journals.ed.ac.uk/lithicstudies/article/download/1444/2078/6081
+- common crawl
+- TODO: recrawl
+
+http://medisan.sld.cu/index.php/san/article/view/495
+- missing: http://ftp.scu.sld.cu/galen/medisan/logos/redib.jpg
+- this single resource is legit missing
+
+seems like it probably isn't a bad idea to just re-crawl all of these with fresh SPNv2 requests
+
+request sources:
+- fatcat-changelog (doi)
+- fatcat-ingest (doi)
+- doaj
+
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'html'
+            AND ingest_file_result.status = 'html-resource-no-capture'
+            AND (
+                ingest_request.link_source = 'doi'
+                OR ingest_request.link_source = 'doaj'
+            )
+    ) TO '/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json';
+    => COPY 210749
+
+    ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json
+
+Try a sample of 300:
+
+    shuf -n300 
/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + +Seeing a bunch of: + + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fphys.2020.00454/full","https://www.frontiersin.org/articles/10.3389/fphys.2020.00454/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"] + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fmicb.2019.02507/full","https://www.frontiersin.org/articles/10.3389/fmicb.2019.02507/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"] + ["doaj","wayback-content-error","https://www.mdpi.com/2218-1989/10/9/366","https://www.mdpi.com/2218-1989/10/9/366/htm","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:964129887"] + + "error_message": "revisit record missing URI and/or DT: warc:online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz offset:751923069", + + + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fnins.2020.00724/full","https://www.frontiersin.org/articles/10.3389/fnins.2020.00724/full","wayback payload sha1hex mismatch: 20220715222216 https://static.frontiersin.org/areas/articles/js/app?v=DfnFHSIgqDJBKQy2bbQ2S8vWyHe2dEMZ1Lg9o6vSS1g1"] + +These seem to be transfer encoding issues; fixed? 
+
+    ["doaj","html-resource-no-capture","http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S0021-25712013000400003&lng=en&tlng=en","https://scielosp.org/article/aiss/2013.v49n4/336-339/en/","HTML sub-resource not found: https://ssm.scielo.org/media/assets/css/scielo-print.css"]
+
+Full batch:
+
+    # TODO: cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Not running the full batch for now, because these are almost all `wayback-content-error` issues.
+
+    cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | wc -l
+    114935
+
+    cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+
+## Redirect Loops
+
+Seems like there might have been a bug in how ingest pipeline dealt with
+multiple redirects (eg, 301 to 302 or vice-versa), due to how CDX lookups and
+normalization were happening.
+
+This could be a really big deal because we have over 11 million such ingest
+requests! and may even have stopped crawling domains on the basis of redirect
+looping.
+
+    select * from ingest_file_result where ingest_type = 'pdf' and status = 'redirect-loop' limit 50;
+
+http://ieeexplore.ieee.org/iel7/7259950/7275573/07275755.pdf
+- 'skip-url-blocklist'
+- paywall on live web
+
+http://www.redjournal.org/article/S0360301616308276/pdf
+- redirect to 'secure.jbs.elsevierhealth.com'
+- ... but re-crawling with SPNv2 worked
+- TODO: reingest this entire journal with SPNv2
+
+http://www.jmirs.org/article/S1939865415001551/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+http://www.cell.com/article/S0006349510026147/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- TODO: try SPNv2? 
+- RECRAWL: success + +http://infoscience.epfl.ch/record/256431/files/SPL_2018.pdf +- FIXED: success + +http://www.nature.com/articles/hdy1994143.pdf +- blocked-cookie (idp.nature.com / cookies_not_supported) +- RECRAWL: gateway-timeout + +http://www.thelancet.com/article/S0140673619327606/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL: success + +https://pure.mpg.de/pubman/item/item_2065970_2/component/file_2065971/Haase_2014.pdf +- FIXED: success + +http://hdl.handle.net/21.11116/0000-0001-B1A2-F +- FIXED: success + +http://repositorio.ufba.br/ri/bitstream/ri/6072/1/%2858%29v21n6a03.pdf +- FIXED: success + +http://www.jto.org/article/S1556086416329999/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL spn2: success + +http://www.jahonline.org/article/S1054139X16303020/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL spn2: success + +So, wow wow wow, a few things to do here: + +- just re-try all these redirect-loop attempts to update status +- re-ingest all these elsevierhealth blocked crawls with SPNv2. this could take a long time! + +Possibly the elsevierhealth stuff will require some deeper fiddling to crawl +correctly. 
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_file_result.status = 'redirect-loop'
+            -- AND ingest_request.ingest_type = 'pdf'
+            AND (
+                ingest_request.link_source = 'doi'
+                OR ingest_request.link_source = 'doaj'
+                OR ingest_request.link_source = 'unpaywall'
+            )
+    ) TO '/srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json';
+    => COPY 6611342
+
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json
+
+Start with a sample:
+
+    shuf -n200 /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Wow that is a lot of ingest! And a healthy fraction of 'success', almost all
+via unpaywall (maybe should have done DOAJ/DOI only first). Let's do this full
+batch:
+
+    cat /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+TODO: repeat with broader query (eg, OAI-PMH, MAG, etc).
+
+## Other
+
+Revisit resolution failed: \"Didn't get exact CDX url/datetime match. 
url:https://www.cairn.info/static/images//logo/logo-cairn-negatif.png dt:20220430145322 got:CdxRow(surt='info,cairn)/static/images/logo/logo-cairn-negatif.png', datetime='20220430145322', url='https://www.cairn.info/static/images/logo/logo-cairn-negatif.png', mimetype='image/png', status_code=200, sha1b32='Y3VQOPO2NFUR2EUWNXLYGYGNZPZLQYHU', sha1hex='c6eb073dda69691d12966dd78360cdcbf2b860f4', warc_csize=10875, warc_offset=2315284914, warc_path='archiveteam_archivebot_go_20220430212134_59230631/old.worldurbancampaign.org-inf-20220430-140628-acnq5-00000.warc.gz')\"" + + https://www.cairn.info/static/images//logo/logo-cairn-negatif.png 20220430145322 + https://www.cairn.info/static/images/logo/logo-cairn-negatif.png 20220430145322 + +Fixed! + + +## Broken WARC Record? + +cdx line: + + net,cloudfront,d1bxh8uas1mnw7)/assets/embed.js 20220716084026 https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js warc/revisit - U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB - - 660 751923069 online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz + +download WARC and run: + + zcat IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz | rg d1bxh8uas1mnw7.cloudfront.net/assets/embed.js -a -C 20 + +the WARC record: + + WARC/1.0 + WARC-Type: revisit + WARC-Target-URI: https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js + WARC-Date: 2022-07-16T08:40:26Z + WARC-Payload-Digest: sha1:U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB + WARC-IP-Address: 13.227.21.220 + WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + WARC-Truncated: length + WARC-Record-ID: <urn:uuid:cc79139e-d43f-4b43-9b9e-f923610344d0> + Content-Type: application/http; msgtype=response + Content-Length: 493 + + HTTP/1.1 200 OK + Content-Type: application/javascript + Content-Length: 512 + Connection: close + Last-Modified: Fri, 22 Apr 2022 08:45:38 GMT + Accept-Ranges: bytes + Server: AmazonS3 + Date: Fri, 15 Jul 2022 16:36:08 GMT + ETag: 
"1c28db48d4012f0221b63224a3bb7137" + Vary: Accept-Encoding + X-Cache: Hit from cloudfront + Via: 1.1 5b475307685b5cecdd0df414286f5438.cloudfront.net (CloudFront) + X-Amz-Cf-Pop: SFO20-C1 + X-Amz-Cf-Id: SIRR_1LT8mkp3QVaiGYttPuomxyDfJ-vB6dh0Slg_qqyW0_WwnA1eg== + Age: 57859 + +where are the `WARC-Refers-To-Target-URI` and `WARC-Refers-To-Date` lines? + +## osf.io + + select status, terminal_status_code, count(*) from ingest_file_result where base_url LIKE 'https://doi.org/10.17605/osf.io/%' and ingest_type = 'pdf' group by status, terminal_status_code order by count(*) desc limit 30; + + status | terminal_status_code | count + -------------------------+----------------------+------- + terminal-bad-status | 404 | 92110 + no-pdf-link | 200 | 46932 + not-found | 200 | 20212 + no-capture | | 8599 + success | 200 | 7604 + redirect-loop | 301 | 2125 + terminal-bad-status | 503 | 1657 + cdx-error | | 1301 + wrong-mimetype | 200 | 901 + terminal-bad-status | 410 | 364 + read-timeout | | 167 + wayback-error | | 142 + gateway-timeout | | 139 + terminal-bad-status | 500 | 76 + spn2-error | | 63 + spn2-backoff | | 42 + petabox-error | | 39 + spn2-backoff | 200 | 27 + redirect-loop | 302 | 19 + terminal-bad-status | 400 | 15 + terminal-bad-status | 401 | 15 + remote-server-error | | 14 + timeout | | 11 + terminal-bad-status | | 11 + petabox-error | 200 | 10 + empty-blob | 200 | 8 + null-body | 200 | 6 + spn2-error:unknown | | 5 + redirect-loop | 308 | 4 + spn2-cdx-lookup-failure | | 4 + (30 rows) + +Many of these are now non-existant, or datasets/registrations not articles. +Hrm. 
+ + +## Large DOAJ no-pdf-link Domains + + SELECT + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain, + COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_request.base_url = ingest_file_result.base_url + WHERE + ingest_file_result.status = 'no-pdf-link' + AND ingest_request.link_source = 'doaj' + GROUP BY + domain + ORDER BY + COUNT(*) DESC + LIMIT 50; + + domain | count + -------------------------------------------------------+-------- + www.sciencedirect.com | 211090 + auth.openedition.org | 20741 + journal.frontiersin.org:80 | 11368 + journal.frontiersin.org | 6494 + ejde.math.txstate.edu | 4301 + www.arkat-usa.org | 4001 + www.scielo.br | 3736 + www.lcgdbzz.org | 2892 + revistas.uniandes.edu.co | 2715 + scielo.sld.cu | 2612 + www.egms.de | 2488 + journals.lww.com | 2415 + ter-arkhiv.ru | 2239 + www.kitlv-journals.nl | 2076 + www.degruyter.com | 2061 + jwcn-eurasipjournals.springeropen.com | 1929 + www.cjcnn.org | 1908 + www.aimspress.com | 1885 + vsp.spr-journal.ru | 1873 + dx.doi.org | 1648 + www.dlib.si | 1582 + aprendeenlinea.udea.edu.co | 1548 + www.math.u-szeged.hu | 1448 + dergipark.org.tr | 1444 + revistas.uexternado.edu.co | 1429 + learning-analytics.info | 1419 + drive.google.com | 1399 + www.scielo.cl | 1326 + www.economics-ejournal.org | 1267 + www.jssm.org | 1240 + html.rhhz.net | 1232 + journalofinequalitiesandapplications.springeropen.com | 1214 + revistamedicina.net | 1197 + filclass.ru | 1154 + ceramicayvidrio.revistas.csic.es | 1152 + gynecology.orscience.ru | 1126 + www.tobaccoinduceddiseases.org | 1090 + www.tandfonline.com | 1046 + www.querelles-net.de | 1038 + www.swjpcc.com | 1032 + microbiologyjournal.org | 1028 + revistas.usal.es | 1027 + www.medwave.cl | 1023 + ijtech.eng.ui.ac.id | 1023 + www.scielo.sa.cr | 1021 + vestnik.szd.si | 986 + www.biomedcentral.com:80 | 984 + scielo.isciii.es | 983 + bid.ub.edu | 970 + 
www.meirongtv.com | 959 + (50 rows) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ejde.math.txstate.edu%' limit 5; + http://ejde.math.txstate.edu/Volumes/2018/30/abstr.html + http://ejde.math.txstate.edu/Volumes/2012/137/abstr.html + http://ejde.math.txstate.edu/Volumes/2016/268/abstr.html + http://ejde.math.txstate.edu/Volumes/2015/194/abstr.html + http://ejde.math.txstate.edu/Volumes/2014/43/abstr.html + # plain HTML, not really parse-able + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.arkat-usa.org%' limit 5; + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0013.909 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0007.717 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.p008.158 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0014.216 + # fixed (embed PDF) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.scielo.br%' limit 5; + https://doi.org/10.5935/0034-7280.20200075 + https://doi.org/10.5935/0004-2749.20200071 + https://doi.org/10.5935/0034-7280.20200035 + http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1516-44461999000400014 + https://doi.org/10.5935/0034-7280.20200047 + # need recrawls? 
+ # then success + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.lcgdbzz.org%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://revistas.uniandes.edu.co%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://scielo.sld.cu%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.egms.de%' limit 5; + https://doi.org/10.3205/16dgnc020 + http://nbn-resolving.de/urn:nbn:de:0183-19degam1126 + http://www.egms.de/en/meetings/dgpraec2019/19dgpraec032.shtml + http://www.egms.de/en/meetings/dkou2019/19dkou070.shtml + http://nbn-resolving.de/urn:nbn:de:0183-20nrwgu625 + # mostly abstracts, don't have PDF versions + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ter-arkhiv.ru%' limit 5; + https://doi.org/10.26442/terarkh201890114-47 + https://doi.org/10.26442/00403660.2019.12.000206 + https://journals.eco-vector.com/0040-3660/article/download/32246/pdf + https://journals.eco-vector.com/0040-3660/article/download/33578/pdf + https://doi.org/10.26442/00403660.2019.12.000163 + # working, needed recrawls (some force re-crawls) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.kitlv-journals.nl%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.cjcnn.org%' limit 5; + + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.dlib.si%' limit 5; + https://srl.si/ojs/srl/article/view/2910 + https://srl.si/ojs/srl/article/view/3640 + 
https://srl.si/ojs/srl/article/view/2746 + https://srl.si/ojs/srl/article/view/2557 + https://srl.si/ojs/srl/article/view/2583 + # fixed? (dlib.si) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.jssm.org%' limit 5; + http://www.jssm.org/vol4/n4/8/v4n4-8text.php + http://www.jssm.org/vol7/n1/19/v7n1-19text.php + http://www.jssm.org/vol9/n3/10/v9n3-10text.php + http://www.jssm.org/abstresearcha.php?id=jssm-14-347.xml + http://www.jssm.org/vol7/n2/11/v7n2-11text.php + # works as an HTML document? otherwise hard to select on PDF link + + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://filclass.ru%' limit 5; + https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism + https://filclass.ru/en/archive/2015/42/training-as-an-effective-form-of-preparation-for-the-final-essay + https://filclass.ru/en/archive/2020/vol-25-3/didaktizatsiya-literatury-rossijskikh-nemtsev-zanyatie-po-poeme-viktora-klyajna-jungengesprach + https://filclass.ru/en/archive/2015/40/the-communicative-behaviour-of-the-russian-intelligentsia-and-its-reflection-in-reviews-as-a-genre-published-in-online-literary-journals-abroad + https://filclass.ru/en/archive/2016/46/discoursive-means-of-implication-of-instructive-components-within-the-anti-utopia-genre + # fixed + # TODO: XXX: re-crawl/ingest + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://microbiologyjournal.org%' limit 5; + https://microbiologyjournal.org/the-relationship-between-the-type-of-infection-and-antibiotic-resistance/ + https://microbiologyjournal.org/antimicrobial-resistant-shiga-toxin-producing-escherichia-coli-isolated-from-ready-to-eat-meat-products-and-fermented-milk-sold-in-the-formal-and-informal-sectors-in-harare-zimbabwe/ + 
https://microbiologyjournal.org/emerging-antibiotic-resistance-in-mycoplasma-microorganisms-designing-effective-and-novel-drugs-therapeutic-targets-current-knowledge-and-futuristic-prospects/ + https://microbiologyjournal.org/microbiological-and-physicochemicalpropertiesofraw-milkproduced-from-milking-to-delivery-to-milk-plant/ + https://microbiologyjournal.org/association-of-insulin-based-insulin-resistance-with-liver-biomarkers-in-type-2-diabetes-mellitus/ + # HTML article, no PDF + # ... but only sometimes + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.medwave.cl%' limit 5; + http://www.medwave.cl/link.cgi/Medwave/Perspectivas/Cartas/6878 + https://www.medwave.cl/link.cgi/Medwave/Revisiones/RevisionClinica/8037.act + http://dx.doi.org/10.5867/medwave.2012.03.5332 + https://www.medwave.cl/link.cgi/Medwave/Estudios/Casos/7683.act + http://www.medwave.cl/link.cgi/Medwave/Revisiones/CAT/5964 + # HTML article, no PDF + +Re-ingest HTML: + + https://fatcat.wiki/container/mafob4ewkzczviwipyul7knndu (DONE) + https://fatcat.wiki/container/6rgnsrp3rnexdoks3bxcmbleda (DONE) + +Re-ingest PDF: + + doi_prefix:10.5935 (DONE) + doi_prefix:10.26442 + +## More Scielo + +More scielo? `doi_prefix:10.5935 in_ia:false` + + http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873 + # OJS? 
fixed + + https://revistas.unicentro.br/index.php/repaa/article/view/2667/2240 + # working, but needed re-crawl + + http://www.rbcp.org.br/details/2804/piezoelectric-preservative-rhinoplasty--an-alternative-approach-for-treating-bifid-nose-in-tessier-no--0-facial-cleft + +A few others, mostly now working + +## Recent OA DOIs + + fatcat-cli search release 'is_oa:true (type:article-journal OR type:article OR type:paper-conference) !doi_prefix:10.5281 !doi_prefix:10.6084 !doi_prefix:10.48550 !doi_prefix:10.25446 !doi_prefix:10.25384 doi:* date:>2022-06-15 date:<2022-07-15 in_ia:false !publisher_type:big5' --index-json --limit 0 | pv -l > recent_missing_oa.json + + wc -l recent_missing_oa.json + 24433 + + cat recent_missing_oa.json | jq .doi_prefix -r | sort | uniq -c | sort -nr | head + 4968 10.3390 + 1261 10.1080 + 687 10.23668 + 663 10.1021 + 472 10.1088 + 468 10.4000 + 367 10.3917 + 357 10.1364 + 308 10.4230 + 303 10.17863 + + cat recent_missing_oa.json | jq .doi_registrar -r | sort | uniq -c | sort -nr + 19496 crossref + 4836 datacite + 101 null + + cat recent_missing_oa.json | jq .publisher_type -r | sort | uniq -c | sort -nr + 9575 longtail + 8419 null + 3861 society + 822 unipress + 449 oa + 448 scielo + 430 commercial + 400 repository + 22 other + 7 archive + + cat recent_missing_oa.json | jq .publisher -r | sort | uniq -c | sort -nr | head + 4871 MDPI AG + 1107 Informa UK (Taylor & Francis) + 665 EAG-Publikationen + 631 American Chemical Society + 451 IOP Publishing + 357 The Optical Society + 347 OpenEdition + 309 CAIRN + 308 Schloss Dagstuhl - Leibniz-Zentrum für Informatik + 303 Apollo - University of Cambridge Repository + + cat recent_missing_oa.json | jq .container_name -r | sort | uniq -c | sort -nr | head + 4908 null + 378 Sustainability + 327 ACS Omega + 289 Optics Express + 271 International Journal of Environmental Research and Public Health + 270 International Journal of Health Sciences + 238 Sensors + 223 International Journal of Molecular 
Sciences + 207 Molecules + 193 Proceedings of the National Academy of Sciences of the United States of America + + cat recent_missing_oa.json \ + | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \ + | wc -l + 16558 + + cat recent_missing_oa.json | rg -i mdpi | shuf -n10 | jq .doi -r + 10.3390/molecules27144419 + => was a 404 + => recrawl was successful + 10.3390/math10142398 + => was a 404 + 10.3390/smartcities5030039 + => was a 404 + +Huh, we need to re-try/re-crawl MDPI URLs every week or so? Or special-case this situation. +Could be just a fatcat script, or a sandcrawler query. + + cat recent_missing_oa.json \ + | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \ + | shuf -n10 | jq .doi -r + + https://doi.org/10.18452/24860 + => success (just needed quarterly retry?) + => b8c6c86aebd6cd2d85515441bbce052bcff033f2 (not in fatcat.wiki) + => current status is "bad-redirect" + https://doi.org/10.26181/20099540.v1 + => success + => 3f9b1ff2a09f3ea9051dbbef277579e8a0b4df30 + => this is figshare, and versioned. PDF was already attached to another DOI: https://doi.org/10.26181/20099540 + https://doi.org/10.4230/lipics.sea.2022.22 + => there is a bug resulting in trailing slash in `citation_pdf_url` + => fixed as a quirks mode + => emailed to report + https://doi.org/10.3897/aca.5.e89679 + => success + => e6fd1e066c8a323dc56246631748202d5fb48808 + => current status is 'bad-redirect' + https://doi.org/10.1103/physrevd.105.115035 + => was 404 + => success after force-recrawl of the terminal URL (not base URL) + https://doi.org/10.1155/2022/4649660 + => was 404 + => success after force-recrawl (of base_url) + https://doi.org/10.1090/spmj/1719 + => paywall (not actually OA) + => https://fatcat.wiki/container/x6jfhegb3fbv3bcbqn2i3espiu is on Szczepanski list, but isn't all OA? 
+ https://doi.org/10.1139/as-2022-0011 + => was no-pdf-link + => fixed fulltext URL extraction + => still needed to re-crawl terminal PDF link? hrm + https://doi.org/10.31703/grr.2022(vii-ii).02 + => was no-pdf-link + => fixed! success + https://doi.org/10.1128/spectrum.00154-22 + => was 404 + => now repeatably 503, via SPN + https://doi.org/10.51601/ijersc.v3i3.393 + => 503 server error + https://doi.org/10.25416/ntr.20137379.v1 + => is figshare + => docx (not PDF) + https://doi.org/10.25394/pgs.20263698.v1 + => figshare + => embargo'd + https://doi.org/10.24850/j-tyca-14-4-7 + => was no-pdf-link + => docs.google.com/viewer (!) + => now handle this (success) + https://doi.org/10.26267/unipi_dione/1832 + => was bad-redirect + => success + https://doi.org/10.25560/98019 + => body-too-large + => also, PDF metadata fails to parse + => is actually like 388 MByte + https://doi.org/10.14738/abr.106.12511 + => max-hops-exceeded + => bumped max-hops from 6 to 8 + => then success (via google drive) + https://doi.org/10.24350/cirm.v.19933803 + => video, not PDF + https://doi.org/10.2140/pjm.2022.317.67 + => link-loop + => not actually OA + https://doi.org/10.26265/polynoe-2306 + => was bad-redirect + => now success + https://doi.org/10.3389/fpls.2022.826875 + => frontiers + => was terminal-bad-status (403) + => success on retry (not sure why) + => maybe this is also a date-of-publication thing? + => not sure all these should be retried though + https://doi.org/10.14198/medcom.22240 + => was terminal-bad-status (404) + => force-recrawl resulted in an actual landing page, but still no-pdf-link + => but actual PDF is a real 404, it seems. oh well + https://doi.org/10.31729/jnma.7579 + => no-capture + https://doi.org/10.25373/ctsnet.20146931.v2 + => figshare + => video, not document or PDF + https://doi.org/10.1007/s42600-022-00224-0 + => not yet crawled/attempted (!) 
+ => springer + => not actually OA + https://doi.org/10.37391/ijeer.100207 + => some upstream issue (server not found) + https://doi.org/10.1063/5.0093946 + => aip.scitation.org, is actually OA (can download in browser) + => cookie trap? + => redirect-loop (seems like a true redirect loop) + => retrying the terminal PDF URL seems to have worked + https://doi.org/10.18502/jchr.v11i2.9998 + => no actual fulltext on publisher site + https://doi.org/10.1128/spectrum.01144-22 + => this is a 503 error, even after retrying. weird! + +DONE: check `publisher_type` in chocula for: +- "MDPI AG" +- "Informa UK (Taylor & Francis)" + + cat recent_missing_oa.json | jq '[.publisher, .publisher_type]' -c | sort | uniq -c | sort -nr | head -n40 + 4819 ["MDPI AG","longtail"] + 924 ["Informa UK (Taylor & Francis)",null] + 665 ["EAG-Publikationen",null] + 631 ["American Chemical Society","society"] + 449 ["IOP Publishing","society"] + 357 ["The Optical Society","society"] + 336 ["OpenEdition","oa"] + 309 ["CAIRN","repository"] + 308 ["Schloss Dagstuhl - Leibniz-Zentrum für Informatik",null] + 303 ["Apollo - University of Cambridge Repository",null] + 292 ["Springer (Biomed Central Ltd.)",null] + 275 ["Purdue University Graduate School",null] + 270 ["Suryasa and Sons","longtail"] + 257 ["La Trobe",null] + 216 ["Frontiers Media SA","longtail"] + 193 ["Proceedings of the National Academy of Sciences","society"] + 182 ["Informa UK (Taylor & Francis)","longtail"] + 176 ["American Physical Society","society"] + 168 ["Institution of Electrical Engineers","society"] + 166 ["Oxford University Press","unipress"] + 153 ["Loughborough University",null] + + chocula mostly seems to set these correctly. is the issue that the chocula + computed values aren't coming through or getting updated? probably. both + the release (from container) metadata update; and chocula importer not + doing updates based on this field; and some old/incorrect values. 
+ + did some cleanups of specific containers, and next chocula update should + result in a bunch more `publisher_type` getting populated on older + containers + + +TODO: verify URLs are actually URLs... somewhere? in the ingest pipeline + +TODO: fatcat: don't ingest figshare "work" DOIs, only the "versioned" ones (?) + doi_prefix:10.26181 + +WIP: sandcrawler: regularly (weekly?) re-try 404 errors (the terminal URL, not the base url?) (or, some kind of delay?) + doi_prefix:10.3390 (MDPI) + doi_prefix:10.1103 + doi_prefix:10.1155 + +DONE: simply re-ingest all: + doi_prefix:10.4230 + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf query 'doi_prefix:10.4230' + # Counter({'ingest_request': 2096, 'elasticsearch_release': 2096, 'estimate': 2096, 'kafka': 2096}) + container_65lzi3vohrat5nnymk3dqpoycy + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 65lzi3vohrat5nnymk3dqpoycy + # Counter({'ingest_request': 187, 'elasticsearch_release': 187, 'estimate': 187, 'kafka': 187}) + container_5vp2bio65jdc3blx6rfhp3chde + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 5vp2bio65jdc3blx6rfhp3chde + # Counter({'ingest_request': 83, 'elasticsearch_release': 83, 'estimate': 83, 'kafka': 83}) + +DONE: verify and maybe re-ingest all: + is_oa:true publisher:"Canadian Science Publishing" in_ia:false + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts 
wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --allow-non-oa --ingest-type pdf --force-recrawl query 'year:>2010 is_oa:true publisher:"Canadian Science Publishing" in_ia:false !journal:print' + # Counter({'ingest_request': 1041, 'elasticsearch_release': 1041, 'estimate': 1041, 'kafka': 1041}) + + +## Re-Ingest bad-redirect, max-hops-exceeded, and google drive + +Similar to `redirect-loop`: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.status = 'bad-redirect' + -- AND ingest_request.ingest_type = 'pdf' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json'; + # COPY 100011 + # after first run: COPY 5611 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.status = 'max-hops-exceeded' + -- AND ingest_request.ingest_type = 'pdf' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json'; + # COPY 3546 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.hit is false + AND ingest_file_result.terminal_url like 'https://docs.google.com/viewer%' + AND ( + 
ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json'; + # COPY 1082 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json + + cat /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + cat /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + cat /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + # DONE diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md new file mode 100644 index 0000000..74aeb8d --- /dev/null +++ b/notes/ingest/2022-07-19_dblp.md @@ -0,0 +1,50 @@ + +Cross-posting from fatcat bulk metadata update/ingest. + + zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 631k 0:00:11 [54.0k/s] + + +## Post-Crawl Stats + +This is after bulk ingest, crawl, and a bit of "live" re-ingest. 
Query run +2022-09-06: + + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'dblp' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-----------------------+-------- + pdf | success | 305142 + pdf | no-pdf-link | 192683 + pdf | no-capture | 42634 + pdf | terminal-bad-status | 38041 + pdf | skip-url-blocklist | 31055 + pdf | link-loop | 9263 + pdf | wrong-mimetype | 4545 + pdf | redirect-loop | 3952 + pdf | empty-blob | 2705 + pdf | wayback-content-error | 834 + pdf | wayback-error | 294 + pdf | petabox-error | 202 + pdf | blocked-cookie | 155 + pdf | cdx-error | 115 + pdf | body-too-large | 66 + pdf | bad-redirect | 19 + pdf | timeout | 7 + pdf | bad-gzip-encoding | 4 + (18 rows) + +That is quite a lot of `no-pdf-link`, might be worth doing a random sample +and/or re-ingest. And a chunk of `no-capture` to retry. diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md new file mode 100644 index 0000000..7e55633 --- /dev/null +++ b/notes/ingest/2022-07_doaj.md @@ -0,0 +1,199 @@ + +This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for +heritrix bulk crawling, along with JALC and DOAJ URLs. 
+ + export SNAPSHOT=2022-07-20 + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz" + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz + # 9.72M 0:36:28 [4.44k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # 9.72M 0:17:04 [9.49k/s] + # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097}) + +Stats after this load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3165539 + pdf | | 2078874 + html | | 1547698 + html | wrong-scope | 1114332 + pdf | no-pdf-link | 517261 + html | success | 388376 + html | unknown-scope | 242044 + pdf | no-capture | 179030 + pdf | terminal-bad-status | 174741 + html | no-capture | 155323 + pdf | null-body | 129267 + pdf | redirect-loop | 127136 + html | html-resource-no-capture | 117275 + html | null-body | 100296 + pdf | blocked-cookie | 71093 + html | redirect-loop | 65519 + html | terminal-bad-status | 64856 + html | blocked-cookie | 64095 + html | spn2-backoff | 55173 + pdf | link-loop | 27440 + html | 
wrong-mimetype | 26016 + html | wayback-content-error | 20109 + xml | | 13624 + pdf | wrong-mimetype | 8411 + xml | success | 6899 + html | petabox-error | 6199 + html | wayback-error | 5269 + html | spn2-cdx-lookup-failure | 4635 + html | spn2-recent-capture | 4527 + xml | null-body | 2353 + (30 rows) + +## Bulk Ingest + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + -- AND ingest_file_result.terminal_url NOT LIKE 
'%://archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json'; + # COPY 3962331 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json + # 3.96M 0:01:47 [36.7k/s] + +Top domains: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 789988 www.mdpi.com + 318142 www.frontiersin.org + 226316 link.springer.com + 204429 www.scielo.br + 201175 www.sciencedirect.com + 72852 ieeexplore.ieee.org + 68983 dx.doi.org + 33286 www.dovepress.com + 26020 elifesciences.org + 23838 www.cetjournal.it + 21102 mab-online.nl + 20242 www.revistas.usp.br + 16564 periodicos.uem.br + 15710 journals.openedition.org + 14514 dergipark.org.tr + 14072 apcz.umk.pl + 13924 ojs.minions.amsterdam + 13717 bmgn-lchr.nl + 13512 ojstest.minions.amsterdam + 10440 journals.asm.org + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # Done + +## Stats Again + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 4704006 + html | wrong-scope | 1761227 + html | success | 778165 + pdf | no-pdf-link | 759805 + html | no-capture | 382080 + html | unknown-scope | 313391 + html | html-resource-no-capture | 292953 + pdf | no-capture | 290311 + pdf | terminal-bad-status | 271776 + pdf | null-body | 129267 + pdf | blocked-cookie | 108491 + html | terminal-bad-status | 103014 + html | null-body | 100296 + html | blocked-cookie | 88533 + pdf | | 81517 + pdf | skip-url-blocklist | 76443 + html | spn2-backoff | 50615 + pdf | link-loop | 45516 + html | wrong-mimetype | 33525 + html | wayback-content-error | 25535 + pdf | empty-blob | 21431 + pdf | redirect-loop | 19795 + html | petabox-error | 18291 + html | empty-blob | 14391 + pdf | wrong-mimetype | 14084 + html | redirect-loop | 12856 + xml | success | 10381 + xml | no-capture | 10008 + html | skip-url-blocklist | 3294 + html | cdx-error | 3275 + (30 rows) + +Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k +PDFs with no attempt at all? Maybe a filter, or bogus URLs. + +Over 1.5M new PDF success over this crawl iteration period, nice. 
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md new file mode 100644 index 0000000..415f23b --- /dev/null +++ b/notes/ingest/2022-07_targeted.md @@ -0,0 +1,140 @@ + +Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs. + + export PATCHDATE=2022-07-29 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07 + +## Seedlist Query + +Terminal URLs dump: + + COPY ( + SELECT row_to_json(t) FROM ( + SELECT ingest_file_result.terminal_url, ingest_request.* + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + -- AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status LIKE 'spn2-%' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + OR ingest_file_result.terminal_status_code = 429 + ) + ) + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'dblp' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'pmc' + -- OR ingest_request.link_source = 'unpaywall' + -- OR ingest_request.link_source = 'oai' + ) + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE 
'%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) t + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json'; + => COPY 3524573 + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \ + | rg -v "\\\\" \ + | jq -r .terminal_url \ + | rg '://' \ + | rg -i '^http' \ + | rg -v '://10\.' \ + | rg -v '://172\.' 
\ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt + => 3.11M 0:01:08 [45.4k/s] + + # check top domains + cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25 + 624948 doi.org + 382492 www.jstage.jst.go.jp + 275087 www.mdpi.com + 157134 www.persee.fr + 108979 www.sciencedirect.com + 94375 www.scielo.br + 50834 onlinelibrary.wiley.com + 49991 journals.lww.com + 30354 www.frontiersin.org + 27963 doaj.org + 27058 www.e-periodica.ch + 24147 dl.acm.org + 23389 aclanthology.org + 22086 www.research-collection.ethz.ch + 21589 medien.die-bonn.de + 18866 www.ingentaconnect.com + 18583 doi.nrct.go.th + 18271 repositories.lib.utexas.edu + 17634 hdl.handle.net + 16366 archives.datapages.com + 15146 cgscholar.com + 13987 dl.gi.de + 13188 www.degruyter.com + 12503 ethos.bl.uk + 12304 preprints.jmir.org + + cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule + => done + + scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + + +## Re-Ingest + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json + => 3.52M 0:01:37 [36.2k/s] + +Ingest: + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md new file mode 100644 index 0000000..ac7c68f --- /dev/null +++ b/notes/ingest/2022-09_oaipmh.md @@ -0,0 +1,397 @@ + +Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921> + +I updated the transform script to block some additional domains. + + +## Prep + +Fetch the snapshot: + + cd /srv/sandcrawler/tasks/ + wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst + +Transform to ingest requests: + + cd /srv/sandcrawler/src/python + git log | head -n1 + # commit dfd4605d84712eccb95a63e50b0bcb343642b433 + + pipenv shell + zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \ + | ./scripts/oai2ingestrequest.py - \ + | pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz + # 16.1M 1:01:02 [4.38k/s] + +Curious about types, though this would probably be handled at fatcat ingest +time: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt + + head oai_type_counts.txt -n30 + 5623867 info:eu-repo/semantics/article + 5334928 info:eu-repo/semantics/publishedVersion + 3870359 text + 1240225 Text + 829169 Article + 769849 NonPeerReviewed + 665700 PeerReviewed + 648740 Peer-reviewed Article + 547857 article + 482906 info:eu-repo/semantics/bachelorThesis + 353814 Thesis + 329269 Student thesis + 262650 info:eu-repo/semantics/conferenceObject + 185354 Journal articles + 162021 info:eu-repo/semantics/doctoralThesis + 152079 Journal Article + 150226 Research Article + 130217 Conference papers + 127255 ArtÃculo revisado por pares + 124243 Newspaper + 123908 ##rt.metadata.pkp.peerReviewed## + 123309 Photograph + 122981 info:eu-repo/semantics/masterThesis + 116719 Book + 
108946 Image + 108216 Report + 107946 Other + 103562 masterThesis + 103038 info:eu-repo/semantics/other + 101404 StillImage + [...] + +And formats: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt + + head -n 20 oai_format_counts.txt + 11151928 application/pdf + 677413 text + 561656 text/html + 498518 image/jpeg + 231219 Text + 193638 text/xml + 147214 Image + 117073 image/jpg + 110872 pdf + 91323 image/tiff + 76948 bib + 75393 application/xml + 70244 Digitized from 35 mm. microfilm. + 68206 mods + 59227 PDF + 57677 application/epub+zip + 57602 application/octet-stream + 52072 text/plain + 51620 application/msword + 47227 audio/mpeg + +Also, just overall size (number of records): + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l + # 20,840,301 + +Next load in to sandcrawler DB: + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request - + + Traceback (most recent call last): + File "./persist_tool.py", line 311, in <module> + main() + File "./persist_tool.py", line 307, in main + args.func(args) + File "./persist_tool.py", line 119, in run_ingest_request + pusher.run() + File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run + self.worker.push_batch(batch) + File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch + resp = self.db.insert_ingest_request(self.cur, irequests) + File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request + resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) + File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values + cur.execute(b''.join(parts)) + psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx" + DETAIL: Index row references 
tuple (6893121,3) in relation "ingest_request". + HINT: Values larger than 1/3 of a buffer page cannot be indexed. + Consider a function index of an MD5 hash of the value, or use full text indexing. + 15.7M 0:41:48 [6.27k/s] + +Darn, this means we won't get reasonable stats about how many rows were +inserted/updated. + +Patched the persist tool to skip very long URLs, and ran again (backwards, just +URLs which didn't get inserted already): + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \ + | tac \ + | head -n1000000 \ + | pv -l \ + | ./persist_tool.py ingest-request - + # 1.00M 0:03:04 [5.41k/s] + # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0}) + +Status of just the new lines: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+--------- + | 6398455 + success | 540219 + no-pdf-link | 41316 + link-loop | 23871 + no-capture | 11350 + redirect-loop | 8315 + wrong-mimetype | 2394 + terminal-bad-status | 1540 + null-body | 1038 + cdx-error | 272 + empty-blob | 237 + petabox-error | 213 + wayback-error | 186 + blocked-cookie | 107 + timeout | 47 + wayback-content-error | 26 + spn2-cdx-lookup-failure | 21 + skip-url-blocklist | 16 + spn2-backoff | 15 + body-too-large | 13 + (20 rows) + + +## Bulk Ingest + +Should already have filtered domains/prefixes in transform script, so not +including filters here. 
+ + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ingest_file_result.status IS NULL + ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json'; + # COPY 6398455 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json + # 6.40M 0:02:18 [46.2k/s] + + cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # DONE + +Expect this ingest to take a week or so. + +Then, run stats again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 3617175 + success | 2775036 + no-pdf-link | 449298 + link-loop | 74260 + terminal-bad-status | 47819 + wrong-mimetype | 20195 + redirect-loop | 18197 + empty-blob | 12127 + cdx-error | 3038 + skip-url-blocklist | 2630 + wayback-error | 2599 + petabox-error | 2354 + wayback-content-error | 1617 + blocked-cookie | 1293 + null-body | 1038 + body-too-large | 670 + | 143 + bad-gzip-encoding | 64 + timeout | 47 + spn2-cdx-lookup-failure | 20 + (20 rows) + + +## Crawl Seedlist + + COPY ( + SELECT row_to_json(ingest_request.*) 
+ FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'terminal-bad-status' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'timeout' + OR ingest_file_result.status = 'wayback-content-error' + ) + ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json'; + => COPY 3692846 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json + => 3.69M 0:01:19 [46.6k/s] + +This will be used for re-ingest later. For now, extract URLs: + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | jq .base_url -r \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + => 3.66M 0:00:59 [61.8k/s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | rg '"terminal_url"' \ + | jq -r .result.terminal_url \ + | rg -v ^null$ \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + => 0.00 0:00:05 [0.00 /s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | awk '{print "F+ " $1}' \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + +What domains are we crawling? 
+ + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | sort -u -S 4G \ + | cut -d/ -f3 \ + | sort \ + | uniq -c \ + | sort -nr \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + + head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + 91899 raco.cat + 70116 islandora.wrlc.org + 68708 urn.kb.se + 63726 citeseerx.ist.psu.edu + 50370 publications.rwth-aachen.de + 44885 urn.nsk.hr + 38429 server15795.contentdm.oclc.org + 33041 periodicos.ufpb.br + 32519 nbn-resolving.org + 31990 www.ajol.info + 24745 hal.archives-ouvertes.fr + 22569 id.nii.ac.jp + 17239 tilburguniversity.on.worldcat.org + 15873 dspace.nbuv.gov.ua + 15436 digitalcommons.wustl.edu + 14885 www.iiste.org + 14623 www.manchester.ac.uk + 14033 nbn-resolving.de + 13999 opus4.kobv.de + 13689 www.redalyc.org + +Sizes: + + wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + +Copy seedlist to crawler: + + # as regular user + scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp + +## Post-Crawl Bulk Ingest + + # ran 2022-11-16, after crawl cleanup + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -----------------------+--------- + success | 4721164 +1,946,128 + no-pdf-link | 1116290 + no-capture | 673939 + terminal-bad-status | 232217 + link-loop | 148544 + wrong-mimetype | 68841 + redirect-loop | 26262 + empty-blob | 17759 + cdx-error | 6570 + blocked-cookie | 4026 + blocked-wall | 3054 + skip-url-blocklist | 2924 + body-too-large | 2404 + bad-redirect | 1565 + wayback-error | 1320 + petabox-error | 1083 + null-body | 1038 + wayback-content-error | 264 + bad-gzip-encoding | 150 + | 143 + (20 rows) + diff --git a/notes/ingest_domains.txt b/notes/ingest_domains.txt new file mode 100644 index 0000000..ae06272 --- /dev/null +++ b/notes/ingest_domains.txt @@ -0,0 +1,294 @@ + +## Queries to find broken domains + +Top domains with failed ingests: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + +Status overview for a particular domain: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT domain, terminal_status_code, COUNT((domain, terminal_status_code)) + FROM 
(SELECT terminal_status_code, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + AND t1.terminal_status_code is not null + GROUP BY domain, terminal_status_code + ORDER BY COUNT DESC; + +Sample recent failures: + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%osapublishing.org%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + + +## Failing + +www.osapublishing.org + + this publisher (The Optical Society) is systemically using a CAPTCHA to + gate access to PDFs. bummer! could ask them to white-list? + + has citation_pdf_url, so that isn't an issue + + status: "no-pdf-link" + hops: + "https://doi.org/10.1364/optica.6.000798", + "https://www.osapublishing.org/viewmedia.cfm?uri=optica-6-6-798&seq=0" + "https://www.osapublishing.org/captcha/?guid=830CEAB5-09BD-6140-EABD-751200C78B1C" + + domain | status | count + -----------------------+---------------------+------- + www.osapublishing.org | no-capture | 16680 + www.osapublishing.org | no-pdf-link | 373 + www.osapublishing.org | redirect-loop | 19 + www.osapublishing.org | terminal-bad-status | 5 + www.osapublishing.org | cdx-error | 1 + www.osapublishing.org | wrong-mimetype | 1 + www.osapublishing.org | spn-error | 1 + www.osapublishing.org | success | 1 + www.osapublishing.org | wayback-error | 1 + (9 rows) + +www.persee.fr + + Seems to be mostly blocking or rate-limiting? 
+ + domain | status | count + ---------------+-------------------------------------+------- + www.persee.fr | no-capture | 37862 + www.persee.fr | terminal-bad-status | 3134 + www.persee.fr | gateway-timeout | 2828 + www.persee.fr | no-pdf-link | 431 + www.persee.fr | spn-error | 75 + www.persee.fr | redirect-loop | 23 + www.persee.fr | success | 8 + www.persee.fr | spn2-error | 2 + www.persee.fr | spn2-error:soft-time-limit-exceeded | 1 + www.persee.fr | wrong-mimetype | 1 + (10 rows) + +journals.openedition.org + + PDF access is via "freemium" subscription. Get redirects to: + + https://auth.openedition.org/authorized_ip?url=http%3A%2F%2Fjournals.openedition.org%2Fnuevomundo%2Fpdf%2F61053 + + Content is technically open access (HTML and license; for all content?), + but can't be crawled as PDF without subscription. + + domain | status | count + --------------------------+-------------------------+------- + journals.openedition.org | redirect-loop | 29587 + journals.openedition.org | success | 6821 + journals.openedition.org | no-pdf-link | 1507 + journals.openedition.org | no-capture | 412 + journals.openedition.org | wayback-error | 32 + journals.openedition.org | wrong-mimetype | 27 + journals.openedition.org | terminal-bad-status | 13 + journals.openedition.org | spn2-cdx-lookup-failure | 4 + journals.openedition.org | spn-remote-error | 1 + journals.openedition.org | null-body | 1 + journals.openedition.org | cdx-error | 1 + (11 rows) + +journals.lww.com + + no-pdf-link + + domain | status | count + ------------------+----------------+------- + journals.lww.com | no-pdf-link | 11668 + journals.lww.com | wrong-mimetype | 131 + (2 rows) + + doi prefix: 10.1097 + + <meta name="wkhealth_pdf_url" content="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" /> + 
data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw==" + + Some weird thing going on, maybe they are blocking-via-redirect based on + our User-Agent? Seems like wget works, so funny that they don't block that. + +musewide.aip.de + + no-pdf-link + +koreascience.or.kr | no-pdf-link | 8867 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%osapublishing.org%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + +www.cairn.info | link-loop | 8717 + +easy.dans.knaw.nl | no-pdf-link | 8262 +scielo.conicyt.cl | no-pdf-link | 7925 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'scielo.conicyt.cl' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%scielo.conicyt.cl%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + + + domain | status | count + -------------------+---------------------+------- + scielo.conicyt.cl | no-pdf-link | 7926 + scielo.conicyt.cl | success | 4972 + scielo.conicyt.cl | terminal-bad-status | 1474 + scielo.conicyt.cl | wrong-mimetype | 6 + scielo.conicyt.cl | no-capture | 4 + 
scielo.conicyt.cl | null-body | 1 + + + pdf | https://doi.org/10.4067/s0370-41061980000300002 | 2020-02-22 23:55:56.235822+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0370-41061980000300002&lng=en&nrm=iso&tlng=en | 20200212201727 | 200 | + pdf | https://doi.org/10.4067/s0718-221x2019005000201 | 2020-02-22 23:01:49.070104+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-221X2019005000201&lng=en&nrm=iso&tlng=en | 20200214105308 | 200 | + pdf | https://doi.org/10.4067/s0717-75262011000200002 | 2020-02-22 22:49:36.429717+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-75262011000200002&lng=en&nrm=iso&tlng=en | 20200211205804 | 200 | + pdf | https://doi.org/10.4067/s0717-95022006000400029 | 2020-02-22 22:33:07.761766+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-95022006000400029&lng=en&nrm=iso&tlng=en | 20200209044048 | 200 | + + These seem, on retry, like success? Maybe previous was a matter of warc/revisit not getting handled correctly? + + pdf | https://doi.org/10.4067/s0250-71611998007100009 | 2020-02-22 23:57:16.481703+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0250-71611998007100009&lng=en&nrm=iso&tlng=en | 20200212122939 | 200 | + pdf | https://doi.org/10.4067/s0716-27902005020300006 | 2020-02-22 23:56:01.247616+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0716-27902005020300006&lng=en&nrm=iso&tlng=en | 20200214192151 | 200 | + pdf | https://doi.org/10.4067/s0718-23762005000100015 | 2020-02-22 23:53:55.81526+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-23762005000100015&lng=en&nrm=iso&tlng=en | 20200214173237 | 200 | + + Look like web/xml only. + + TODO: XML ingest (and replay?) support. These are as "<article>", not sure if that is JATS or what. 
+ +www.kci.go.kr | no-pdf-link | 6842 +www.m-hikari.com | no-pdf-link | 6763 +cshprotocols.cshlp.org | no-pdf-link | 6553 +www.bibliotekevirtual.org | no-pdf-link | 6309 +data.hpc.imperial.ac.uk | no-pdf-link | 6071 +projecteuclid.org | link-loop | 5970 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'projecteuclid.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%projecteuclid.org%' + AND status = 'link-loop' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + -------------------+-------------------------+------- + projecteuclid.org | link-loop | 5985 + projecteuclid.org | success | 26 + projecteuclid.org | wayback-error | 26 + projecteuclid.org | wrong-mimetype | 17 + projecteuclid.org | spn2-cdx-lookup-failure | 4 + projecteuclid.org | other-mimetype | 4 + projecteuclid.org | no-capture | 3 + projecteuclid.org | terminal-bad-status | 2 + projecteuclid.org | spn2-error:job-failed | 1 + projecteuclid.org | spn-remote-error | 1 + (10 rows) + + Doing a cookie check and redirect. + + TODO: brozzler behavior to "click the link" instead? 
+ +www.scielo.br | no-pdf-link | 5823 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'www.scielo.br' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%www.scielo.br%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + ---------------+-------------------------+------- + www.scielo.br | success | 35150 + www.scielo.br | no-pdf-link | 5839 + www.scielo.br | terminal-bad-status | 429 + www.scielo.br | no-capture | 189 + www.scielo.br | wrong-mimetype | 7 + www.scielo.br | spn2-cdx-lookup-failure | 2 + (6 rows) + + Seems to just be the subset with no PDFs. + +get.iedadata.org | no-pdf-link | 5822 +www.pdcnet.org | no-pdf-link | 5798 +publications.rwth-aachen.de | no-pdf-link | 5323 +www.sciencedomain.org | no-pdf-link | 5231 +medicalforum.ch | terminal-bad-status | 4574 +jrnl.nau.edu.ua | link-loop | 4145 +ojs.academypublisher.com | no-pdf-link | 4017 + +## MAG bulk ingest + +- dialnet.unirioja.es | redirect-loop | 240967 + dialnet.unirioja.es | terminal-bad-status | 20320 + => may be worth re-crawling via heritrix? +- agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 + => and other *.onlinelibrary.wiley.com +- www.researchgate.net | redirect-loop | 42859 +- www.redalyc.org:9081 | no-pdf-link | 10515 +- www.repository.naturalis.nl | redirect-loop | 8213 +- bjp.rcpsych.org | link-loop | 8045 +- journals.tubitak.gov.tr | wrong-mimetype | 7159 +- www.erudit.org | redirect-loop | 6819 +- papers.ssrn.com | redirect-loop | 27328 + => blocking is pretty aggressive, using cookies or referrer or something. + maybe a brozzler behavior would work, but doesn't currently + +## Out of Scope + +Datasets only? 
+ +- plutof.ut.ee +- www.gbif.org +- doi.pangaea.de +- www.plate-archive.org + +Historical non-paper content: + +- dhz.uni-passau.de (newspapers) +- digital.ucd.ie (irish historical) + +Mostly datasets (some PDF content): + +- *.figshare.com +- zenodo.org +- data.mendeley.com diff --git a/notes/possible_ingest_targets.txt b/notes/possible_ingest_targets.txt new file mode 100644 index 0000000..fcdc3e4 --- /dev/null +++ b/notes/possible_ingest_targets.txt @@ -0,0 +1,15 @@ + +- all releases from small journals, regardless of OA status, if small (eg, less than 200 papers published), and not big5 + +more complex crawling/content: +- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url +- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) +- doi.ala.org.au: possible dataset ingest source +- peerj.com, at least reviews, should be HTML ingest? or are some PDF? +- publons.com should be HTML ingest, possibly special case for scope +- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug + +other tasks: +- handle this related withdrawn notice? 
https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 +- push/deploy sandcrawler changes diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md index 52a3264..cd8176e 100644 --- a/notes/tasks/2021-09-09_pdf_url_lists.md +++ b/notes/tasks/2021-09-09_pdf_url_lists.md @@ -64,3 +64,7 @@ ingest_file_result table, pdf, success: 66,487,928 "Parsed web PDFs": `file_meta`, left join CDX (didn't do this one) + +--- + +Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09> diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md index 65e9fe3..5fb69d1 100644 --- a/notes/tasks/2021-12-06_regrobid.md +++ b/notes/tasks/2021-12-06_regrobid.md @@ -191,6 +191,84 @@ And some earlier files of interest on `aitio`: | pv -l \ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +## Ancient Fatcat Files + +Files from an era where we didn't record GROBID version or status, even for +success. + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status_code = 200 + AND grobid.status IS NULL + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json' + WITH NULL ''; + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 107k 0:00:03 [29.9k/s] + + +## Start Re-Processing Old GROBID Versions + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status = 'success' + AND grobid.grobid_version NOT LIKE '0.7.%' + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json' + WITH NULL ''; + +This one is huge, and want to process in batches/chunks of ~8 million at a time. + + cd /srv/sandcrawler/tasks/ + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \ + | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json + +Submit individual batches like: + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Overall progress: + + x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small) + +This finally finished on 2022-04-26. Horray! + ## General Counts How many fatcat files of what mimetype (reported in sandcrawler-db)? 
@@ -287,3 +365,16 @@ What are the GROBID status codes for fatcat files? Narrowed down: error | 200 | 3 (7 rows) +Ran the same query again on 2021-12-15: + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 45092915 + error | 500 | 302373 + | | 250335 + | 200 | 53352 + bad-grobid-xml | 200 | 39 + error-timeout | -4 | 37 + error | 200 | 34 + error | 503 | 2 + (8 rows) diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md new file mode 100644 index 0000000..b5422c2 --- /dev/null +++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md @@ -0,0 +1,23 @@ + +Martin crawled more than 10 million new PDFs from various platform domains. We +should get these processed and included in sandcrawler-db. + +## Select CDX Rows + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM cdx + LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json' + WITH NULL ''; + => COPY 8801527 + + cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + # for pdfextract, would be: sandcrawler-prod.unextracted diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md new file mode 100644 index 0000000..c727a57 --- /dev/null +++ b/notes/tasks/2022-03-07_ukraine_firedrill.md @@ -0,0 +1,225 @@ + +Want to do priority crawling of Ukranian web content, plus Russia and Belarus. + + +## What is Missing? + + (country_code:ua OR lang:uk) + => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA + later in day, already some 22k missing found! 
wow + => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing + +## Metadata Prep + +- container metadata update (no code changes) + x wikidata SPARQL update + x chocula run + x journal metadata update (fatcat) + x update journal stats (fatcat extra) +- DOAJ article metadata import + x prep and upload single JSON file + + +## Journal Homepage URL Crawl + +x dump ukraine-related journal homepages from chocula DB +x create crawl config +x start crawl +x repeat for belarus and russia + + + python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv + wc -l homepage_urls.2022-03-08.ua_tld.tsv + 1550 homepage_urls.2022-03-08.ua_tld.tsv + + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv + +sqlite3: + + select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publi + 1952 + + SELECT COUNT(*) FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ua' + OR journal.lang = 'uk' + OR journal.name like '%ukrain%' + OR journal.publisher like '%ukrain%'; + => 1970 + + .mode csv + .once homepage_urls_ukraine.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ua' + OR journal.lang = 'uk' + OR journal.name like '%ukrain%' + OR journal.publisher like '%ukrain%'; + + .mode csv + .once homepage_urls_russia.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ru' + OR journal.lang = 'ru' + OR journal.name like '%russ%' + OR journal.publisher like '%russ%'; + + .mode csv + .once homepage_urls_belarus.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = 
journal.issnl + WHERE + journal.country = 'by' + OR journal.lang = 'be' + OR journal.name like '%belarus%' + OR journal.publisher like '%belarus%'; + + cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv + + wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv + 1550 homepage_urls.2022-03-08.ua_tld.tsv + 1971 homepage_urls_ukraine.tsv + 3482 homepage_urls_ukraine_combined.2022-03-08.tsv + + cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv + + wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv + 3728 homepage_urls_russia.tsv + 2420 homepage_urls.2022-03-08.ru_tld.tsv + 6030 homepage_urls_russia_combined.2022-03-08.tsv + + + cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv + + wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv + 138 homepage_urls_belarus.tsv + 85 homepage_urls.2022-03-08.by_tld.tsv + 222 homepage_urls_belarus_combined.2022-03-08.tsv + + +## Landing Page Crawl + +x create crawl config +x fatcat ingest query for related URLs + => special request code/label? +x finish .by and .ru article URL dump, start crawling +x URL list filtered from new OAI-PMH feed + => do we need to do full bulk load/dump, or not? +- URL list from partner (google) +- do we need to do alternative thing of iterating over containers, ingesting each? 
+ + ./fatcat_ingest.py --env prod \ + --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ua OR lang:uk" + + # around Tue 08 Mar 2022 01:07:37 PM PST + # Expecting 185659 release objects in search queries + # didn't complete successfully? hrm + + # ok, retry "manually" (with kafkacat) + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ua OR lang:uk" \ + | pv -l \ + | gzip \ + > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json + # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318}) + # 103k 0:25:04 [68.7 /s] + + zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz + # 103k 0:00:02 [38.1k/s] + + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:by OR lang:be" \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz + # Expecting 2266 release objects in search queries + # 1.29k 0:00:34 [37.5 /s] + + zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz + + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ru OR lang:ru" \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz + # Expecting 1515246 release objects in search queries + + zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz + + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt + # 309k 0:00:03 [81.0k/s] + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt + # 71.2k 0:00:03 [19.0k/s] + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt + # 276k 0:00:03 [72.9k/s] + + +### Landing Page Bulk Ingest + +Running these 2022-03-24, after targeted crawl completed: + + zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 103k 0:00:02 [36.1k/s] + + zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 1.29k 0:00:00 [15.8k/s] + + zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 546k 0:00:13 [40.6k/s] + +It will probably take a week or more for these to complete. + + +## Outreach + +- openalex +- sucho.org +- ceeol.com diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md new file mode 100644 index 0000000..273ff32 --- /dev/null +++ b/notes/tasks/2022-04-27_pdf_url_lists.md @@ -0,0 +1,72 @@ + +Another dump of PDF URLs for partners. This time want to provide TSV with full +wayback download URLs, as well as "access" URLs. + + export TASKDATE=2022-04-27 + +## "Ingested", AKA, "Targetted" PDF URLs + +These are URLs where we did a successful ingest run. + + COPY ( + SELECT + terminal_sha1hex as pdf_sha1hex, + ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url, + ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url + FROM ingest_file_result + WHERE + ingest_type = 'pdf' + AND status = 'success' + AND hit = true + ORDER BY terminal_sha1hex ASC + -- LIMIT 10; + ) + TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv' + WITH NULL ''; + => COPY 85712674 + +May contain duplicates, both by sha1hex, URL, or both. + +Note that this could be filtered by timestamp, to make it monthly/annual. 
+ + +## All CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require a file_meta match + + COPY ( + SELECT + cdx.sha1hex as pdf_sha1hex, + ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url, + ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url + FROM cdx + LEFT JOIN file_meta + ON + cdx.sha1hex = file_meta.sha1hex + WHERE + file_meta.mimetype = 'application/pdf' + OR ( + file_meta.mimetype IS NULL + AND cdx.mimetype = 'application/pdf' + ) + ORDER BY cdx.sha1hex ASC + -- LIMIT 10; + ) + TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv' + WITH NULL ''; + => COPY 161504070 + +Should be unique by wayback URL; may contain near-duplicates or duplicates by sha1hex + +## Upload to archive.org + +TODO: next time compress these files first (gzip/pigz) + +ia upload ia_scholarly_urls_$TASKDATE \ + -m collection:ia_biblio_metadata \ + -m title:"IA Scholarly URLs ($TASKDATE)" \ + -m date:$TASKDATE \ + -m creator:"Internet Archive Web Group" \ + -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \ + /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md new file mode 100644 index 0000000..74d3857 --- /dev/null +++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md @@ -0,0 +1,132 @@ + +Had a huge number of SPN requests for the andrzejklimczuk.com domain, +presumably from the author. + +Many were duplicates (same file, multiple releases, often things like zenodo +duplication). Many were also GROBID 500s, due to truncated common crawl +captures. + +Needed to clean up! 
Basically sorted through a few editgroups manually, then +rejected all the rest and manually re-submitted with the below queries and +commands: + + SELECT COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'; + => 589 + + SELECT ingest_file_result.status, COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + GROUP BY ingest_file_result.status; + + status | count + ----------------+------- + cdx-error | 1 + success | 587 + wrong-mimetype | 1 + (3 rows) + + + SELECT grobid.status_code, COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + GROUP BY grobid.status_code; + + status_code | count + -------------+------- + 200 | 385 + 500 | 202 + | 2 + (3 rows) + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON 
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + AND ingest_file_result.status = 'success' + AND grobid.status_code = 500 + ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json'; + => COPY 202 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + AND ingest_file_result.status = 'success' + AND grobid.status_code = 200 + ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json'; + => COPY 385 + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \ + > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ + | jq '. + {force_recrawl: true}' -c \ + > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ + | shuf \ + | head -n100 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ + | shuf \ + | head -n10000 \ + | jq . 
-c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ + > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 |