Diffstat (limited to 'notes')
-rw-r--r-- | notes/dryad_datasets.md | 17
-rw-r--r-- | notes/examples/2021-11-12_broken_grobid_xml.md | 83
-rw-r--r-- | notes/examples/dataset_examples.txt | 52
-rw-r--r-- | notes/examples/html_test_journals.txt | 153
-rw-r--r-- | notes/examples/random_datasets.md | 19
-rw-r--r-- | notes/ingest/2021-09-02_oai_pmh_patch.md | 4
-rw-r--r-- | notes/ingest/2022-03_oaipmh.md | 40
-rw-r--r-- | notes/ingest/2022-07-19_dblp.md | 50
-rw-r--r-- | notes/ingest/2022-07_doaj.md | 199
-rw-r--r-- | notes/ingest/2022-07_targeted.md | 140
-rw-r--r-- | notes/ingest/2022-09_oaipmh.md | 397
-rw-r--r-- | notes/ingest_domains.txt | 294
-rw-r--r-- | notes/possible_ingest_targets.txt | 15
-rw-r--r-- | notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md | 132
14 files changed, 1593 insertions, 2 deletions
diff --git a/notes/dryad_datasets.md b/notes/dryad_datasets.md new file mode 100644 index 0000000..5c727b1 --- /dev/null +++ b/notes/dryad_datasets.md @@ -0,0 +1,17 @@ + +api docs: https://datadryad.org/api/v2/docs + +current search queries return 38,000 hits (December 2020) + +example with multiple versions: + https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0 + https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0 + https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions + + +how to handle versions? DOI doesn't get incremented. + +on archive.org, could have separate item for each version, or sub-directories within item, one for each version + +in fatcat, could have a release for each version, but only one with +the DOI; or could have a separate fileset for each version diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md new file mode 100644 index 0000000..5223651 --- /dev/null +++ b/notes/examples/2021-11-12_broken_grobid_xml.md @@ -0,0 +1,83 @@ + +Find all the PDFs from the web which resulted in `bad-grobid-xml` status (among others): + + sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100; + + sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata + ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------ + d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"} + 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"} + 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf + FIXED + f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"} + https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf + FIXED + c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"} + https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf + FIXED (and good) + 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"} + https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf + FIXED + metadata quality mixed, but complex document (?)
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"} + https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23 + FIXED + 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"} + https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf + FIXED + 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"} + not found + acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"} + https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf + BROKEN: not well-formed (invalid token): line 60, column 45 + <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note> + 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"} + not found + c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"} + https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308 + BROKEN: not well-formed (invalid token): line 58, column 45 + <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, & Bian, 2020).</note> + 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 1824, column 45"} + not found + 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"} + not found + f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"} + https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649 + FIXED, good + f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf + FIXED + 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"} + https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf + FIXED + 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1 + FIXED + (21 rows) + +Some other errors from other queries: + + d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | 
{"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"} + https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf + FIXED: with 0.7.0+ + + 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"} + https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf + still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500 + BAD PDF ("no pages" in evince) + + d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"} + https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf + FIXED + + 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t + https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf + FIXED + +In summary, there are still a small number of `bad-grobid-xml` cases, and still +many "very large PDF" cases. But we should probably broadly retry everything, +especially the 503 errors (from when GROBID is simply down/unavailable). + +The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations, +which I have submitted a patch/PR for. diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt new file mode 100644 index 0000000..3a04750 --- /dev/null +++ b/notes/examples/dataset_examples.txt @@ -0,0 +1,52 @@ + +### ArchiveOrg: CAT dataset + +<https://archive.org/details/CAT_DATASET> + +`release_36vy7s5gtba67fmyxlmijpsaui` + +### + +<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635> + +doi:10.1371/journal.pone.0120448 + +Single .rar file + +### Dataverse + +<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B> + +Single excel file + +### Dataverse + +<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1> + +doi:10.7910/DVN/CLSFKX + +Mulitple files; multiple versions? + +API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1> + + .data.id + .data.latestVersion.datasetPersistentId + .data.latestVersion.versionNumber, .versionMinorNumber + .data.latestVersion.files[] + .dataFile + .contentType (mimetype) + .filename + .filesize (int, bytes) + .md5 + .persistendId + .description + .label (filename?) 
+ .version + +Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> + +Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3) + +Dataverse refs: +- 'doi' and 'hdl' are the two persistentId styles +- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt new file mode 100644 index 0000000..540dc9f --- /dev/null +++ b/notes/examples/html_test_journals.txt @@ -0,0 +1,153 @@ + +Good examples of journals to run HTML fulltext extraction on. + +## Live Web + +d-lib magazine + live web + no longer active + http://www.dlib.org/back.html + +NLM technical bulletin + https://www.nlm.nih.gov/pubs/techbull/back_issues.html + +Genders + https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html + +firstmondays + live web; now OJS + +outhistory.org + +http://journal.sjdm.org/ + +http://whoosh.org/ + + +## Vanished (but wayback coverage) + +ohmylittledata + issn:2551-1289 + vanished + blog format + http://web.archive.org/web/20180421061156/https://ohmylittledata.com/ + +exquisit corpse + https://web.archive.org/web/20080521052400/http://corpse.org:80/ + +Journal of Mundane Behavior + https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya + ISSN: 1529-3041 + + defunct since ~2010 + simple HTML articles + references + http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm + http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm + +War Crimes + + PDF articles (not HTML) + http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/ + + +## DOAJ Test Articles (HTML) + + zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt + => 2,184,954 + + cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25 + 254817 link.springer.com + 145159 www.scielo.br + 78044 journal.frontiersin.org + 77394 www.frontiersin.org + 40849 www.dovepress.com + 19024 dergipark.org.tr + 18758 periodicos.ufsc.br + 16346 www.revistas.usp.br + 15872 revistas.unal.edu.co + 15527 revistas.ucm.es + 13669 revistas.usal.es + 12640 dergipark.gov.tr + 12111 journals.rudn.ru + 11839 www.scielosp.org + 11277 www.karger.com + 10827 www.journals.vu.lt + 10318 + 9854 peerj.com + 9100 ojs.unud.ac.id + 8581 jurnal.ugm.ac.id + 8261 riviste.unimi.it + 8012 journals.uran.ua + 7454 revistas.pucp.edu.pe + 7264 journals.vgtu.lt + 7200 publicaciones.banrepcultural.org + + cat html_fulltext_urls.txt \ + | rg -v link.springer.com \ + | rg -v scielo \ + | rg -v dergipark.gov.tr \ + | rg -v frontiersin.org \ + > html_fulltext_urls.filtered.txt + => 1,579,257 + + zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt + => 560k + + cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25 + 40849 www.dovepress.com + 10570 journals.rudn.ru + 10494 dergipark.org.tr + 10233 revistas.unal.edu.co + 9981 dergipark.gov.tr + 9428 revistas.usal.es + 8292 revistas.ucm.es + 7200 publicaciones.banrepcultural.org + 6953 revistas.pucp.edu.pe + 6000 www.scielosp.org + 5962 
www.scielo.br + 5621 www.richtmann.org + 5123 scielo.sld.cu + 5067 ojs.unud.ac.id + 4838 periodicos.ufsc.br + 4736 revistasonlinepre.inap.es + 4486 journal.fi + 4221 www.seer.ufu.br + 3553 revistas.uam.es + 3492 revistas.pucsp.br + 3060 www.scielo.org.co + 2991 scielo.isciii.es + 2802 seer.ufrgs.br + 2692 revistas.unc.edu.ar + 2685 srl.si + + cat html_fulltext_urls.no_doi.txt \ + | rg -v link.springer.com \ + | rg -v scielo \ + | rg -v dergipark.gov.tr \ + | rg -v frontiersin.org \ + > html_fulltext_urls.no_doi.filtered.txt + => 518,608 + + zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20 + https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795 + https://journal.umy.ac.id/index.php/st/article/view/3297 + https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442 + http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf + http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440 + https://journal.fi/inf/article/view/59430 + http://journal.uii.ac.id/index.php/Eksakta/article/view/2429 + https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS + https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157 + http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce + http://revistas.pucp.edu.pe/index.php/themis/article/view/11862 + http://journal.bdfish.org/index.php/fisheries/article/view/91 + https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567 + https://www.lithosphere.ru/jour/article/view/779 + https://journals.hioa.no/index.php/seminar/article/view/2412 + http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197 + https://www.kmuj.kmu.edu.pk/article/view/15698 + http://forodeeducacion.com/ojs/index.php/fde/article/view/82 + https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941 + http://grbs.library.duke.edu/article/view/3361 + diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md new file mode 100644 index 0000000..b69132c --- /dev/null +++ b/notes/examples/random_datasets.md @@ -0,0 +1,19 @@ + +Possible external datasets to ingest (which are not entire platforms): + +- https://research.google/tools/datasets/ +- https://openslr.org/index.html +- https://www.kaggle.com/datasets?sort=votes&tasks=true +- https://archive.ics.uci.edu/ml/datasets.php + +Existing archive.org datasets to ingest: + +- https://archive.org/details/allthemusicllc-datasets + +Papers on archive.org to ingest: + +- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=> +- <https://archive.org/details/biorxiv> +- <https://archive.org/details/philosophicaltransactions?tab=collection> +- <https://archive.org/search.php?query=doi%3A%2A> +- <https://archive.org/details/folkscanomy_academic> diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md index fded7b3..ac808dd 100644 --- a/notes/ingest/2021-09-02_oai_pmh_patch.md +++ b/notes/ingest/2021-09-02_oai_pmh_patch.md @@ -1506,8 +1506,8 @@ possible to detect these at ingest time, or earlier at OAI-PMH harvest/transform time and filter them out. It may be worthwhile to attempt ingest of multiple existing captures -(timestamps) in the ingest pipeline. 
Eg, isntead of chosing a single "best" -capture, if therea are multiple HTTP 200 status captures, try ingest with each +(timestamps) in the ingest pipeline. Eg, instead of chosing a single "best" +capture, if there are multiple HTTP 200 status captures, try ingest with each (or at least a couple). This is because repository software gets upgraded, so old "no-capture" or "not found" or "link loop" type captures may work when recrawled. diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md new file mode 100644 index 0000000..d2a8d71 --- /dev/null +++ b/notes/ingest/2022-03_oaipmh.md @@ -0,0 +1,40 @@ + +Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl. + +Note that Martin excluded many Indonesian endpoints, will need to follow-up on +those. + +## Prep + +Fetch metadata snapshot: + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst + +Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large): + + zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \ + | rg -v 'oai:kb.dk:' \ + | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \ + | rg -v 'oai:hispana.mcu.es:' \ + | rg -v 'oai:bnf.fr:' \ + | rg -v 'oai:ukm.si:' \ + | rg -v 'oai:biodiversitylibrary.org:' \ + | rg -v 'oai:hsp.org:' \ + | rg -v 'oai:repec:' \ + | rg -v 'oai:n/a:' \ + | rg -v 'oai:quod.lib.umich.edu:' \ + | rg -v 'oai:americanae.aecid.es:' \ + | rg -v 'oai:www.irgrid.ac.cn:' \ + | rg -v 'oai:espace.library.uq.edu:' \ + | rg -v 'oai:edoc.mpg.de:' \ + | rg -v 'oai:bibliotecadigital.jcyl.es:' \ + | rg -v 'oai:repository.erciyes.edu.tr:' \ + | rg -v 'oai:krm.or.kr:' \ + | ./scripts/oai2ingestrequest.py - \ + | pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz + +These failed to transform in the expected way; a change in JSON schema from last time? diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md new file mode 100644 index 0000000..74aeb8d --- /dev/null +++ b/notes/ingest/2022-07-19_dblp.md @@ -0,0 +1,50 @@ + +Cross-posting from fatcat bulk metadata update/ingest. + + zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 631k 0:00:11 [54.0k/s] + + +## Post-Crawl Stats + +This is after bulk ingest, crawl, and a bit of "live" re-ingest. 
Query run +2022-09-06: + + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'dblp' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-----------------------+-------- + pdf | success | 305142 + pdf | no-pdf-link | 192683 + pdf | no-capture | 42634 + pdf | terminal-bad-status | 38041 + pdf | skip-url-blocklist | 31055 + pdf | link-loop | 9263 + pdf | wrong-mimetype | 4545 + pdf | redirect-loop | 3952 + pdf | empty-blob | 2705 + pdf | wayback-content-error | 834 + pdf | wayback-error | 294 + pdf | petabox-error | 202 + pdf | blocked-cookie | 155 + pdf | cdx-error | 115 + pdf | body-too-large | 66 + pdf | bad-redirect | 19 + pdf | timeout | 7 + pdf | bad-gzip-encoding | 4 + (18 rows) + +That is quite a lot of `no-pdf-link`, might be worth doing a random sample +and/or re-ingest. And a chunk of `no-capture` to retry. diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md new file mode 100644 index 0000000..7e55633 --- /dev/null +++ b/notes/ingest/2022-07_doaj.md @@ -0,0 +1,199 @@ + +This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for +heritrix bulk crawling, along with JALC and DOAJ URLs. + + export SNAPSHOT=2022-07-20 + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz" + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz + # 9.72M 0:36:28 [4.44k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # 9.72M 0:17:04 [9.49k/s] + # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097}) + +Stats after this load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3165539 + pdf | | 2078874 + html | | 1547698 + html | wrong-scope | 1114332 + pdf | no-pdf-link | 517261 + html | success | 388376 + html | unknown-scope | 242044 + pdf | no-capture | 179030 + pdf | terminal-bad-status | 174741 + html | no-capture | 155323 + pdf | null-body | 129267 + pdf | redirect-loop | 127136 + html | html-resource-no-capture | 117275 + html | null-body | 100296 + pdf | blocked-cookie | 71093 + html | redirect-loop | 65519 + html | terminal-bad-status | 64856 + html | blocked-cookie | 64095 + html | spn2-backoff | 55173 + pdf | link-loop | 27440 + html | wrong-mimetype | 26016 + html | 
wayback-content-error | 20109 + xml | | 13624 + pdf | wrong-mimetype | 8411 + xml | success | 6899 + html | petabox-error | 6199 + html | wayback-error | 5269 + html | spn2-cdx-lookup-failure | 4635 + html | spn2-recent-capture | 4527 + xml | null-body | 2353 + (30 rows) + +## Bulk Ingest + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json'; + # COPY 3962331 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json + # 3.96M 0:01:47 [36.7k/s] + +Top domains: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 789988 www.mdpi.com + 318142 www.frontiersin.org + 226316 link.springer.com + 204429 www.scielo.br + 201175 www.sciencedirect.com + 72852 ieeexplore.ieee.org + 68983 dx.doi.org + 33286 www.dovepress.com + 26020 elifesciences.org + 23838 www.cetjournal.it + 21102 mab-online.nl + 20242 www.revistas.usp.br + 16564 periodicos.uem.br + 15710 journals.openedition.org + 14514 dergipark.org.tr + 14072 apcz.umk.pl + 13924 ojs.minions.amsterdam + 13717 bmgn-lchr.nl + 13512 ojstest.minions.amsterdam + 10440 journals.asm.org + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # Done + +## Stats Again + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 4704006 + html | wrong-scope | 1761227 + html | success | 778165 + pdf | no-pdf-link | 759805 + html | no-capture | 382080 + html | unknown-scope | 313391 + html | html-resource-no-capture | 292953 + pdf | no-capture | 290311 + pdf | terminal-bad-status | 271776 + pdf | null-body | 129267 + pdf | blocked-cookie | 108491 + html | terminal-bad-status | 103014 + html | null-body | 100296 + html | blocked-cookie | 88533 + pdf | | 81517 + pdf | skip-url-blocklist | 76443 + html | spn2-backoff | 50615 + pdf | link-loop | 45516 + html | wrong-mimetype | 33525 + html | wayback-content-error | 25535 + pdf | empty-blob | 21431 + pdf | redirect-loop | 19795 + html | petabox-error | 18291 + html | empty-blob | 14391 + pdf | wrong-mimetype | 14084 + html | redirect-loop | 12856 + xml | success | 10381 + xml | no-capture | 10008 + html | skip-url-blocklist | 3294 + html | cdx-error | 3275 + (30 rows) + +Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k +PDFs with no attempt at all? Maybe a filter, or bogus URLs. + +Over 1.5M new PDF success over this crawl iteration period, nice. diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md new file mode 100644 index 0000000..415f23b --- /dev/null +++ b/notes/ingest/2022-07_targeted.md @@ -0,0 +1,140 @@ + +Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs. 
+ + export PATCHDATE=2022-07-29 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07 + +## Seedlist Query + +Terminal URLs dump: + + COPY ( + SELECT row_to_json(t) FROM ( + SELECT ingest_file_result.terminal_url, ingest_request.* + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + -- AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status LIKE 'spn2-%' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + OR ingest_file_result.terminal_status_code = 429 + ) + ) + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'dblp' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'pmc' + -- OR ingest_request.link_source = 'unpaywall' + -- OR ingest_request.link_source = 'oai' + ) + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) t + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json'; + => COPY 3524573 + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \ + | rg -v "\\\\" \ + | jq -r .terminal_url \ + | rg '://' \ + | rg -i '^http' \ + | rg -v '://10\.' \ + | rg -v '://172\.' 
\ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt + => 3.11M 0:01:08 [45.4k/s] + + # check top domains + cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25 + 624948 doi.org + 382492 www.jstage.jst.go.jp + 275087 www.mdpi.com + 157134 www.persee.fr + 108979 www.sciencedirect.com + 94375 www.scielo.br + 50834 onlinelibrary.wiley.com + 49991 journals.lww.com + 30354 www.frontiersin.org + 27963 doaj.org + 27058 www.e-periodica.ch + 24147 dl.acm.org + 23389 aclanthology.org + 22086 www.research-collection.ethz.ch + 21589 medien.die-bonn.de + 18866 www.ingentaconnect.com + 18583 doi.nrct.go.th + 18271 repositories.lib.utexas.edu + 17634 hdl.handle.net + 16366 archives.datapages.com + 15146 cgscholar.com + 13987 dl.gi.de + 13188 www.degruyter.com + 12503 ethos.bl.uk + 12304 preprints.jmir.org + + cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule + => done + + scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + + +## Re-Ingest + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json + => 3.52M 0:01:37 [36.2k/s] + +Ingest: + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md new file mode 100644 index 0000000..ac7c68f --- /dev/null +++ b/notes/ingest/2022-09_oaipmh.md @@ -0,0 +1,397 @@ + +Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921> + +I updated the transform script to block some additional domains. 
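+
+(A minimal sketch of the kind of domain filter involved, with a hypothetical
+blocklist; this is not the actual `oai2ingestrequest.py` internals:)
+
+    from urllib.parse import urlparse
+
+    # hypothetical blocklist; the real script's list is longer and different
+    DOMAIN_BLOCKLIST = ["kb.dk", "hispana.mcu.es", "bdr.oai.bsb-muenchen.de"]
+
+    def url_is_blocked(url: str) -> bool:
+        # match the hostname exactly, or as the parent domain of a subdomain
+        host = (urlparse(url).hostname or "").lower()
+        return any(host == d or host.endswith("." + d) for d in DOMAIN_BLOCKLIST)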
+ + +## Prep + +Fetch the snapshot: + + cd /srv/sandcrawler/tasks/ + wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst + +Transform to ingest requests: + + cd /srv/sandcrawler/src/python + git log | head -n1 + # commit dfd4605d84712eccb95a63e50b0bcb343642b433 + + pipenv shell + zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \ + | ./scripts/oai2ingestrequest.py - \ + | pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz + # 16.1M 1:01:02 [4.38k/s] + +Curious about types, though this would probably be handled at fatcat ingest +time: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt + + head oai_type_counts.txt -n30 + 5623867 info:eu-repo/semantics/article + 5334928 info:eu-repo/semantics/publishedVersion + 3870359 text + 1240225 Text + 829169 Article + 769849 NonPeerReviewed + 665700 PeerReviewed + 648740 Peer-reviewed Article + 547857 article + 482906 info:eu-repo/semantics/bachelorThesis + 353814 Thesis + 329269 Student thesis + 262650 info:eu-repo/semantics/conferenceObject + 185354 Journal articles + 162021 info:eu-repo/semantics/doctoralThesis + 152079 Journal Article + 150226 Research Article + 130217 Conference papers + 127255 ArtÃculo revisado por pares + 124243 Newspaper + 123908 ##rt.metadata.pkp.peerReviewed## + 123309 Photograph + 122981 info:eu-repo/semantics/masterThesis + 116719 Book + 108946 Image + 108216 Report + 107946 Other + 103562 masterThesis + 103038 info:eu-repo/semantics/other + 101404 StillImage + [...] + +And formats: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt + + head -n 20 oai_format_counts.txt + 11151928 application/pdf + 677413 text + 561656 text/html + 498518 image/jpeg + 231219 Text + 193638 text/xml + 147214 Image + 117073 image/jpg + 110872 pdf + 91323 image/tiff + 76948 bib + 75393 application/xml + 70244 Digitized from 35 mm. microfilm. + 68206 mods + 59227 PDF + 57677 application/epub+zip + 57602 application/octet-stream + 52072 text/plain + 51620 application/msword + 47227 audio/mpeg + +Also, just overall size (number of records): + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l + # 20,840,301 + +Next load in to sandcrawler DB: + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request - + + Traceback (most recent call last): + File "./persist_tool.py", line 311, in <module> + main() + File "./persist_tool.py", line 307, in main + args.func(args) + File "./persist_tool.py", line 119, in run_ingest_request + pusher.run() + File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run + self.worker.push_batch(batch) + File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch + resp = self.db.insert_ingest_request(self.cur, irequests) + File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request + resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) + File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values + cur.execute(b''.join(parts)) + psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx" + DETAIL: Index row references tuple (6893121,3) in relation "ingest_request". 
+ HINT: Values larger than 1/3 of a buffer page cannot be indexed. + Consider a function index of an MD5 hash of the value, or use full text indexing. + 15.7M 0:41:48 [6.27k/s] + +Darn, this means we won't get reasonable stats about how many rows were +inserted/updated. + +Patched the persist tool to skip very long URLs, and ran again (backwards, just +URLs which didn't get inserted already): + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \ + | tac \ + | head -n1000000 \ + | pv -l \ + | ./persist_tool.py ingest-request - + # 1.00M 0:03:04 [5.41k/s] + # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0}) + +Status of just the new lines: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+--------- + | 6398455 + success | 540219 + no-pdf-link | 41316 + link-loop | 23871 + no-capture | 11350 + redirect-loop | 8315 + wrong-mimetype | 2394 + terminal-bad-status | 1540 + null-body | 1038 + cdx-error | 272 + empty-blob | 237 + petabox-error | 213 + wayback-error | 186 + blocked-cookie | 107 + timeout | 47 + wayback-content-error | 26 + spn2-cdx-lookup-failure | 21 + skip-url-blocklist | 16 + spn2-backoff | 15 + body-too-large | 13 + (20 rows) + + +## Bulk Ingest + +Should already have filtered domains/prefixes in transform script, so not +including filters here. + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ingest_file_result.status IS NULL + ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json'; + # COPY 6398455 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json + # 6.40M 0:02:18 [46.2k/s] + + cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # DONE + +Expect this ingest to take a week or so. 
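+
+(For reference, the "skip very long URLs" patch mentioned above boils down to a
+length check before insert; a rough sketch, with the cutoff assumed from the
+btree error rather than copied from the actual patch:)
+
+    from collections import Counter
+
+    MAX_URL_LEN = 2048  # assumed cutoff; btree index rows max out near 2704 bytes
+    counts = Counter()
+
+    def keep_request(req: dict) -> bool:
+        # drop rows whose base_url would overflow ingest_request_base_url_idx,
+        # counting them like the "skip-url-too-long" tally in the output above
+        if len(req.get("base_url", "")) > MAX_URL_LEN:
+            counts["skip-url-too-long"] += 1
+            return False
+        return True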
+ +Then, run stats again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 3617175 + success | 2775036 + no-pdf-link | 449298 + link-loop | 74260 + terminal-bad-status | 47819 + wrong-mimetype | 20195 + redirect-loop | 18197 + empty-blob | 12127 + cdx-error | 3038 + skip-url-blocklist | 2630 + wayback-error | 2599 + petabox-error | 2354 + wayback-content-error | 1617 + blocked-cookie | 1293 + null-body | 1038 + body-too-large | 670 + | 143 + bad-gzip-encoding | 64 + timeout | 47 + spn2-cdx-lookup-failure | 20 + (20 rows) + + +## Crawl Seedlist + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'terminal-bad-status' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'timeout' + OR ingest_file_result.status = 'wayback-content-error' + ) + ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json'; + => COPY 3692846 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json + => 3.69M 0:01:19 [46.6k/s] + +This will be used for re-ingest later. For now, extract URLs: + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | jq .base_url -r \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + => 3.66M 0:00:59 [61.8k/s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | rg '"terminal_url"' \ + | jq -r .result.terminal_url \ + | rg -v ^null$ \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + => 0.00 0:00:05 [0.00 /s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | awk '{print "F+ " $1}' \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + +What domains are we crawling? 
+ + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | sort -u -S 4G \ + | cut -d/ -f3 \ + | sort \ + | uniq -c \ + | sort -nr \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + + head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + 91899 raco.cat + 70116 islandora.wrlc.org + 68708 urn.kb.se + 63726 citeseerx.ist.psu.edu + 50370 publications.rwth-aachen.de + 44885 urn.nsk.hr + 38429 server15795.contentdm.oclc.org + 33041 periodicos.ufpb.br + 32519 nbn-resolving.org + 31990 www.ajol.info + 24745 hal.archives-ouvertes.fr + 22569 id.nii.ac.jp + 17239 tilburguniversity.on.worldcat.org + 15873 dspace.nbuv.gov.ua + 15436 digitalcommons.wustl.edu + 14885 www.iiste.org + 14623 www.manchester.ac.uk + 14033 nbn-resolving.de + 13999 opus4.kobv.de + 13689 www.redalyc.org + +Sizes: + + wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + +Copy seedlist to crawler: + + # as regular user + scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp + +## Post-Crawl Bulk Ingest + + # ran 2022-11-16, after crawl cleanup + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -----------------------+--------- + success | 4721164 +1,946,128 + no-pdf-link | 1116290 + no-capture | 673939 + terminal-bad-status | 232217 + link-loop | 148544 + wrong-mimetype | 68841 + redirect-loop | 26262 + empty-blob | 17759 + cdx-error | 6570 + blocked-cookie | 4026 + blocked-wall | 3054 + skip-url-blocklist | 2924 + body-too-large | 2404 + bad-redirect | 1565 + wayback-error | 1320 + petabox-error | 1083 + null-body | 1038 + wayback-content-error | 264 + bad-gzip-encoding | 150 + | 143 + (20 rows) + diff --git a/notes/ingest_domains.txt b/notes/ingest_domains.txt new file mode 100644 index 0000000..ae06272 --- /dev/null +++ b/notes/ingest_domains.txt @@ -0,0 +1,294 @@ + +## Queries to find broken domains + +Top domains with failed ingests: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + +Status overview for a particular domain: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT domain, terminal_status_code, 
COUNT((domain, terminal_status_code)) + FROM (SELECT terminal_status_code, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + AND t1.terminal_status_code is not null + GROUP BY domain, terminal_status_code + ORDER BY COUNT DESC; + +Sample recent failures: + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%osapublishing.org%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + + +## Failing + +www.osapublishing.org + + this publisher (The Optical Society) is systemically using a CAPTCHA to + gate access to PDFs. bummer! could ask them to white-list? + + has citation_pdf_url, so that isn't an issue + + status: "no-pdf-link" + hops: + "https://doi.org/10.1364/optica.6.000798", + "https://www.osapublishing.org/viewmedia.cfm?uri=optica-6-6-798&seq=0" + "https://www.osapublishing.org/captcha/?guid=830CEAB5-09BD-6140-EABD-751200C78B1C" + + domain | status | count + -----------------------+---------------------+------- + www.osapublishing.org | no-capture | 16680 + www.osapublishing.org | no-pdf-link | 373 + www.osapublishing.org | redirect-loop | 19 + www.osapublishing.org | terminal-bad-status | 5 + www.osapublishing.org | cdx-error | 1 + www.osapublishing.org | wrong-mimetype | 1 + www.osapublishing.org | spn-error | 1 + www.osapublishing.org | success | 1 + www.osapublishing.org | wayback-error | 1 + (9 rows) + +www.persee.fr + + Seems to be mostly blocking or rate-limiting? + + domain | status | count + ---------------+-------------------------------------+------- + www.persee.fr | no-capture | 37862 + www.persee.fr | terminal-bad-status | 3134 + www.persee.fr | gateway-timeout | 2828 + www.persee.fr | no-pdf-link | 431 + www.persee.fr | spn-error | 75 + www.persee.fr | redirect-loop | 23 + www.persee.fr | success | 8 + www.persee.fr | spn2-error | 2 + www.persee.fr | spn2-error:soft-time-limit-exceeded | 1 + www.persee.fr | wrong-mimetype | 1 + (10 rows) + +journals.openedition.org + + PDF access is via "freemium" subscription. Get redirects to: + + https://auth.openedition.org/authorized_ip?url=http%3A%2F%2Fjournals.openedition.org%2Fnuevomundo%2Fpdf%2F61053 + + Content is technically open access (HTML and license; for all content?), + but can't be crawled as PDF without subscription. 
+ + domain | status | count + --------------------------+-------------------------+------- + journals.openedition.org | redirect-loop | 29587 + journals.openedition.org | success | 6821 + journals.openedition.org | no-pdf-link | 1507 + journals.openedition.org | no-capture | 412 + journals.openedition.org | wayback-error | 32 + journals.openedition.org | wrong-mimetype | 27 + journals.openedition.org | terminal-bad-status | 13 + journals.openedition.org | spn2-cdx-lookup-failure | 4 + journals.openedition.org | spn-remote-error | 1 + journals.openedition.org | null-body | 1 + journals.openedition.org | cdx-error | 1 + (11 rows) + +journals.lww.com + + no-pdf-link + + domain | status | count + ------------------+----------------+------- + journals.lww.com | no-pdf-link | 11668 + journals.lww.com | wrong-mimetype | 131 + (2 rows) + + doi prefix: 10.1097 + + <meta name="wkhealth_pdf_url" content="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" /> + data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw==" + + Some weird thing going on, maybe they are blocking-via-redirect based on + our User-Agent? Seems like wget works, so funny that they don't block that. + +musewide.aip.de + + no-pdf-link + +koreascience.or.kr | no-pdf-link | 8867 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%osapublishing.org%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + +www.cairn.info | link-loop | 8717 + +easy.dans.knaw.nl | no-pdf-link | 8262 +scielo.conicyt.cl | no-pdf-link | 7925 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'scielo.conicyt.cl' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%scielo.conicyt.cl%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + + + domain | status | count + -------------------+---------------------+------- + scielo.conicyt.cl | no-pdf-link | 7926 + scielo.conicyt.cl | success | 4972 + scielo.conicyt.cl | terminal-bad-status | 1474 + scielo.conicyt.cl | wrong-mimetype | 6 + scielo.conicyt.cl | no-capture | 4 + scielo.conicyt.cl | null-body | 1 + + + pdf | https://doi.org/10.4067/s0370-41061980000300002 | 2020-02-22 23:55:56.235822+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0370-41061980000300002&lng=en&nrm=iso&tlng=en | 20200212201727 | 200 | + pdf | https://doi.org/10.4067/s0718-221x2019005000201 | 2020-02-22 23:01:49.070104+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-221X2019005000201&lng=en&nrm=iso&tlng=en | 20200214105308 | 200 | + pdf 
| https://doi.org/10.4067/s0717-75262011000200002 | 2020-02-22 22:49:36.429717+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-75262011000200002&lng=en&nrm=iso&tlng=en | 20200211205804 | 200 | + pdf | https://doi.org/10.4067/s0717-95022006000400029 | 2020-02-22 22:33:07.761766+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-95022006000400029&lng=en&nrm=iso&tlng=en | 20200209044048 | 200 | + + These seem, on retry, like success? Maybe previous was a matter of warc/revisit not getting handled correctly? + + pdf | https://doi.org/10.4067/s0250-71611998007100009 | 2020-02-22 23:57:16.481703+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0250-71611998007100009&lng=en&nrm=iso&tlng=en | 20200212122939 | 200 | + pdf | https://doi.org/10.4067/s0716-27902005020300006 | 2020-02-22 23:56:01.247616+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0716-27902005020300006&lng=en&nrm=iso&tlng=en | 20200214192151 | 200 | + pdf | https://doi.org/10.4067/s0718-23762005000100015 | 2020-02-22 23:53:55.81526+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-23762005000100015&lng=en&nrm=iso&tlng=en | 20200214173237 | 200 | + + Look like web/xml only. + + TODO: XML ingest (and replay?) support. These are as "<article>", not sure if that is JATS or what. + +www.kci.go.kr | no-pdf-link | 6842 +www.m-hikari.com | no-pdf-link | 6763 +cshprotocols.cshlp.org | no-pdf-link | 6553 +www.bibliotekevirtual.org | no-pdf-link | 6309 +data.hpc.imperial.ac.uk | no-pdf-link | 6071 +projecteuclid.org | link-loop | 5970 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'projecteuclid.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%projecteuclid.org%' + AND status = 'link-loop' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + -------------------+-------------------------+------- + projecteuclid.org | link-loop | 5985 + projecteuclid.org | success | 26 + projecteuclid.org | wayback-error | 26 + projecteuclid.org | wrong-mimetype | 17 + projecteuclid.org | spn2-cdx-lookup-failure | 4 + projecteuclid.org | other-mimetype | 4 + projecteuclid.org | no-capture | 3 + projecteuclid.org | terminal-bad-status | 2 + projecteuclid.org | spn2-error:job-failed | 1 + projecteuclid.org | spn-remote-error | 1 + (10 rows) + + Doing a cookie check and redirect. + + TODO: brozzler behavior to "click the link" instead? + +www.scielo.br | no-pdf-link | 5823 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'www.scielo.br' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%www.scielo.br%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + ---------------+-------------------------+------- + www.scielo.br | success | 35150 + www.scielo.br | no-pdf-link | 5839 + www.scielo.br | terminal-bad-status | 429 + www.scielo.br | no-capture | 189 + www.scielo.br | wrong-mimetype | 7 + www.scielo.br | spn2-cdx-lookup-failure | 2 + (6 rows) + + Seems to just be the subset with no PDFs. 
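+
+(Following up the scielo.conicyt.cl XML note above: a quick way to sniff
+whether "<article>" XML is JATS-ish; a heuristic sketch, not a validator:)
+
+    import xml.etree.ElementTree as ET
+
+    def looks_like_jats(xml_bytes: bytes) -> bool:
+        # JATS article XML has an <article> root, usually with a dtd-version
+        # attribute and/or a JATS/NLM DOCTYPE near the top of the document
+        try:
+            root = ET.fromstring(xml_bytes)
+        except ET.ParseError:
+            return False
+        if root.tag.split("}")[-1] != "article":
+            return False
+        head = xml_bytes[:1024].decode("utf-8", "replace")
+        return "dtd-version" in root.attrib or "JATS" in head or "NLM//DTD" in head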
+ +get.iedadata.org | no-pdf-link | 5822 +www.pdcnet.org | no-pdf-link | 5798 +publications.rwth-aachen.de | no-pdf-link | 5323 +www.sciencedomain.org | no-pdf-link | 5231 +medicalforum.ch | terminal-bad-status | 4574 +jrnl.nau.edu.ua | link-loop | 4145 +ojs.academypublisher.com | no-pdf-link | 4017 + +## MAG bulk ingest + +- dialnet.unirioja.es | redirect-loop | 240967 + dialnet.unirioja.es | terminal-bad-status | 20320 + => may be worth re-crawling via heritrix? +- agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 + => and other *.onlinelibrary.wiley.com +- www.researchgate.net | redirect-loop | 42859 +- www.redalyc.org:9081 | no-pdf-link | 10515 +- www.repository.naturalis.nl | redirect-loop | 8213 +- bjp.rcpsych.org | link-loop | 8045 +- journals.tubitak.gov.tr | wrong-mimetype | 7159 +- www.erudit.org | redirect-loop | 6819 +- papers.ssrn.com | redirect-loop | 27328 + => blocking is pretty aggressive, using cookies or referrer or something. + maybe a brozzler behavior would work, but doesn't currently + +## Out of Scope + +Datasets only? + +- plutof.ut.ee +- www.gbif.org +- doi.pangaea.de +- www.plate-archive.org + +Historical non-paper content: + +- dhz.uni-passau.de (newspapers) +- digital.ucd.ie (irish historical) + +Mostly datasets (some PDF content): + +- *.figshare.com +- zenodo.com +- data.mendeley.com diff --git a/notes/possible_ingest_targets.txt b/notes/possible_ingest_targets.txt new file mode 100644 index 0000000..fcdc3e4 --- /dev/null +++ b/notes/possible_ingest_targets.txt @@ -0,0 +1,15 @@ + +- all releases from small journals, regardless of OA status, if small (eg, less than 200 papers published), and not big5 + +more complex crawling/content: +- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url +- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) +- doi.ala.org.au: possible dataset ingest source +- peerj.com, at least reviews, should be HTML ingest? or are some PDF? +- publons.com should be HTML ingest, possibly special case for scope +- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug + +other tasks: +- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 +- push/deploy sandcrawler changes diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md new file mode 100644 index 0000000..74d3857 --- /dev/null +++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md @@ -0,0 +1,132 @@ + +Had a huge number of SPN requests for the andrzejklimczuk.com domain, +presumably from the author. + +Many were duplicates (same file, multiple releases, often things like zenodo +duplication). Many were also GROBID 500s, due to truncated common crawl +captures. + +Needed to cleanup! 
+Basically, sorted through a few editgroups manually, then rejected all the
+rest and manually re-submitted them with the queries and commands below:
+
+    SELECT COUNT(*) from ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%';
+    => 589
+
+    SELECT ingest_file_result.status, COUNT(*) from ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+        GROUP BY ingest_file_result.status;
+
+         status     | count
+    ----------------+-------
+     cdx-error      |     1
+     success        |   587
+     wrong-mimetype |     1
+    (3 rows)
+
+    SELECT grobid.status_code, COUNT(*) from ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+        GROUP BY grobid.status_code;
+
+     status_code | count
+    -------------+-------
+             200 |   385
+             500 |   202
+                 |     2
+    (3 rows)
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+            AND ingest_file_result.status = 'success'
+            AND grobid.status_code = 500
+    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
+    => COPY 202
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+            AND ingest_file_result.status = 'success'
+            AND grobid.status_code = 200
+    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
+    => COPY 385
+
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
+    > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json
+
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+    | jq '. + {force_recrawl: true}' -c \
+    > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
+    | shuf \
+    | head -n60000 \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
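+
+To spot-check that the requests actually landed on the topic, a consumer
+one-liner like this should work (a sketch; -o -5 starts five messages
+before the end of each partition, -e exits at end):
+
+    kafkacat -C -b wbgrp-svc350.us.archive.org \
+        -t sandcrawler-prod.ingest-file-requests-priority \
+        -o -5 -e \
+        | jq .base_url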
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+    | shuf \
+    | head -n100 \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+    | shuf \
+    | head -n10000 \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+    > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
+    | shuf \
+    | head -n60000 \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
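+
+Once the force_recrawl requests work through the queue, re-running the
+GROBID breakdown from above should show whether the truncated-capture 500s
+cleared (this is the same query as earlier, unchanged):
+
+    SELECT grobid.status_code, COUNT(*) from ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+        GROUP BY grobid.status_code;
+
+The 500 bucket should shrink toward zero as re-crawls replace the truncated
+captures.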