aboutsummaryrefslogtreecommitdiffstats
path: root/notes
diff options
context:
space:
mode:
Diffstat (limited to 'notes')
-rw-r--r--notes/dryad_datasets.md17
-rw-r--r--notes/examples/2021-11-12_broken_grobid_xml.md83
-rw-r--r--notes/examples/dataset_examples.txt52
-rw-r--r--notes/examples/html_test_journals.txt153
-rw-r--r--notes/examples/random_datasets.md19
-rw-r--r--notes/fuzzy_match_notes.md148
-rw-r--r--notes/html_ingest_notes.md318
-rw-r--r--notes/ingest/2019-10-23_testing.md (renamed from notes/ingest/20191023_testing.md)0
-rw-r--r--notes/ingest/2020-01-14_bulk.md (renamed from notes/ingest/20200114_bulk_ingests.md)0
-rw-r--r--notes/ingest/2020-02_unpaywall.md (renamed from notes/ingest/2020-02-14_unpaywall_ingest.md)148
-rw-r--r--notes/ingest/2020-03-oa_but_not_marked.md25
-rw-r--r--notes/ingest/2020-03_mag.md576
-rw-r--r--notes/ingest/2020-03_s2.md35
-rw-r--r--notes/ingest/2020-04-13_covid19.md73
-rw-r--r--notes/ingest/2020-04_datacite.md121
-rw-r--r--notes/ingest/2020-04_unpaywall.md312
-rw-r--r--notes/ingest/2020-05_oai_pmh.md428
-rw-r--r--notes/ingest/2020-05_pubmed.md10
-rw-r--r--notes/ingest/2020-07_mag.md353
-rw-r--r--notes/ingest/2020-08_daily_improvements.md202
-rw-r--r--notes/ingest/2020-09_oa_doi.md352
-rw-r--r--notes/ingest/2020-09_reingest.md197
-rw-r--r--notes/ingest/2020-09_scielo.md21
-rw-r--r--notes/ingest/2020-10_daily.md193
-rw-r--r--notes/ingest/2020-10_unpaywall.md286
-rw-r--r--notes/ingest/2020-11-04_arxiv.md12
-rw-r--r--notes/ingest/2020-11_doaj.md295
-rw-r--r--notes/ingest/2020-12-08_patch_crawl_notes.md111
-rw-r--r--notes/ingest/2021-04_unpaywall.md368
-rw-r--r--notes/ingest/2021-05_daily_improvements.md480
-rw-r--r--notes/ingest/2021-07_unpaywall.md320
-rw-r--r--notes/ingest/2021-08_mag.md400
-rw-r--r--notes/ingest/2021-09-02_oai_pmh_patch.md1578
-rw-r--r--notes/ingest/2021-09-03_daily_improvements.md1021
-rw-r--r--notes/ingest/2021-09-03_patch_crawl.md678
-rw-r--r--notes/ingest/2021-12-13_datasets.md504
-rw-r--r--notes/ingest/2022-01-06_patch_crawl.md398
-rw-r--r--notes/ingest/2022-01-13_doi_crawl.md248
-rw-r--r--notes/ingest/2022-03_doaj.md278
-rw-r--r--notes/ingest/2022-03_oaipmh.md40
-rw-r--r--notes/ingest/2022-04_targeted.md144
-rw-r--r--notes/ingest/2022-04_unpaywall.md278
-rw-r--r--notes/ingest/2022-07-15_ingest_fixes.md831
-rw-r--r--notes/ingest/2022-07-19_dblp.md50
-rw-r--r--notes/ingest/2022-07_doaj.md199
-rw-r--r--notes/ingest/2022-07_targeted.md140
-rw-r--r--notes/ingest/2022-09_oaipmh.md397
-rw-r--r--notes/ingest/NEXT.md52
-rw-r--r--notes/ingest_domains.txt294
-rw-r--r--notes/possible_ingest_targets.txt15
-rw-r--r--notes/tasks/2020-01-27_cleanup_cdx.md34
-rw-r--r--notes/tasks/2020-02-14_pdftrio.md162
-rw-r--r--notes/tasks/2020-07-22_processing_holes.md120
-rw-r--r--notes/tasks/2020-08-20_file_meta.md66
-rw-r--r--notes/tasks/2020-10-21_pdfextract_holes.md74
-rw-r--r--notes/tasks/2021-09-09_pdf_url_lists.md70
-rw-r--r--notes/tasks/2021-10-29_crossref_refs_backfill.md235
-rw-r--r--notes/tasks/2021-12-06_regrobid.md380
-rw-r--r--notes/tasks/2022-01-07_grobid_platform_pdfs.md23
-rw-r--r--notes/tasks/2022-03-07_ukraine_firedrill.md225
-rw-r--r--notes/tasks/2022-04-27_pdf_url_lists.md72
-rw-r--r--notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md132
62 files changed, 14846 insertions, 0 deletions
diff --git a/notes/dryad_datasets.md b/notes/dryad_datasets.md
new file mode 100644
index 0000000..5c727b1
--- /dev/null
+++ b/notes/dryad_datasets.md
@@ -0,0 +1,17 @@
+
+api docs: https://datadryad.org/api/v2/docs
+
+current search queries return 38,000 hits (December 2020)
+
+example with multiple versions:
+ https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0
+ https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0
+ https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions
+
+
+how to handle versions? DOI doesn't get incremented.
+
+on archive.org, could have separate item for each version, or sub-directories within item, one for each version
+
+in fatcat, could have a release for each version, but only one with
+the DOI; or could have a separate fileset for each version
diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md
new file mode 100644
index 0000000..5223651
--- /dev/null
+++ b/notes/examples/2021-11-12_broken_grobid_xml.md
@@ -0,0 +1,83 @@
+
+Find all the PDFs from web which resulted in `bad-grobid-xml` status code (among others):
+
+ sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100;
+
+ sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata
+ ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------
+ d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"}
+ 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"}
+ 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf
+ FIXED
+ f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"}
+ https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf
+ FIXED
+ c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"}
+ https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf
+ FIXED (and good)
+ 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"}
+ https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf
+ FIXED
+ metadata quality mixed, but complex document (?)
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"}
+ https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23
+ FIXED
+ 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"}
+ https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf
+ FIXED
+ 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"}
+ not found
+ acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"}
+ https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf
+ BROKEN: not well-formed (invalid token): line 60, column 45
+ <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note>
+ 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"}
+ not found
+ c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"}
+ https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308
+ BROKEN: not well-formed (invalid token): line 58, column 45
+ <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, &amp; Bian, 2020).</note>
+ 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 1824, column 45"}
+ not found
+ 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"}
+ not found
+ f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"}
+ https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649
+ FIXED, good
+ f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf
+ FIXED
+ 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"}
+ https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf
+ FIXED
+ 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1
+ FIXED
+ (21 rows)
+
+Some other errors from other queries:
+
+ d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"}
+ https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf
+ FIXED: with 0.7.0+
+
+ 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"}
+ https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf
+ still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500
+ BAD PDF ("no pages" in evince)
+
+ d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"}
+ https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf
+ FIXED
+
+ 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t
+ https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf
+ FIXED
+
+In summary, there are still a small number of `bad-grobid-xml` cases, and still
+many "very large PDF" cases. But we should probably broadly retry everything,
+especially the 503 errors (from when GROBID is simply down/unavailable).
+
+The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations,
+which I have submitted a patch/PR for.
diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt
new file mode 100644
index 0000000..3a04750
--- /dev/null
+++ b/notes/examples/dataset_examples.txt
@@ -0,0 +1,52 @@
+
+### ArchiveOrg: CAT dataset
+
+<https://archive.org/details/CAT_DATASET>
+
+`release_36vy7s5gtba67fmyxlmijpsaui`
+
+###
+
+<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635>
+
+doi:10.1371/journal.pone.0120448
+
+Single .rar file
+
+### Dataverse
+
+<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B>
+
+Single excel file
+
+### Dataverse
+
+<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+doi:10.7910/DVN/CLSFKX
+
+Multiple files; multiple versions?
+
+API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+ .data.id
+ .data.latestVersion.datasetPersistentId
+ .data.latestVersion.versionNumber, .versionMinorNumber
+ .data.latestVersion.files[]
+ .dataFile
+ .contentType (mimetype)
+ .filename
+ .filesize (int, bytes)
+ .md5
+ .persistentId
+ .description
+ .label (filename?)
+ .version
+
+Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB>
+
+Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3)
+
+Dataverse refs:
+- 'doi' and 'hdl' are the two persistentId styles
+- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled
diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt
new file mode 100644
index 0000000..540dc9f
--- /dev/null
+++ b/notes/examples/html_test_journals.txt
@@ -0,0 +1,153 @@
+
+Good examples of journals to run HTML fulltext extraction on.
+
+## Live Web
+
+d-lib magazine
+ live web
+ no longer active
+ http://www.dlib.org/back.html
+
+NLM technical bulletin
+ https://www.nlm.nih.gov/pubs/techbull/back_issues.html
+
+Genders
+ https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html
+
+firstmondays
+ live web; now OJS
+
+outhistory.org
+
+http://journal.sjdm.org/
+
+http://whoosh.org/
+
+
+## Vanished (but wayback coverage)
+
+ohmylittledata
+ issn:2551-1289
+ vanished
+ blog format
+ http://web.archive.org/web/20180421061156/https://ohmylittledata.com/
+
+exquisite corpse
+ https://web.archive.org/web/20080521052400/http://corpse.org:80/
+
+Journal of Mundane Behavior
+ https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya
+ ISSN: 1529-3041
+
+ defunct since ~2010
+ simple HTML articles
+ references
+ http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm
+ http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm
+
+War Crimes
+
+ PDF articles (not HTML)
+ http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/
+
+
+## DOAJ Test Articles (HTML)
+
+ zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt
+ => 2,184,954
+
+ cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25
+ 254817 link.springer.com
+ 145159 www.scielo.br
+ 78044 journal.frontiersin.org
+ 77394 www.frontiersin.org
+ 40849 www.dovepress.com
+ 19024 dergipark.org.tr
+ 18758 periodicos.ufsc.br
+ 16346 www.revistas.usp.br
+ 15872 revistas.unal.edu.co
+ 15527 revistas.ucm.es
+ 13669 revistas.usal.es
+ 12640 dergipark.gov.tr
+ 12111 journals.rudn.ru
+ 11839 www.scielosp.org
+ 11277 www.karger.com
+ 10827 www.journals.vu.lt
+ 10318
+ 9854 peerj.com
+ 9100 ojs.unud.ac.id
+ 8581 jurnal.ugm.ac.id
+ 8261 riviste.unimi.it
+ 8012 journals.uran.ua
+ 7454 revistas.pucp.edu.pe
+ 7264 journals.vgtu.lt
+ 7200 publicaciones.banrepcultural.org
+
+ cat html_fulltext_urls.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.filtered.txt
+ => 1,579,257
+
+ zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt
+ => 560k
+
+ cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25
+ 40849 www.dovepress.com
+ 10570 journals.rudn.ru
+ 10494 dergipark.org.tr
+ 10233 revistas.unal.edu.co
+ 9981 dergipark.gov.tr
+ 9428 revistas.usal.es
+ 8292 revistas.ucm.es
+ 7200 publicaciones.banrepcultural.org
+ 6953 revistas.pucp.edu.pe
+ 6000 www.scielosp.org
+ 5962 www.scielo.br
+ 5621 www.richtmann.org
+ 5123 scielo.sld.cu
+ 5067 ojs.unud.ac.id
+ 4838 periodicos.ufsc.br
+ 4736 revistasonlinepre.inap.es
+ 4486 journal.fi
+ 4221 www.seer.ufu.br
+ 3553 revistas.uam.es
+ 3492 revistas.pucsp.br
+ 3060 www.scielo.org.co
+ 2991 scielo.isciii.es
+ 2802 seer.ufrgs.br
+ 2692 revistas.unc.edu.ar
+ 2685 srl.si
+
+ cat html_fulltext_urls.no_doi.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.no_doi.filtered.txt
+ => 518,608
+
+ zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20
+ https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795
+ https://journal.umy.ac.id/index.php/st/article/view/3297
+ https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442
+ http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf
+ http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440
+ https://journal.fi/inf/article/view/59430
+ http://journal.uii.ac.id/index.php/Eksakta/article/view/2429
+ https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS
+ https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157
+ http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce
+ http://revistas.pucp.edu.pe/index.php/themis/article/view/11862
+ http://journal.bdfish.org/index.php/fisheries/article/view/91
+ https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567
+ https://www.lithosphere.ru/jour/article/view/779
+ https://journals.hioa.no/index.php/seminar/article/view/2412
+ http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197
+ https://www.kmuj.kmu.edu.pk/article/view/15698
+ http://forodeeducacion.com/ojs/index.php/fde/article/view/82
+ https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941
+ http://grbs.library.duke.edu/article/view/3361
+
diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md
new file mode 100644
index 0000000..b69132c
--- /dev/null
+++ b/notes/examples/random_datasets.md
@@ -0,0 +1,19 @@
+
+Possible external datasets to ingest (which are not entire platforms):
+
+- https://research.google/tools/datasets/
+- https://openslr.org/index.html
+- https://www.kaggle.com/datasets?sort=votes&tasks=true
+- https://archive.ics.uci.edu/ml/datasets.php
+
+Existing archive.org datasets to ingest:
+
+- https://archive.org/details/allthemusicllc-datasets
+
+Papers on archive.org to ingest:
+
+- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=>
+- <https://archive.org/details/biorxiv>
+- <https://archive.org/details/philosophicaltransactions?tab=collection>
+- <https://archive.org/search.php?query=doi%3A%2A>
+- <https://archive.org/details/folkscanomy_academic>
diff --git a/notes/fuzzy_match_notes.md b/notes/fuzzy_match_notes.md
new file mode 100644
index 0000000..a87364c
--- /dev/null
+++ b/notes/fuzzy_match_notes.md
@@ -0,0 +1,148 @@
+
+These are notes on how bibliographic metadata matching (of records) and
+slugification (to create lookup keys on title strings) worked in the past in
+the sandcrawler repository. Eg, circa 2018.
+
+## Scala Slug-ification
+
+Original title strings longer than 1023 characters were rejected (before
+slug-ifying).
+
+There was a "slug-denylist". Additionally, scorable strings needed to be
+between 8 and 1023 characters (not bytes) long (inclusive)
+
+Slugification transform was:
+
+- lower-case
+- remove whitespace ("\s")
+- strip specific accent characters:
+ '\u0141' -> 'L',
+ '\u0142' -> 'l', // Letter ell
+ '\u00d8' -> 'O',
+ '\u00f8' -> 'o'
+- remove all '\p{InCombiningDiacriticalMarks}'
+- remove punctuation:
+ \p{Punct}
+ ’·“”‘’“”«»「」¿–±§
+
+Partially adapted from apache commons: <https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934>
+
+My original notes/proposal:
+
+1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
+2. strip accents
+3. "lower-case" (unicode-aware)
+4. do any final custom/manual mappings
+
+Resulting slugs less than 8 characters long were rejected, and slugs were
+checked against a denylist.
+
+Only 554 entries in the denylist; could just ship that in the library.
+
+
+## Python Tokenization
+
+- "&apos;" -> "'"
+- remove non "isalnum()" characters
+- encode as ASCII; this removes diacritics etc, but also all non-latin character sets
+- optionally remove all whitespace
+
+
+## Python GROBID Cleanups
+
+These are likely pretty GROBID-specific. Article title was required, but any of
+the other filtered-out fields just resulted in partial metadata. These filters
+are the result of lots of manual verification of results, and doing things like
+taking truncating titles and looking at the most popular prefixes for a large
+random sample.
+
+Same denylist for title slugs as Scala, plus:
+
+ editorial
+ advertisement
+ bookreviews
+ reviews
+ nr
+ abstractoriginalarticle
+ originalarticle
+ impactfactor
+ articlenumber
+
+Other filters on title strings (any of these bad):
+
+- 500 or more characters long
+- tokenized string less than 10 characters
+- tokenized starts with 'nr' or 'issn'
+- lowercase starts with 'int j' or '.int j'
+- contains both "volume" and "issue"
+- contains "downloadedfrom"
+- fewer than 2 or more than 50 tokens (words)
+- more than 12 tokens only a single character long
+- more than three ":"; more than one "|"; more than one "."
+
+Remove title prefixes (but allow):
+
+ "Title: "
+ "Original Article: "
+ "Original Article "
+ "Article: "
+
+Denylist for authors:
+
+ phd
+ phdstudent
+
+Journal name processing:
+
+- apply title denylist
+- remove prefixes
+ characters: /~&©
+ Original Research Article
+ Original Article
+ Research Article
+ Available online www.jocpr.com
+- remove suffixes
+ Available online at www.sciarena.com
+ Original Article
+ Available online at
+ ISSN
+ ISSUE
+- remove anywhere
+ e-ISSN
+ p-ISSN
+
+## Python Grouping Comparison
+
+Would consume joined groups, row-by-row. At most 10 matches per group; any more
+and skip (this was for file-to-release).
+
+Overall matching requirements:
+
+- string similarity threshold from scala code
+ https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
+ https://stackoverflow.com/questions/955110/similarity-string-comparison-in-java/16018452#16018452
+- authors should be consistent
+ - convert one author list into space-separated tokens
+ - remove "jr." from all author token lists
+ - the last word for each author full name in the other list (eg, the lastname),
+ tokenized, must be in the token set
+- if both years defined, then must match exactly (integers)
+
+In the code, there is a note:
+
+ Note: the actual importer/merger should filter the following patterns out:
+ - container title has "letter" and "diar"
+ - contribs (authors) contain "&NA;"
+ - dates differ (not just year)
+
+
+## Scala Metadata Keys
+
+Only the titles were ever actually used (in scala), but the keys allowed were:
+
+- title
+- authors (list of strings)
+- year (int)
+- contentType
+- doi
+
diff --git a/notes/html_ingest_notes.md b/notes/html_ingest_notes.md
new file mode 100644
index 0000000..a1a91f3
--- /dev/null
+++ b/notes/html_ingest_notes.md
@@ -0,0 +1,318 @@
+
+## Current Plan
+
+- selectolax to extract metadata and quickly filter (speed)
+ => eg, differentiate landing pages from fulltext
+ => also embed URLs?
+- trafilatura for fulltext body extract
+- no solution yet for reference parsing
+ => maybe trafilatura XML-TEI parsing, then GROBID?
+ => especially if DOI/identifier/URL is in the reference
+
+
+
+TODO:
+x print/wrap error condition better
+x serialize dates (pydantic)
+x CDX lookup "closest" to capture datetime (or by month)
+x firstmonday no extracted fulltext/XML
+x apply URL base fixup to fulltext URLs
+x XML alternative detection
+x basic ingest worker, kafka topics, persist workers, sql table, etc
+- ingest worker: landing page to actual fulltext (eg, OJS)
+- broken? https://betterexplained.com/articles/colorized-math-equations/
+
+Ponder:
+- CDX lookup older successful captures
+ http://www.altdevblogaday.com/2011/05/17/understanding-the-fourier-transform/
+ => optional filter by status? "reduce" by month/year?
+- detect scope heuristically
+ bepress_is_article_cover_page 1
+ citation_fulltext_world_readable "" (eg, distill)
+- non-success subresource fetches
+ https://www.europenowjournal.org/2020/10/11/a-social-history-of-early-rock-n-roll-in-germany-hamburg-from-burlesque-to-the-beatles-1956-1969/
+- redirects: keep start URL?
+
+Later:
+- XML URL extraction
+ https://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-19652002000200001&lng=en&nrm=iso&tlng=pt
+ <a href="http://www.scielo.br/scieloOrg/php/articleXML.php?pid=S0100-19652002000200001&amp;lang=en" rel="nofollow" target="xml">
+- selectolax bug? hangs: `css_first("meta['thing']")`
+- youtube embed
+ => download/include actual video file?
+- parse references in citation headers
+- try parsing references in HTML fulltext
+
+## Testing URLs
+
+- PLOS
+ https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949
+ TODO: "May 9, 2014"
+ TODO: appendix
+- peerj
+ https://peerj.com/articles/4375/
+- scielo
+ http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032020000200081&lng=en&nrm=iso&tlng=es
+ bunch of little icon .png, but ok
+ redirect of an image not saved in webcapture
+- wordpress
+ https://www.europenowjournal.org/2020/10/11/a-social-history-of-early-rock-n-roll-in-germany-hamburg-from-burlesque-to-the-beatles-1956-1969/
+ no HTML meta? hrm
+- old OJS
+ (pdf only) http://rjh.folium.ru/index.php/rjh/article/view/1511
+- new OJS
+ https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729
+- plain HTML
+ http://journal.sjdm.org/12/12627/jdm12627.html
+- blogs/essays
+ http://symbolflux.com/lodessay/
+ https://betterexplained.com/articles/colorized-math-equations/
+ https://web.archive.org/web/20120418231513/http://www.altdevblogaday.com/2011/05/17/understanding-the-fourier-transform/
+ https://research.google.com/bigpicture/attacking-discrimination-in-ml/
+ http://www.econgraphs.org/
+- journal homepage (not fulltext)
+- OJS new landing page (not fulltext)
+- OJS old (not fulltext)
+ http://rjh.folium.ru/index.php/rjh/index
+ http://rjh.folium.ru/index.php/rjh/issue/view/106
+ http://rjh.folium.ru/index.php/rjh/article/view/382
+- distill
+ https://distill.pub/2020/bayesian-optimization/
+ https://distill.pub/2018/feature-wise-transformations/
+- youtube video embed
+ http://www.cond.org/persalog.html
+- youtube video direct?
+- github: project README?
+- wikipedia
+
+## Background Research
+
+- scrapy (?)
+- requests-html: can run javascript
+ => good for metadata extraction?
+- selectolax
+- scrapely: give HTML and extracted text, it builds the parser
+ => good for difficult one-off cases?
+- https://rushter.com/blog/python-fast-html-parser/
+- WET generation from WARC, a la common crawl
+- https://towardsdatascience.com/categorizing-world-wide-web-c130abd9b717
+
+Other random stuff:
+- distilBERT: most BERT accuracy, 0.4 factor latency (faster)?
+ https://medium.com/huggingface/distilbert-8cf3380435b5
+- htmldate: finds "date of publication" for a document
+- adblockparser
+ => good as a filter in HTML ingest
+- w3lib: utility library. unicode conversion; cleanups; etc
+- courlan: clean/normalize/sample large URL lists
+ => https://github.com/adbar/courlan
+
+### Main Text Extraction
+
+Things to try:
+
+- newspaper3k
+ => basic article extraction. lxml
+- trafilatura
+ => TEI-XML output!
+ => looks very promising
+ => falls back to readability and justext
+- python-readability
+ => improved vs newspaper?
+- dragnet
+- eatiht
+- jusText
+- inscriptis
+ => emphasis on shape/readability of text output? compare with lynx
+- Goose3
+ => metadata and article text
+- news-please
+ => very full-featured. built on scrapy, newspaper, readability
+ => can iterate over common crawl?
+- html2text
+ => actually HTML-to-markdown; no or little "boilerplate removal"
+- boilerpipe (Java)
+ boilerpipe3 (wrapper)
+ boilerpy3 (port)
+
+Comparisons and articles:
+
+- https://www.diffbot.com/benefits/comparison/
+- https://github.com/scrapinghub/article-extraction-benchmark
+ - https://github.com/scrapinghub/article-extraction-benchmark/releases/download/v1.0.0/paper-v1.0.0.pdf
+- https://github.com/rundimeco/waddle
+
+- https://moz.com/devblog/benchmarking-python-content-extraction-algorithms-dragnet-readability-goose-and-eatiht
+- https://hal.archives-ouvertes.fr/hal-02768510v3/document (fr; June 2020)
+ https://translate.google.com/translate?sl=auto&tl=en&u=https%3A%2F%2Fhal.archives-ouvertes.fr%2Fhal-02768510v3%2Fdocument
+- http://eprints.fri.uni-lj.si/1718/1/Kovacic-1.pdf (2012)
+- "Generic Web Content Extraction with Open-Source Software" (2020; trafilatura)
+- "Out-of-the-Box and Into the Ditch? Multilingual Evaluation of Generic Text Extraction Tools"
+ https://hal.archives-ouvertes.fr/hal-02732851/document
+ very on-topic
+- https://cloud.google.com/blog/products/gcp/problem-solving-with-ml-automatic-document-classification
+
+### Reference/Citation Extraction
+
+"Locating and parsing bibliographic references in HTML medical articles"
+https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2903768/
+
+cb2bib (in debian/ubuntu)
+
+
+### Metadata Extraction
+
+OJS 3.x seems to have `citation_fulltext_html_url`. Annoyingly, has an iframe.
+
+http://documents.clockss.org/index.php/LOCKSS:_Extracting_Bibliographic_Metadata
+
+https://blog.dshr.org/2013/04/talk-on-lockss-metadata-extraction-at.html
+
+"OXPath": declarative XPath extension for scraping metadata
+https://journal.code4lib.org/articles/13007
+
+
+## newspaper3k experimentation
+
+ import newspaper
+
+ import nltk
+ nltk.download('punkt')
+
+ # first mondays (OJS) fulltext
+ monday = newspaper.Article("https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729?inline=1")
+ # => ugh, iframe
+ monday.download()
+ monday.parse() # several seconds
+
+ monday.title
+ # Surveillance, stigma and sociotechnical design for HIV
+ monday.text
+ # reasonable; similar to pdftotext?
+ monday.authors
+ # empty
+ monday.images
+ # reasonable?
+
+ nih = newspaper.Article('https://www.nlm.nih.gov/pubs/techbull/ja02/ja02_locatorplus_merge.html')
+ nih.download()
+ nih.parse()
+ nih.nlp()
+
+ nih.title
+ # Migration of Monographic Citations to LocatorPlus: Merge Project. NLM Technical Bulletin. Jul-Aug 2002
+ # duplicate journal name in title
+ nih.authors
+ # none
+ nih.text
+ # Ok. missing first character, weirdly
+
+ genders = newspaper.Article('https://web.archive.org/web/20141230080932id_/http://www.genders.org/g58/g58_fairlie.html')
+ genders.download()
+ genders.parse()
+
+ genders.title
+ # Presenting innovative theories in art, literature, history, music, TV and film.
+ # nope: this is title of the journal
+
+ genders.text
+ # Ok. includes title and author in the body.
+
+ dlib = newspaper.Article('http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html')
+ dlib.download()
+ dlib.parse()
+
+ dlib.title
+ # Transforming Libraries and Archives through Crowdsourcing
+ dlib.authors()
+ # none
+ dlib.text
+ # some other junk, but main body there
+
+## trafilatura experimentation
+
+ trafilatura --json -u 'http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html' | jq .
+
+ trafilatura --xmltei -u 'http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html'
+
+Does not work with `first_monday_ojs_inline`?
+
+May need to test/compare more.
+
+Examples/bugs:
+
+ http://web.archive.org/web/20081120141035id_/http://www.mundanebehavior.org/issues/v5n1/jones.htm
+ poor title detection
+
+ generally, author detection not great.
+ not, apparently, using detection of dc.authors etc
+
+
+## Prod Deployment Notes (2020-12-14)
+
+Created `html_meta` table in `sandcrawler-db`.
+
+Updated ansible roles to deploy persist and import workers. Then ran the roles
+and enabled:
+
+- sandcrawler database (aitio)
+ - sandcrawler-persist-ingest-file-worker@1: restarted
+- blobs (wbgrp-svc169)
+ - sandcrawler-persist-html-teixml-worker@1: started and enabled
+ - sandcrawler-persist-xml-doc-worker@1: started and enabled
+- fatcat prod worker (wbgrp-svc502)
+ - fatcat-import-ingest-web-worker: started and enabled
+
+Test some d-lib and first monday ingests:
+
+ # dlib
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html --limit 50 container --container-id ugbiirfvufgcjkx33r3cmemcuu
+ => Counter({'estimate': 803, 'ingest_request': 50, 'elasticsearch_release': 50, 'kafka': 50})
+
+ # first monday
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html --limit 50 container --container-id svz5ul6qozdjhjhk7d627avuja
+
+Starting:
+
+ d-lib: 253 / 1056 preserved (https://fatcat.wiki/container/ugbiirfvufgcjkx33r3cmemcuu/coverage)
+
+Initially, `fatcat-import-ingest-web-worker` is seeing these but doesn't seem
+to be importing.
+
+ # postgresql shell
+ select sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count from html_meta;
+ => initially has_teixml is false for all
+ => fixed in an update
+
+ # weed shell
+ > fs.ls /buckets/sandcrawler/html_body
+ [...]
+ > fs.cat /buckets/sandcrawler/html_body/77/75/7775adf8c7e19151bbe887bfa08a575483291d7c.tei.xml
+ [looks like fine TEI-XML]
+
+Going to debug ingest issue by dumping results to disk and importing manually
+(best way to see counts):
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o -10 | rg html | head -n10 | jq . -c > web_ingest_results.json
+
+ export FATCAT_AUTH_WORKER_CRAWL=[...]
+ ./fatcat_import.py ingest-web-results web_ingest_results.json
+ => Counter({'total': 10, 'skip-update-disabled': 9, 'skip': 1, 'skip-hit': 1, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # did some patching (f7a75a01), then re-ran twice and got:
+ => Counter({'total': 10, 'insert': 9, 'skip': 1, 'skip-hit': 1, 'update': 0, 'exists': 0})
+ => Counter({'total': 10, 'exists': 9, 'skip': 1, 'skip-hit': 1, 'insert': 0, 'update': 0})
+
+ # looks good!
+
+Re-ingesting all of d-lib:
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id ugbiirfvufgcjkx33r3cmemcuu
+ => Expecting 803 release objects in search queries
+ => Counter({'ingest_request': 803, 'elasticsearch_release': 803, 'estimate': 803, 'kafka': 803})
+
+TODO:
+
+- release ES transform isn't counting these as `in_ia` or preserved (code-only change)
+- no indication in search results (ES schema change)
+- ingest tool should probably look at `in_ia_html` or `in_ia_pdf` for PDF/XML queries (or a `types_in_ia` list?)
diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/2019-10-23_testing.md
index 481c4e2..481c4e2 100644
--- a/notes/ingest/20191023_testing.md
+++ b/notes/ingest/2019-10-23_testing.md
diff --git a/notes/ingest/20200114_bulk_ingests.md b/notes/ingest/2020-01-14_bulk.md
index 9d05cda..9d05cda 100644
--- a/notes/ingest/20200114_bulk_ingests.md
+++ b/notes/ingest/2020-01-14_bulk.md
diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02_unpaywall.md
index 0bedfdb..e18a2ff 100644
--- a/notes/ingest/2020-02-14_unpaywall_ingest.md
+++ b/notes/ingest/2020-02_unpaywall.md
@@ -474,3 +474,151 @@ Note: will probably end up re-running the below after crawling+ingesting the abo
) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json';
=> 654,885
+## Batch Ingest
+
+Test small batch:
+
+ head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full batch:
+
+ cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ # there was a broken line in there, so...
+ # parse error: Expected separator between values at line 1367873, column 175
+ # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null
+ tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Note that the crawl is not entirely complete and not all CDX seem to have been
+loaded, so may need to iterate. About 10% are still "no capture". May want or
+need to additionally crawl the terminal URLs, not the base URLs.
+
+## Post-ingest stats
+
+Overall status:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 17354494
+ no-pdf-link | 1471076
+ no-capture | 1135992
+ redirect-loop | 837842
+ terminal-bad-status | 803081
+ cdx-error | 219746
+ wrong-mimetype | 100723
+ link-loop | 16013
+ wayback-error | 12448
+ null-body | 9444
+ redirects-exceeded | 600
+ petabox-error | 411
+ bad-redirect | 17
+ bad-gzip-encoding | 4
+ spn2-cdx-lookup-failure | 3
+ gateway-timeout | 1
+ spn2-error:job-failed | 1
+ spn2-error | 1
+ (18 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ -----------------------------------+---------------------+--------
+ academic.oup.com | no-pdf-link | 330211
+ watermark.silverchair.com | terminal-bad-status | 324599
+ www.tandfonline.com | no-pdf-link | 242724
+ journals.sagepub.com | no-pdf-link | 202050
+ iopscience.iop.org | terminal-bad-status | 144063
+ files-journal-api.frontiersin.org | terminal-bad-status | 121719
+ pubs.acs.org | no-pdf-link | 104535
+ www.ahajournals.org | no-pdf-link | 102653
+ society.kisti.re.kr | no-pdf-link | 101787
+ www.degruyter.com | redirect-loop | 95130
+ www.nature.com | redirect-loop | 87534
+ onlinelibrary.wiley.com | no-pdf-link | 84432
+ www.cell.com | redirect-loop | 61496
+ www.degruyter.com | terminal-bad-status | 42919
+ babel.hathitrust.org | terminal-bad-status | 41813
+ www.ncbi.nlm.nih.gov | redirect-loop | 40488
+ scialert.net | no-pdf-link | 38341
+ ashpublications.org | no-pdf-link | 34889
+ dialnet.unirioja.es | terminal-bad-status | 32076
+ www.journal.csj.jp | no-pdf-link | 30881
+ pure.mpg.de | redirect-loop | 26163
+ www.jci.org | redirect-loop | 24701
+ espace.library.uq.edu.au | redirect-loop | 24591
+ www.valueinhealthjournal.com | redirect-loop | 23740
+ www.vr-elibrary.de | no-pdf-link | 23332
+ aip.scitation.org | wrong-mimetype | 22144
+ osf.io | redirect-loop | 18513
+ www.journals.elsevier.com | no-pdf-link | 16710
+ www.spandidos-publications.com | redirect-loop | 15711
+ www.biorxiv.org | wrong-mimetype | 15513
+ (30 rows)
+
+Dump lists for another iteration of bulk ingest:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json';
+ => 278,876
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json';
+ =>
+
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json
+
+ cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md
new file mode 100644
index 0000000..73396bd
--- /dev/null
+++ b/notes/ingest/2020-03-oa_but_not_marked.md
@@ -0,0 +1,25 @@
+
+These are large journals with a high fraction of content "in IA", but which are
+not marked as OA, so they are not being crawled regularly.
+
+TODO: add things like list of unpaywall ISSN / OA status to try and find more
+"practical" / bronze OA
+
+## First Run
+
+https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him
+https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4
+https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4
+https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e
+https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm
+https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe
+
+## TODO
+
+https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible)
+https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?)
+
+https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link?
+https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA?
+https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken?
+https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop
diff --git a/notes/ingest/2020-03_mag.md b/notes/ingest/2020-03_mag.md
new file mode 100644
index 0000000..428ce05
--- /dev/null
+++ b/notes/ingest/2020-03_mag.md
@@ -0,0 +1,576 @@
+
+Rough plan:
+
+- run bulk and/or regular ingest requests for just those of AIT partners (200k?)
+- persist ingest requests (22 million or so)
+- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall)
+- crawl those which are no-capture
+
+
+## Generate Requests
+
+Newer version of `mag_ingest_request.sh` script requires venv with urlcanon
+installed.
+
+Starting with the 2020-01-23 MAG dump, will generate a full ingest request set
+(including DOI `ext_id` when available), with any dominant domains removed (eg,
+arxiv.org):
+
+ export LC_ALL=C
+ cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json
+ => previously 25.6M
+ => 25.6M 2:29:43 [2.85k/s]
+
+ export LC_ALL=C
+ zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json
+ => 4.3M 0:25:45 [2.78k/s]
+
+ export LC_ALL=C
+ cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id
+
+ zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l
+ => 6,504,907
+
+ zcat PaperUrls_mag_url_pmid.txt.gz | wc -l
+ => 4,369,832
+
+ cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l
+ => previously 15,707,405
+ => 15,702,581
+
+ cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l
+ => 0
+ URL encoding seems to be working
+
+## Persist Ingest Requests
+
+First the PMID ingest requests, then the all/doi file. The reason for this order
+is that the all/doi file will have some rows with no DOI (and thus no
+`ext_id`), while the PMID file will not.
+
+ # small sample
+ head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
+ Worker: Counter({'total': 10, 'skip-result-fields': 10})
+ JSON lines pushed: Counter({'total': 10, 'pushed': 10})
+
+ cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
+ => 4.3M 0:16:46 [4.27k/s]
+ Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0})
+ JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026})
+ => hit a bug on first attempt, which is why total/insert results don't match
+
+ cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request -
+ => 25.6M 2:21:54 [3.01k/s]
+ Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0})
+ JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559})
+
+
+## Crawl/Dupe Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+After just PMID links:
+
+ status | count
+ ---------------------+---------
+ | 3000115
+ success | 1126881
+ no-capture | 69459
+ terminal-bad-status | 30259
+ redirect-loop | 11656
+ no-pdf-link | 2836
+ wrong-mimetype | 1456
+ link-loop | 1259
+ wayback-error | 1232
+ cdx-error | 932
+ null-body | 85
+ petabox-error | 50
+ bad-redirect | 1
+ (13 rows)
+
+After all links:
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag';
+ => 25596563
+
+
+ status | count
+ ---------------------+----------
+ | 21130841
+ success | 3915682
+ no-capture | 391813
+ terminal-bad-status | 76488
+ redirect-loop | 44202
+ wrong-mimetype | 16418
+ no-pdf-link | 10995
+ wayback-error | 3679
+ cdx-error | 3414
+ link-loop | 2098
+ null-body | 709
+ petabox-error | 221
+ bad-gzip-encoding | 2
+ bad-redirect | 1
+ (14 rows)
+
+Somewhat more un-ingested than expected.
+
+Dump requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/mag_noingest_20200305.rows.json';
+ => COPY 21,130,841
+
+Transform and shuf:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz
+ => 21.1M 0:18:57 [18.6k/s]
+
+## Bulk Ingest Partner Output
+
+These are subsets of the full list from potential AIT-S partners; want to run
+these through the pipeline before the full batch. Duplication against the full
+batch should be minimal.
+
+Size:
+
+ bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l
+ 29007
+ bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json
+ 34265 ingest_requests_mag-2020-01-23.cornell.json
+
+Test ingest:
+
+ head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full ingests:
+
+ cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Bulk Ingest
+
+Shard it into batches of roughly 1 million:
+
+ cd /grande/snapshots/
+ zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json
+
+Add a single batch like:
+
+ cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ partner ingests (see above)
+ => 2020-03-05 12:49: 118,396
+ 1056543 mag_noingest_20200305.ingest_request.split_00.json
+ => 2020-03-05 14:34: 1,055,224
+ => check on stats/ratios; filter by ingest update time?
+ 1056542 mag_noingest_20200305.ingest_request.split_01.json
+ 1056542 mag_noingest_20200305.ingest_request.split_02.json
+ 1056542 mag_noingest_20200305.ingest_request.split_03.json
+ 1056542 mag_noingest_20200305.ingest_request.split_04.json
+ 1056542 mag_noingest_20200305.ingest_request.split_05.json
+ 1056542 mag_noingest_20200305.ingest_request.split_06.json
+ 1056542 mag_noingest_20200305.ingest_request.split_07.json
+ 1056542 mag_noingest_20200305.ingest_request.split_08.json
+ 1056542 mag_noingest_20200305.ingest_request.split_09.json
+ => 2020-03-05 18:04: 10,009,297
+ => 2020-03-06 16:53: 6,553,946
+ 1056542 mag_noingest_20200305.ingest_request.split_10.json
+ 1056542 mag_noingest_20200305.ingest_request.split_11.json
+ 1056542 mag_noingest_20200305.ingest_request.split_12.json
+ 1056542 mag_noingest_20200305.ingest_request.split_13.json
+ 1056542 mag_noingest_20200305.ingest_request.split_14.json
+ 1056542 mag_noingest_20200305.ingest_request.split_15.json
+ 1056542 mag_noingest_20200305.ingest_request.split_16.json
+ 1056542 mag_noingest_20200305.ingest_request.split_17.json
+ 1056542 mag_noingest_20200305.ingest_request.split_18.json
+ 1056542 mag_noingest_20200305.ingest_request.split_19.json
+ => 2020-03-06 16:59: 17,001,032
+
+Stats from bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ ---------------------+----------
+ no-capture | 12237193
+ success | 11991293
+ no-pdf-link | 521691
+ redirect-loop | 437192
+ terminal-bad-status | 231181
+ link-loop | 92633
+ cdx-error | 33631
+ wrong-mimetype | 28638
+ wayback-error | 19651
+ null-body | 2682
+ petabox-error | 727
+ | 47
+ bad-redirect | 44
+ bad-gzip-encoding | 7
+ (14 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ --------------------------------------+---------------------+--------
+ dialnet.unirioja.es | redirect-loop | 240967
+ onlinelibrary.wiley.com | no-pdf-link | 147696
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639
+ iopscience.iop.org | terminal-bad-status | 69591
+ febs.onlinelibrary.wiley.com | no-pdf-link | 49874
+ www.researchgate.net | redirect-loop | 42859
+ journals.sagepub.com | no-pdf-link | 27448
+ papers.ssrn.com | redirect-loop | 27328
+ dialnet.unirioja.es | terminal-bad-status | 20320
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 20232
+ science.sciencemag.org | link-loop | 17811
+ espace.library.uq.edu.au | redirect-loop | 17185
+ bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301
+ anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746
+ www.tandfonline.com | no-pdf-link | 13303
+ aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070
+ link.springer.com | redirect-loop | 10594
+ www.redalyc.org:9081 | no-pdf-link | 10515
+ watermark.silverchair.com | terminal-bad-status | 9739
+ www.bmj.com | link-loop | 9389
+ www.repository.naturalis.nl | redirect-loop | 8213
+ bjp.rcpsych.org | link-loop | 8045
+ aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814
+ nph.onlinelibrary.wiley.com | no-pdf-link | 7801
+ iopscience.iop.org | redirect-loop | 7697
+ journals.tubitak.gov.tr | wrong-mimetype | 7159
+ www.biorxiv.org | wrong-mimetype | 7067
+ www.erudit.org | redirect-loop | 6819
+ besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254
+ (30 rows)
+
+Domains to follow-up (eg, sandcrawler ingest tests/tweaks):
+- dialnet.unirioja.es | redirect-loop | 240967
+- www.researchgate.net | redirect-loop | 42859
+- www.redalyc.org:9081 | no-pdf-link | 10515
+- www.repository.naturalis.nl | redirect-loop | 8213
+- bjp.rcpsych.org | link-loop | 8045
+- journals.tubitak.gov.tr | wrong-mimetype | 7159
+- www.erudit.org | redirect-loop | 6819
+
+The dialnet.unirioja.es ones may be worth re-crawling via heritrix?
+
+Top uncrawled domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ---------------------------------+------------+--------
+ ieeexplore.ieee.org | no-capture | 957835
+ link.springer.com | no-capture | 394121
+ www.researchgate.net | no-capture | 376974
+ cyberleninka.ru | no-capture | 376012
+ iopscience.iop.org | no-capture | 348791
+ papers.ssrn.com | no-capture | 286860
+ dergipark.org.tr | no-capture | 217556
+ dialnet.unirioja.es | no-capture | 214398
+ academic.oup.com | no-capture | 212364
+ www.tandfonline.com | no-capture | 148940
+ journals.sagepub.com | no-capture | 144695
+ www.papersearch.net | no-capture | 138986
+ absimage.aps.org | no-capture | 111976
+ apps.dtic.mil | no-capture | 106984
+ www.cambridge.org | no-capture | 97533
+ www.bmj.com | no-capture | 92437
+ bioone.org | no-capture | 87573
+ science.sciencemag.org | no-capture | 75723
+ shodhganga.inflibnet.ac.in:8080 | no-capture | 75395
+ www.jstor.org | no-capture | 73230
+ works.bepress.com | no-capture | 68747
+ www.scielo.org.co | no-capture | 59650
+ hrcak.srce.hr | no-capture | 59332
+ muse.jhu.edu | no-capture | 57828
+ onlinelibrary.wiley.com | no-capture | 55621
+ www.jbc.org | no-capture | 54608
+ www.jstage.jst.go.jp | no-capture | 53631
+ www.redalyc.org | no-capture | 50406
+ lup.lub.lu.se | no-capture | 47469
+ www.dtic.mil | no-capture | 41820
+ (30 rows)
+
+## Heritrix Seedlist Generation
+
+Dump ingest requests (filtering out some domains that we don't expect to crawl
+via heritrix):
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json';
+ => COPY 11714199
+
+ # in sandcrawler pipenv
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json
+
+## Bulk Ingest of Heritrix Content
+
+Small sample:
+
+ head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+ cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ 2020-04-07 12:19 (pacific): 11,703,871
+
+## Post-bulk-ingest
+
+Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need
+to re-try things like cdx-error.
+
+Current status:
+
+ status | count
+ -------------------------------+----------
+ success | 18491799
+ redirect-loop | 1968530
+ no-capture | 1373657
+ no-pdf-link | 1311842
+ link-loop | 1296439
+ terminal-bad-status | 627577
+ cdx-error | 418278
+ wrong-mimetype | 50141
+ wayback-error | 37159
+ petabox-error | 11249
+ null-body | 6295
+ gateway-timeout | 3051
+ spn2-cdx-lookup-failure | 328
+ spn2-error:invalid-url-syntax | 93
+ bad-redirect | 75
+ | 47
+ invalid-host-resolution | 28
+ spn2-error | 10
+ bad-gzip-encoding | 7
+ redirects-exceeded | 2
+ (20 rows)
+
+Lots of cdx-error to retry.
+
+The no-capture links are probably a mix of domain-blocklist and things that
+failed in bulk mode. Will dump and re-attempt them:
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json';
+ => 859849
+
+What domains are these?
+
+ cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30
+
+Let's filter down more:
+
+ cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json
+
+ wc -l mag_nocapture_20200420.rows.filtered.json
+ 423085 mag_nocapture_20200420.rows.filtered.json
+
+Ok, enqueue!
+
+ cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## Final Stats
+
+... for this round of ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+----------
+ success | 18712849
+ redirect-loop | 2008110
+ no-pdf-link | 1337012
+ link-loop | 1326761
+ no-capture | 1030693
+ terminal-bad-status | 637143
+ gateway-timeout | 193194
+ cdx-error | 125907
+ spn2-cdx-lookup-failure | 77842
+ wrong-mimetype | 50882
+ wayback-error | 40278
+ invalid-host-resolution | 35201
+ petabox-error | 11254
+ null-body | 6485
+ spn2-error | 1643
+ spn2-error:job-failed | 747
+ spn2-error:invalid-url-syntax | 325
+ spn2-error:soft-time-limit-exceeded | 190
+ bad-redirect | 77
+ | 47
+ (20 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ domain | status | count
+ ---------------------------------+---------------------+--------
+ ieeexplore.ieee.org | redirect-loop | 677712
+ cyberleninka.ru | link-loop | 308390
+ papers.ssrn.com | link-loop | 281804
+ ieeexplore.ieee.org | link-loop | 273559
+ dialnet.unirioja.es | redirect-loop | 240504
+ dialnet.unirioja.es | terminal-bad-status | 232481
+ onlinelibrary.wiley.com | no-pdf-link | 220932
+ iopscience.iop.org | terminal-bad-status | 172480
+ validate.perfdrive.com | no-pdf-link | 172312
+ link.springer.com | redirect-loop | 130398
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382
+ iopscience.iop.org | redirect-loop | 105234
+ www.bmj.com | link-loop | 100354
+ www.researchgate.net | redirect-loop | 84366
+ www.cambridge.org | link-loop | 83171
+ jamanetwork.com | no-pdf-link | 75053
+ febs.onlinelibrary.wiley.com | no-pdf-link | 74872
+ www.jstor.org | redirect-loop | 72059
+ journals.sagepub.com | no-pdf-link | 63028
+ science.sciencemag.org | redirect-loop | 62927
+ profile.thieme.de | no-pdf-link | 62406
+ cyberleninka.ru | redirect-loop | 56733
+ link.springer.com | link-loop | 47608
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 30180
+ science.sciencemag.org | link-loop | 29908
+ papers.ssrn.com | redirect-loop | 27255
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789
+ www.computer.org | no-pdf-link | 26444
+ watermark.silverchair.com | terminal-bad-status | 25934
+ www.nature.com | redirect-loop | 25306
+ (30 rows)
diff --git a/notes/ingest/2020-03_s2.md b/notes/ingest/2020-03_s2.md
new file mode 100644
index 0000000..fedaba0
--- /dev/null
+++ b/notes/ingest/2020-03_s2.md
@@ -0,0 +1,35 @@
+
+Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these
+ ingested, as well as any previously existing content.
+
+Also, there are a bunch of PDF outlinks to the web; should do S2-specific
+matching and ingest of those.
+
+There are a few categories of paper from pdfs.s.o:
+
+1. we had previous GWB crawl, didn't re-crawl
+2. we had PDF from elsewhere on the web, didn't re-crawl
+3. crawled successfully
+4. crawl failed
+
+In this ingest, want to get all of categories 1 and 3. Could try to do this by
+dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl),
+and join that against the ingest request list.
+
+For other random web URLs, can do the usual persist/backfill/recrawl pipeline.
+
+## Create Seedlist
+
+ zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz
+ zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz
+
+ zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list
+ zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list
+
+ zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz
+ zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz
+
+ zcat s2_external_ingestrequest.json.gz | wc -l
+ 41201427
+ zcat s2_hosted_ingestrequest.json.gz | wc -l
+ 23345761
diff --git a/notes/ingest/2020-04-13_covid19.md b/notes/ingest/2020-04-13_covid19.md
new file mode 100644
index 0000000..b442d69
--- /dev/null
+++ b/notes/ingest/2020-04-13_covid19.md
@@ -0,0 +1,73 @@
+
+Want to ensure seedlists from Wanfang and CNKI are captured in wayback.
+
+Wanfang URLs seem normal. Let's just submit them in a single queue via SPNv2.
+ They are heterogeneous after redirect.
+
+CNKI are trickier. The PDF URLs definitely can't be crawled directly... but the
+ info ones probably can, then crawl on to PDF? At least some seem to capture OK.
+
+Need scope and identifiers for ingest requests. Let's do:
+
+ cnki_covid19 / <ident>
+ wanfang_covid19 / <ident>
+
+Source: scrape-covid19
+
+## Commands
+
+ # in sandcrawler pipenv
+ cat ~/code/covid19.fatcat.wiki/extra/scrape/cnki_metadata.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json
+ cat ~/code/covid19.fatcat.wiki/extra/scrape/wanfang*.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/wanfang_ingest_request.2020-04-14.json
+
+
+ cat /tmp/wanfang_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4
+ cat /tmp/cnki_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 8
+
+## Status
+
+ SELECT ingest_request.ingest_type,
+ ingest_file_result.status,
+ COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'scrape-covid19'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.status
+ ORDER BY COUNT(*) DESC;
+
+2020-04-15:
+
+ ingest_type | status | count
+ -------------+-------------------------------------+-------
+ pdf | spn2-cdx-lookup-failure | 1588
+ pdf | success | 671
+ pdf | gateway-timeout | 507
+ pdf | no-pdf-link | 181
+ pdf | wayback-error | 30
+ pdf | spn2-error:job-failed | 20
+ pdf | spn2-error | 7
+ pdf | spn2-error:soft-time-limit-exceeded | 3
+ pdf | spn2-error:pending | 2
+ (9 rows)
+
+## Re-Try
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'scrape-covid19'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status != 'no-pdf-link'
+ AND ingest_file_result.status != 'link-loop'
+ ) TO '/grande/snapshots/reingest_covid19.rows.json';
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_covid19.rows.json | shuf > reingest_covid19.json
+
+ cat reingest_covid19.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 9
+
diff --git a/notes/ingest/2020-04_datacite.md b/notes/ingest/2020-04_datacite.md
new file mode 100644
index 0000000..0fc7e67
--- /dev/null
+++ b/notes/ingest/2020-04_datacite.md
@@ -0,0 +1,121 @@
+
+After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many
+of the DOIs are for, eg, datasets, and don't want to waste time on those.
+
+Instead of using full ingest request file from the crawl, will generate a new
+ingest request file using `fatcat_ingest.py` and set that up for bulk crawling.
+
+## Generate Requests
+
+ ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json
+ => Expecting 8905453 release objects in search queries
+ => 8.91M 11:49:50 [ 209 /s]
+ => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453})
+
+## Bulk Ingest
+
+ cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Ingest Stats
+
+Note that this will have a small fraction of non-datacite results mixed in (eg,
+from COVID-19 targeted crawls):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND created >= '2020-04-07'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+---------
+ no-pdf-link | 4646767
+ redirect-loop | 1447229
+ no-capture | 860235
+ success | 849501
+ terminal-bad-status | 174869
+ cdx-error | 159805
+ wayback-error | 18076
+ wrong-mimetype | 11169
+ link-loop | 8410
+ gateway-timeout | 4034
+ spn2-cdx-lookup-failure | 510
+ petabox-error | 339
+ null-body | 251
+ spn2-error | 19
+ spn2-error:job-failed | 14
+ bad-gzip-encoding | 13
+ timeout | 5
+ spn2-error:soft-time-limit-exceeded | 4
+ invalid-host-resolution | 2
+ spn2-error:pending | 1
+ (20 rows)
+
+Top domains/statuses (including success):
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND created >= '2020-04-07'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ---------------------------------------+---------------------+--------
+ ssl.fao.org | no-pdf-link | 862277
+ www.e-periodica.ch | no-pdf-link | 746781
+ www.researchgate.net | redirect-loop | 664524
+ dlc.library.columbia.edu | no-pdf-link | 493111
+ www.die-bonn.de | redirect-loop | 352903
+ figshare.com | no-pdf-link | 319709
+ statisticaldatasets.data-planet.com | no-pdf-link | 309584
+ catalog.paradisec.org.au | redirect-loop | 225396
+ zenodo.org | no-capture | 193201
+ digi.ub.uni-heidelberg.de | no-pdf-link | 184974
+ open.library.ubc.ca | no-pdf-link | 167841
+ zenodo.org | no-pdf-link | 130617
+ www.google.com | no-pdf-link | 111312
+ www.e-manuscripta.ch | no-pdf-link | 79192
+ ds.iris.edu | no-pdf-link | 77649
+ data.inra.fr | no-pdf-link | 69440
+ www.tib.eu | no-pdf-link | 63872
+ www.egms.de | redirect-loop | 53877
+ archaeologydataservice.ac.uk | redirect-loop | 52838
+ d.lib.msu.edu | no-pdf-link | 45297
+ www.e-rara.ch | no-pdf-link | 45163
+ springernature.figshare.com | no-pdf-link | 42527
+ boris.unibe.ch | no-pdf-link | 40816
+ www.research-collection.ethz.ch | no-capture | 40350
+ spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059
+ repository.dri.ie | terminal-bad-status | 32760
+ othes.univie.ac.at | no-pdf-link | 32558
+ repositories.lib.utexas.edu | no-capture | 31526
+ posterng.netkey.at | no-pdf-link | 30315
+ zenodo.org | terminal-bad-status | 29614
+ (30 rows)
+
diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md
new file mode 100644
index 0000000..a5e3bb1
--- /dev/null
+++ b/notes/ingest/2020-04_unpaywall.md
@@ -0,0 +1,312 @@
+
+A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but
+ was not released for more than a month).
+
+Primary goal is:
+
+- generate ingest requests for only *new* URLs
+- bulk ingest these new URLs
+- crawl any no-capture URLs from that batch
+- re-bulk-ingest the no-capture batch
+- analytics on failed ingests. eg, any particular domains that are failing to crawl
+
+This ingest pipeline was started on 2020-04-07 by bnewbold.
+
+Ran through the first two steps again on 2020-05-03 after unpaywall had
+released another dump (dated 2020-04-27).
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
+ => 24.7M 5:17:03 [ 1.3k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => 24.7M
+ => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
+
+Second time:
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json
+ => 25.2M 3:16:28 [2.14k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390})
+
+
+## Dump new URLs and Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
+ => 3696189
+
+ WARNING: forgot to transform from rows to ingest requests.
+
+ cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Second time:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json';
+ => 1799760
+
+ WARNING: forgot to transform from rows to ingest requests.
+
+ cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Dump no-capture, Run Crawl
+
+Make two ingest request dumps: one with "all" URLs, which we will have heritrix
+attempt to crawl, and then one with certain domains filtered out, which we may
+or may not bother trying to ingest (due to expectation of failure).
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json';
+ => 2734145
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+ => 2602408
+
+NOTE: forgot here to transform from "rows" to ingest requests.
+
+Not actually a very significant size difference after all.
+
+See `journal-crawls` repo for details on seedlist generation and crawling.
+
+## Re-Ingest Post-Crawl
+
+NOTE: if we *do* want to do cleanup eventually, could look for fatcat edits
+between 2020-04-01 and 2020-05-25 which have limited "extra" metadata (eg, no
+evidence or `oa_status`).
+
+The earlier bulk ingests were done wrong (forgot to transform from rows to full
+ingest request docs), so going to re-do those, which should be a superset of
+ the nocapture crawl URLs:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-04-08.json
+ => 1.26M 0:00:58 [21.5k/s]
+ => previously: 3,696,189
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-05-03.json
+ => 1.26M 0:00:56 [22.3k/s]
+
+ Crap, looks like the 2020-04-08 segment got overwritten with 2020-05 data by
+accident. Hrm... need to re-ingest *all* recent unpaywall URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ ) TO '/grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json';
+ => COPY 5691106
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json
+ => 5.69M 0:04:26 [21.3k/s]
+
+Start small:
+
+ cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Looks good (whew), run the full thing:
+
+ cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Post-ingest stats (2020-08-28)
+
+Overall status:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 22063013
+ no-pdf-link | 2192606
+ redirect-loop | 1471135
+ terminal-bad-status | 995106
+ no-capture | 359440
+ cdx-error | 358909
+ wrong-mimetype | 111685
+ wayback-error | 50705
+ link-loop | 29359
+ null-body | 13667
+ gateway-timeout | 3689
+ spn2-cdx-lookup-failure | 1229
+ petabox-error | 1007
+ redirects-exceeded | 747
+ invalid-host-resolution | 464
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ bad-redirect | 26
+ spn2-error:soft-time-limit-exceeded | 9
+ bad-gzip-encoding | 5
+ (20 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ -----------------------------------+---------------------+--------
+ academic.oup.com | no-pdf-link | 415441
+ watermark.silverchair.com | terminal-bad-status | 345937
+ www.tandfonline.com | no-pdf-link | 262488
+ journals.sagepub.com | no-pdf-link | 235707
+ onlinelibrary.wiley.com | no-pdf-link | 225876
+ iopscience.iop.org | terminal-bad-status | 170783
+ www.nature.com | redirect-loop | 145522
+ www.degruyter.com | redirect-loop | 131898
+ files-journal-api.frontiersin.org | terminal-bad-status | 126091
+ pubs.acs.org | no-pdf-link | 119223
+ society.kisti.re.kr | no-pdf-link | 112401
+ www.ahajournals.org | no-pdf-link | 105953
+ dialnet.unirioja.es | terminal-bad-status | 96505
+ www.cell.com | redirect-loop | 87560
+ www.ncbi.nlm.nih.gov | redirect-loop | 49890
+ ageconsearch.umn.edu | redirect-loop | 45989
+ ashpublications.org | no-pdf-link | 45833
+ pure.mpg.de | redirect-loop | 45278
+ www.degruyter.com | terminal-bad-status | 43642
+ babel.hathitrust.org | terminal-bad-status | 42057
+ osf.io | redirect-loop | 41119
+ scialert.net | no-pdf-link | 39009
+ dialnet.unirioja.es | redirect-loop | 38839
+ www.jci.org | redirect-loop | 34209
+ www.spandidos-publications.com | redirect-loop | 33167
+ www.journal.csj.jp | no-pdf-link | 30915
+ journals.openedition.org | redirect-loop | 30409
+ www.valueinhealthjournal.com | redirect-loop | 30090
+ dergipark.org.tr | no-pdf-link | 29146
+ journals.ametsoc.org | no-pdf-link | 29133
+ (30 rows)
+
+Enqueue internal failures for re-ingest:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/unpaywall_errors_2020-08-28.rows.json';
+ => 409606
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_errors_2020-08-28.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_errors_2020-08-28.requests.json
+
+ cat /grande/snapshots/unpaywall_errors_2020-08-28.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+And after *that* (which ran quickly):
+
+ status | count
+ -------------------------------------+----------
+ success | 22281874
+ no-pdf-link | 2258352
+ redirect-loop | 1499251
+ terminal-bad-status | 1004781
+ no-capture | 401333
+ wrong-mimetype | 112068
+ cdx-error | 32259
+ link-loop | 30137
+ null-body | 13886
+ wayback-error | 11653
+ gateway-timeout | 3689
+ spn2-cdx-lookup-failure | 1229
+ petabox-error | 1036
+ redirects-exceeded | 749
+ invalid-host-resolution | 464
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ bad-redirect | 26
+ spn2-error:soft-time-limit-exceeded | 9
+ bad-gzip-encoding | 5
+ (20 rows)
+
+22063013 -> 22281874 = + 218,861 success, not bad!
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..fe22c75
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,428 @@
+
+Primary Goal: start large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest:
+ - oai identifier
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, quick check how many are in chocula/fatcat
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+ => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+ zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+ => 16,013,503
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+ => 11,940,950
+
+## Transform, Load, Bulk Ingest
+
+ zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+ => 80M 6:36:55 [3.36k/s]
+
+ time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 80M 4:00:21 [5.55k/s]
+ => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+ => real 240m21.207s
+ => user 85m12.576s
+ => sys 3m29.580s
+
+ select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+ => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+ zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+ => 51,185,088
+
+ zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+ wc -l request_url.txt
+ => 50,002,674 request_url.txt
+
+ zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+ wc -l requires_oai.txt
+ => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
+How many of these are URLs we have seen and ingested already?
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ | 49491452
+ success | 1469113
+ no-capture | 134611
+ redirect-loop | 59666
+ no-pdf-link | 8947
+ cdx-error | 7561
+ terminal-bad-status | 6704
+ null-body | 5042
+ wrong-mimetype | 879
+ wayback-error | 722
+ petabox-error | 198
+ gateway-timeout | 86
+ link-loop | 51
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ spn2-error | 4
+ bad-gzip-encoding | 4
+ spn2-error:job-failed | 2
+ (18 rows)
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/oai_noingest_20200506.rows.json';
+ => COPY 49491452
+
+ WARNING: should have transformed from rows to requests here
+
+ cat /grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Crawl and re-ingest
+
+Updated stats after ingest (NOTE: ingest requests not really formed correctly,
+but doesn't matter because fatcat wasn't importing these anyways):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ no-capture | 42565875
+ success | 5227609
+ no-pdf-link | 2156341
+ redirect-loop | 559721
+ cdx-error | 260446
+ wrong-mimetype | 148871
+ terminal-bad-status | 109725
+ link-loop | 92792
+ null-body | 30688
+ | 15287
+ petabox-error | 11109
+ wayback-error | 6261
+ skip-url-blocklist | 184
+ gateway-timeout | 86
+ bad-gzip-encoding | 25
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ bad-redirect | 15
+ spn2-error | 4
+ spn2-error:job-failed | 2
+ (20 rows)
+
+Dump again for crawling:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error')
+ ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json';
+
+Notes about crawl setup are in `journal-crawls` repo. Excluded the following domains:
+
+ 4876135 www.kb.dk REMOVE: too large and generic
+ 3110009 kb-images.kb.dk REMOVE: dead?
+ 1274638 mdz-nbn-resolving.de REMOVE: maybe broken
+ 982312 aggr.ukm.um.si REMOVE: maybe broken
+
+And went from about 42,826,313 rows to 31,773,874 unique URLs to crawl, so
+expecting at least 11,052,439 `no-capture` ingest results (and should probably
+filter for these or even delete from the ingest request table).
+
+Ingest progress:
+
+ 2020-08-05 14:02: 32,571,018
+ 2020-08-06 13:49: 31,195,169
+ 2020-08-07 10:11: 29,986,169
+ 2020-08-10 10:43: 26,497,196
+ 2020-08-12 11:02: 23,811,845
+ 2020-08-17 13:34: 19,460,502
+ 2020-08-20 09:49: 15,069,507
+ 2020-08-25 09:56: 9,397,035
+ 2020-09-02 15:02: 305,889 (72k longest queue)
+ 2020-09-03 14:30: done
+
+## Post-ingest stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ no-capture | 16804277
+ no-pdf-link | 14895249
+ success | 13898603
+ redirect-loop | 2709730
+ cdx-error | 827024
+ terminal-bad-status | 740037
+ wrong-mimetype | 604242
+ link-loop | 532553
+ null-body | 95721
+ wayback-error | 41864
+ petabox-error | 19204
+ | 15287
+ gateway-timeout | 510
+ bad-redirect | 318
+ skip-url-blocklist | 184
+ bad-gzip-encoding | 114
+ timeout | 78
+ spn2-cdx-lookup-failure | 59
+ invalid-host-resolution | 19
+ blocked-cookie | 6
+ (20 rows)
+
+Hrm, +8 million or so 'success', but that is a lot of no-capture. May be worth
+ dumping the full kafka result topic, filtering to OAI requests, and extracting the
+missing URLs.
+
+Top counts by OAI prefix:
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 25;
+
+ oai_prefix | success | total
+ --------------------------+---------+---------
+ kb.dk | 0 | 7989412 (excluded)
+ repec | 1118591 | 2783448
+ bnf.fr | 0 | 2187277
+ hispana.mcu.es | 19404 | 1492639
+ bdr.oai.bsb-muenchen.de | 73 | 1319882 (excluded?)
+ hal | 564700 | 1049607
+ ukm.si | 0 | 982468 (excluded)
+ hsp.org | 0 | 810281
+ www.irgrid.ac.cn | 17578 | 748828
+ cds.cern.ch | 72811 | 688091
+ americanae.aecid.es | 69678 | 572792
+ biodiversitylibrary.org | 2121 | 566154
+ juser.fz-juelich.de | 22777 | 518551
+ espace.library.uq.edu.au | 6494 | 508960
+ igi.indrastra.com | 58689 | 478577
+ archive.ugent.be | 63654 | 424014
+ hrcak.srce.hr | 395031 | 414897
+ zir.nsk.hr | 153889 | 397200
+ renati.sunedu.gob.pe | 78399 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7963 | 354529
+ generic.eprints.org | 261221 | 340470
+ invenio.nusl.cz | 6184 | 325867
+ evastar-karlsruhe.de | 62044 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ (25 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ oai_prefix | status | count
+ --------------------------+---------------+---------
+ kb.dk | no-capture | 7955231 (excluded)
+ bdr.oai.bsb-muenchen.de | no-capture | 1270209 (excluded?)
+ repec | success | 1118591
+ hispana.mcu.es | no-pdf-link | 1118092
+ bnf.fr | no-capture | 1100591
+ ukm.si | no-capture | 976004 (excluded)
+ hsp.org | no-pdf-link | 773496
+ repec | no-pdf-link | 625629
+ bnf.fr | no-pdf-link | 607813
+ hal | success | 564700
+ biodiversitylibrary.org | no-pdf-link | 531409
+ cds.cern.ch | no-capture | 529842
+ repec | redirect-loop | 504393
+ juser.fz-juelich.de | no-pdf-link | 468813
+ bnf.fr | redirect-loop | 436087
+ americanae.aecid.es | no-pdf-link | 409954
+ hrcak.srce.hr | success | 395031
+ www.irgrid.ac.cn | no-pdf-link | 362087
+ hal | no-pdf-link | 352111
+ www.irgrid.ac.cn | no-capture | 346963
+ espace.library.uq.edu.au | no-pdf-link | 315302
+ igi.indrastra.com | no-pdf-link | 312087
+ repec | no-capture | 309882
+ invenio.nusl.cz | no-pdf-link | 302657
+ hypotheses.org | no-pdf-link | 298750
+ rour.neicon.ru | redirect-loop | 291922
+ renati.sunedu.gob.pe | no-capture | 276388
+ t2r2.star.titech.ac.jp | no-pdf-link | 264109
+ generic.eprints.org | success | 261221
+ quod.lib.umich.edu | no-pdf-link | 253937
+ (30 rows)
+
+If we remove excluded prefixes, and some large/generic prefixes (bnf.fr,
+hispana.mcu.es, hsp.org), then the aggregate counts are:
+
+ no-capture | 16,804,277 -> 5,502,242
+ no-pdf-link | 14,895,249 -> 12,395,848
+
+Top status by terminal domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ----------------------------------+---------------+--------
+ hispana.mcu.es | no-pdf-link | 709701 (national scope)
+ gallica.bnf.fr | no-pdf-link | 601193 (national scope)
+ discover.hsp.org | no-pdf-link | 524212 (historical)
+ www.biodiversitylibrary.org | no-pdf-link | 479288
+ gallica.bnf.fr | redirect-loop | 435981 (national scope)
+ hrcak.srce.hr | success | 389673
+ hemerotecadigital.bne.es | no-pdf-link | 359243
+ juser.fz-juelich.de | no-pdf-link | 345112
+ espace.library.uq.edu.au | no-pdf-link | 304299
+ invenio.nusl.cz | no-pdf-link | 302586
+ igi.indrastra.com | no-pdf-link | 292006
+ openrepository.ru | redirect-loop | 291555
+ hal.archives-ouvertes.fr | success | 278134
+ t2r2.star.titech.ac.jp | no-pdf-link | 263971
+ bib-pubdb1.desy.de | no-pdf-link | 254879
+ quod.lib.umich.edu | no-pdf-link | 250382
+ encounters.hsp.org | no-pdf-link | 248132
+ americanae.aecid.es | no-pdf-link | 245295
+ www.irgrid.ac.cn | no-pdf-link | 242496
+ publikationen.bibliothek.kit.edu | no-pdf-link | 222041
+ www.sciencedirect.com | no-pdf-link | 211756
+ dialnet.unirioja.es | redirect-loop | 203615
+ edoc.mpg.de | no-pdf-link | 195526
+ bibliotecadigital.jcyl.es | no-pdf-link | 184671
+ hal.archives-ouvertes.fr | no-pdf-link | 183809
+ www.sciencedirect.com | redirect-loop | 173439
+ lup.lub.lu.se | no-pdf-link | 165788
+ orbi.uliege.be | no-pdf-link | 158313
+ www.erudit.org | success | 155986
+ lib.dr.iastate.edu | success | 153384
+ (30 rows)
+
+Follow-ups are TBD but could include:
+- crawling the ~5m no-capture links directly (eg, not `base_url`) from the
+ ingest result JSON, while retaining the ingest request for later re-ingest
+- investigating and iterating on PDF link extraction, both for large platforms
+ and randomly sampled from long tail
+- classifying OAI prefixes by type (subject repository, institutional
+ repository, journal, national-library, historical docs, greylit, law, etc)
+- running pdftrio over some/all of this corpus
diff --git a/notes/ingest/2020-05_pubmed.md b/notes/ingest/2020-05_pubmed.md
new file mode 100644
index 0000000..36d00a1
--- /dev/null
+++ b/notes/ingest/2020-05_pubmed.md
@@ -0,0 +1,10 @@
+
+From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1.
+
+Test small batch:
+
+ zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Run the whole batch:
+
+ zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2020-07_mag.md b/notes/ingest/2020-07_mag.md
new file mode 100644
index 0000000..1d33162
--- /dev/null
+++ b/notes/ingest/2020-07_mag.md
@@ -0,0 +1,353 @@
+
+Using 2020-06-25 upstream MAG corpus snapshot.
+
+Ran munging from `scratch:ingest/mag` notes first.
+
+Expecting a couple million new ingest request URLs; this is the first "patch"
+MAG ingest on top of existing already-run requests.
+
+Planning to skip the initial bulk ingest step, on the assumption that new URLs
+have either been ingested already (eg, via continuous ingest pipeline) or need
+crawling.
+
+## Generate Requests
+
+ export LC_ALL=C
+ cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 | pv -l > ingest_requests_mag-2020-06-25.json
+ => 28.7M 2:36:48 [3.06k/s]
+
+ export LC_ALL=C
+ zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 --pmid | pv -l > ingest_requests_mag-2020-06-25.pmid.json
+ => 5.66M 0:29:28 [ 3.2k/s]
+
+## Persist Ingest Requests
+
+ # small sample
+ head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 319, 'update-requests': 0})
+
+ head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request -
+ Worker: Counter({'total': 1000, 'insert-requests': 304, 'update-requests': 0})
+
+ cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 5662486, 'insert-requests': 1984605, 'update-requests': 0})
+
+ cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 28743819, 'insert-requests': 7433465, 'update-requests': 0})
+
+## Crawl/Dupe Status
+
+Overall status for old and new seeds, filtering out large (blocking)
+publishers:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 19477651
+ | 8238898
+ redirect-loop | 2036494
+ link-loop | 1330036
+ no-pdf-link | 1304820
+ terminal-bad-status | 648150
+ no-capture | 545785
+ gateway-timeout | 200143
+ cdx-error | 149995
+ spn2-cdx-lookup-failure | 80010
+ wrong-mimetype | 57052
+ wayback-error | 41032
+ invalid-host-resolution | 37203
+ petabox-error | 11167
+ null-body | 6662
+ spn2-error | 1698
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 77
+ (20 rows)
+
+Just the new seeds:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.created > '2020-06-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+---------
+ | 8238851
+ success | 787174
+ no-capture | 42864
+ redirect-loop | 31718
+ terminal-bad-status | 31493
+ no-pdf-link | 13025
+ cdx-error | 11275
+ wrong-mimetype | 6238
+ link-loop | 3365
+ wayback-error | 748
+ gateway-timeout | 506
+ null-body | 191
+ spn2-cdx-lookup-failure | 99
+ petabox-error | 89
+ invalid-host-resolution | 70
+ spn2-error | 7
+ spn2-error:job-failed | 2
+ spn2-error:soft-time-limit-exceeded | 1
+ bad-gzip-encoding | 1
+ (19 rows)
+
+Where are no-capture results terminating? May need to add or update heritrix
+crawl config so that we get better yield without needing to do SPNv2 crawling.
+
+ SELECT initial_domain, terminal_domain, COUNT(*)
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS initial_domain,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS terminal_domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ GROUP BY initial_domain, terminal_domain
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ initial_domain | terminal_domain | count
+ ---------------------------------+---------------------+--------
+ www.researchgate.net | | 334145
+ academic.oup.com | | 205820
+ www.tandfonline.com | | 148638
+ journals.sagepub.com | | 144196
+ muse.jhu.edu | | 55957
+ hrcak.srce.hr | | 25317
+ www.omicsonline.org | | 22426
+ link.springer.com | | 21044
+ iopscience.iop.org | | 12385
+ bioone.org | | 9097
+ tandfonline.com | | 8512
+ or.nsfc.gov.cn | | 4823
+ ieeexplore.ieee.org | ieeexplore.ieee.org | 4398
+ pubs.acs.org | | 3708
+ archive-ouverte.unige.ch | | 2743
+ dergipark.ulakbim.gov.tr | | 2677
+ hal.archives-ouvertes.fr | | 1258
+ dergipark.org.tr | | 1207
+ apo.org.au | | 1186
+ spire.sciencespo.fr | | 989
+ cyberleninka.ru | | 895
+ lirias.kuleuven.be | | 855
+ tel.archives-ouvertes.fr | | 786
+ pub.uni-bielefeld.de | | 728
+ www.research-collection.ethz.ch | | 670
+ (25 rows)
+
+## Heritrix Seedlist Generation
+
+Dump ingest requests (filtering out some domains that we don't expect to crawl
+via heritrix):
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status IS NULL)
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200708.rows.json';
+ => 8784683
+
+ # in sandcrawler pipenv
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200708.rows.json > /grande/snapshots/mag_nocapture_20200708.json
+
+Seedlist transform from here on covered in MAG crawl notes.
+
+## Bulk Ingest
+
+Run ingest requests on everything we crawled:
+
+ cat /grande/snapshots/mag_nocapture_20200708.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Small sample:
+
+ head -n1000 /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+ cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Updated Overall Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 24574294
+ redirect-loop | 2633731
+ no-capture | 2458694
+ no-pdf-link | 1896871
+ link-loop | 1510899
+ terminal-bad-status | 878821
+ cdx-error | 387574
+ gateway-timeout | 200246
+ | 170304
+ wayback-error | 97572
+ spn2-cdx-lookup-failure | 80284
+ wrong-mimetype | 65097
+ invalid-host-resolution | 37204
+ petabox-error | 12097
+ null-body | 8549
+ spn2-error | 1706
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 90
+ (20 rows)
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 24557382
+ redirect-loop | 2630582
+ no-capture | 1947066
+ no-pdf-link | 1778206
+ link-loop | 1510790
+ terminal-bad-status | 857173
+ cdx-error | 384525
+ gateway-timeout | 200143
+ wayback-error | 96390
+ spn2-cdx-lookup-failure | 80010
+ wrong-mimetype | 64908
+ invalid-host-resolution | 37203
+ petabox-error | 12087
+ null-body | 8548
+ spn2-error | 1698
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 90
+ | 69
+ (20 rows)
+
+Just the new seeds:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.created > '2020-06-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+---------
+ success | 5860601
+ no-capture | 1489959
+ redirect-loop | 619121
+ no-pdf-link | 473703
+ terminal-bad-status | 234753
+ cdx-error | 231575
+ link-loop | 184093
+ wayback-error | 56068
+ wrong-mimetype | 14046
+ null-body | 2068
+ petabox-error | 1006
+ gateway-timeout | 506
+ spn2-cdx-lookup-failure | 99
+ invalid-host-resolution | 70
+ | 22
+ bad-redirect | 13
+ spn2-error | 7
+ timeout | 3
+ spn2-error:job-failed | 2
+ spn2-error:soft-time-limit-exceeded | 1
+ (20 rows)
+
diff --git a/notes/ingest/2020-08_daily_improvements.md b/notes/ingest/2020-08_daily_improvements.md
new file mode 100644
index 0000000..da57065
--- /dev/null
+++ b/notes/ingest/2020-08_daily_improvements.md
@@ -0,0 +1,202 @@
+
+Goal is to increase rate of successful daily changelog crawling, but reduce
+wasted attempts.
+
+Status by domain, past 30 days:
+
+ domain | status | count
+ --------------------------------------+-----------------+-------
+ arxiv.org | success | 21792
+ zenodo.org | success | 10646
+ res.mdpi.com | success | 10449
+ springernature.figshare.com | no-pdf-link | 10430
+ s3-eu-west-1.amazonaws.com | success | 8966
+ zenodo.org | no-pdf-link | 8137
+ hkvalidate.perfdrive.com | no-pdf-link | 5943
+ www.ams.org:80 | no-pdf-link | 5799
+ assets.researchsquare.com | success | 4651
+ pdf.sciencedirectassets.com | success | 4145
+ fjfsdata01prod.blob.core.windows.net | success | 3500
+ sage.figshare.com | no-pdf-link | 3174
+ onlinelibrary.wiley.com | no-pdf-link | 2869
+ www.e-periodica.ch | no-pdf-link | 2709
+ revistas.uned.es | success | 2631
+ figshare.com | no-pdf-link | 2500
+ www.sciencedirect.com | link-loop | 2477
+ linkinghub.elsevier.com | gateway-timeout | 1878
+ downloads.hindawi.com | success | 1819
+ www.scielo.br | success | 1691
+ jps.library.utoronto.ca | success | 1590
+ www.ams.org | no-pdf-link | 1568
+ digi.ub.uni-heidelberg.de | no-pdf-link | 1496
+ research-repository.griffith.edu.au | success | 1412
+ journals.plos.org | success | 1330
+ (25 rows)
+
+Status by DOI prefix, past 30 days:
+
+ doi_prefix | status | count
+ ------------+-------------------------+-------
+ 10.6084 | no-pdf-link | 14410 <- figshare; small fraction success
+ 10.6084 | success | 4007
+ 10.6084 | cdx-error | 1746
+
+ 10.13140 | gateway-timeout | 9689 <- researchgate
+ 10.13140 | cdx-error | 4154
+
+ 10.5281 | success | 9408 <- zenodo
+ 10.5281 | no-pdf-link | 6079
+ 10.5281 | cdx-error | 3200
+ 10.5281 | wayback-error | 2098
+
+ 10.1090 | no-pdf-link | 7420 <- AMS (ams.org)
+
+ 10.3390 | success | 6599 <- MDPI
+ 10.3390 | cdx-error | 3032
+ 10.3390 | wayback-error | 1636
+
+ 10.1088 | no-pdf-link | 3227 <- IOP science
+
+ 10.1101 | gateway-timeout | 3168 <- Cold Spring Harbor: press, biorxiv, medrxiv, etc
+ 10.1101 | cdx-error | 1147
+
+ 10.21203 | success | 3124 <- researchsquare
+ 10.21203 | cdx-error | 1181
+
+ 10.1016 | success | 3083 <- elsevier
+ 10.1016 | cdx-error | 2465
+ 10.1016 | gateway-timeout | 1682
+ 10.1016 | wayback-error | 1567
+
+ 10.25384 | no-pdf-link | 3058 <- sage figshare
+ 10.25384 | success | 2456
+
+ 10.1007 | gateway-timeout | 2913 <- springer
+ 10.1007 | cdx-error | 1164
+
+ 10.5944 | success | 2831
+ 10.1186 | success | 2650
+ 10.5169 | no-pdf-link | 2644 <- www.e-periodica.ch
+ 10.3389 | success | 2279
+ 10.24411 | gateway-timeout | 2184 <- cyberleninka.ru
+ 10.1038 | gateway-timeout | 2143 <- nature group
+ 10.1177 | gateway-timeout | 2038 <- SAGE
+ 10.11588 | no-pdf-link | 1574 <- journals.ub.uni-heidelberg.de (OJS?)
+ 10.25904 | success | 1416
+ 10.1155 | success | 1304
+ 10.21994 | no-pdf-link | 1268 <- loar.kb.dk
+ 10.18720 | spn2-cdx-lookup-failure | 1232 <- elib.spbstu.ru
+ 10.24411 | cdx-error | 1202
+ 10.1055 | no-pdf-link | 1170 <- thieme-connect.de
+ (40 rows)
+
+code changes for ingest:
+x hkvalidate.perfdrive.com: just bail when we see this
+x skip large publishers which gateway-timeout (for now)
+ - springerlink (10.1007)
+ - nature group (10.1038)
+ - SAGE (10.1177)
+ - IOP (10.1088)
+
+fatcat:
+x figshare (by `doi_prefix`): if not versioned (suffix), skip crawl
+x zenodo: also try to not crawl if unversioned (group)
+x figshare import metadata
+
+sandcrawler:
+x ends with `cookieAbsent` or `cookieSet=1` -> status as cookie-blocked
+x https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist
+x verify that we do quick-get for arxiv.org + europepmc.org (+ figshare/zenodo?)
+ => we were not!
+x shorten post-SPNv2 CDX pause? for throughput, given that we are re-trying anyways
+x ensure that we store uncrawled URL somewhere on no-capture status
+ => in HTML or last of hops
+ => not in DB, but that is a bigger change
+
+- try to get un-blocked:
+ - Cold Spring Harbor has been blocking since 2020-06-22? yikes!
+ - cyberleninka.ru
+ - arxiv.org
+
+- no-pdf-link
+ x www.ams.org (10.1090)
+ => these seem to be stale captures, eg from 2008. newer captures have citation_pdf_url
+ => should consider recrawling all of ams.org?
+ => not sure why these crawl requests are happening only now
+ => on the order of 15k OA articles not in ia; 43k total not preserved
+ => force recrawl OA subset (DONE)
+ x www.e-periodica.ch (10.5169)
+ => TODO: dump un-preserved URLs, transform to PDF urls, heritrix crawl, re-ingest
+ x digi.ub.uni-heidelberg.de (10.11588)
+ => TODO: bulk re-enqueue? then heritrix crawl?
+ - https://loar.kb.dk/handle/1902/6988 (10.21994)
+ => TODO: bulk re-enqueue
+ => site was updated recently (august 2020); now it crawls fine. need to re-ingest all?
+ => 7433 hits
+ - thieme-connect.de (10.1055)
+ => 600k+ missing
+ => TODO: bulk re-enqueue? then heritrix crawl?
+ => https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist
+ => generally just need to re-crawl all?
+
+Unresolved:
+- why so many spn2-errors on https://elib.spbstu.ru/ (10.18720)?
+
+## figshare
+
+10.6084 regular figshare
+10.25384 SAGE figshare
+
+For sage, "collections" are bogus? can we detect these in datacite metadata?
+
+If figshare types like:
+
+ ris: "GEN",
+ bibtex: "misc",
+ citeproc: "article",
+ schemaOrg: "Collection",
+ resourceType: "Collection",
+ resourceTypeGeneral: "Collection"
+
+then mark as 'stub'.
+
+"Additional file" items don't seem like "stub"; -> "component".
+
+title:"Figure {} from " -> component
+
+current types are mostly: article, stub, dataset, graphic, article-journal
+
+If DOI starts with "sage.", then publisher is "Sage" (not figshare). Container
+name should be... sage.figshare.com?
+
+set version to the version from DOI
+
+## zenodo
+
+doi_prefix: 10.5281
+
+if on zenodo, and has an "Identical to" relation, then this is a pre-print. in
+that case, drop container_id and set container_name to zenodo.org. *But*, there
+are some journals now publishing exclusively to zenodo.org, so retain that
+metadata. examples:
+
+ "Detection of keyboard vibrations and effects on perceived piano quality"
+ https://fatcat.wiki/release/mufzkdgt2nbzfha44o7p7gkrpy
+
+ "Editing LAF: Educate, don't defend!"
+ https://zenodo.org/record/2583025
+
+version number not available in zenodo metadata
+
+## Gitlab MR Notes
+
+The main goal of this group of changes is to do a better job at daily ingest.
+
+Currently we have on the order of 20k new releases added to the index every day, and about half of them are marked as OA (either CC license or via container being in DOAJ or ROAD), pass some filters (eg, release_type), and are selected for ingest. Of those, about half fail to crawl to fulltext, generally due to blocking (gateway-timeout, cookie tests, anti-bot detection, loginwall, etc). On the other hand, we don't attempt to crawl lots of "bronze" OA, which is content that is available from the publisher website, but isn't marked explicitly OA.
+
+Based on investigating daily crawling from the past month (will commit these notes to sandcrawler soon), I have identified some DOI prefixes that almost always fail ingest via SPNv2. I also have some patches to sandcrawler ingest to improve ability to crawl some large repositories etc.
+
+Some of the biggest "OA but failed to crawl" sources are figshare and zenodo, which register a relatively large fraction of daily OA DOIs. We want to crawl most of that content, but both of these platforms register at least two DOIs for each piece of content (a "group" DOI and a "versioned" DOI), and we only need to crawl one. There were also some changes needed to release-type filtering and assignment specific to these platforms, or based on the title of entities.
+
+This MR mixes changes to the datacite metadata import routine (including some refactors out of the main parse_record method) and behavior changes to the entity updater (which is where the code to decide whether to send an ingest request on release creation lives). I will have a separate MR for importer metadata changes that don't impact ingest behavior.
+
diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md
new file mode 100644
index 0000000..f5c853d
--- /dev/null
+++ b/notes/ingest/2020-09_oa_doi.md
@@ -0,0 +1,352 @@
+
+It seems that many gold OA DOIs were not ingesting simply because the HTML
+URL extraction was not working for a particular version of OJS.
+
+Let's re-try all ~2.5 million of these in bulk mode and see how many are
+'no-capture' vs. other errors, then possibly re-crawl a large number.
+
+## Bulk Ingest
+
+Dump ingest requests
+
+ ./fatcat_ingest.py query 'is_oa:true preservation:none !arxiv_id:* !pmcid:* !publisher_type:big5 type:article-journal' | pv -l > /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json
+ Expecting 2569876 release objects in search queries
+ Counter({'elasticsearch_release': 2569880, 'estimate': 2569880, 'ingest_request': 2063034})
+
+Enqueue
+
+ cat /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Started at about:
+
+ Thu Sep 17 00:15:00 UTC 2020
+ 2020-09-17T00:15:00Z
+
+## Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND ingest_file_result.updated >= '2020-09-16'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -------------------------------------+--------
+ no-capture | 513462
+ success | 206042
+ no-pdf-link | 186779
+ terminal-bad-status | 40372
+ redirect-loop | 33103
+ cdx-error | 24078
+ link-loop | 13494
+ spn2-cdx-lookup-failure | 10247
+ gateway-timeout | 4407
+ wrong-mimetype | 3213
+ petabox-error | 866
+ null-body | 449
+ spn2-error | 217
+ wayback-error | 129
+ spn2-error:job-failed | 64
+ bad-redirect | 6
+ spn2-error:soft-time-limit-exceeded | 1
+ (17 rows)
+
+This was only about half the requests. Try... broader?
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -------------------------------------+--------
+ no-capture | 579952
+ success | 387325
+ no-pdf-link | 380406
+ terminal-bad-status | 63743
+ redirect-loop | 53893
+ cdx-error | 46024
+ spn2-cdx-lookup-failure | 28347
+ link-loop | 22573
+ gateway-timeout | 11686
+ wrong-mimetype | 6294
+ null-body | 3509
+ petabox-error | 2388
+ spn2-error | 1023
+ spn2-error:job-failed | 462
+ wayback-error | 347
+ spn2-error:soft-time-limit-exceeded | 20
+ bad-redirect | 11
+ (17 rows)
+
+What are the top domains for those `no-pdf-link` (or similar) results?
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ------------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 56488
+ figshare.com | no-pdf-link | 55337
+ www.egms.de | redirect-loop | 22686
+ zenodo.org | terminal-bad-status | 22128
+ tandf.figshare.com | no-pdf-link | 20027
+ springernature.figshare.com | no-pdf-link | 17181
+ cairn.info | terminal-bad-status | 13836
+ www.persee.fr | terminal-bad-status | 7565
+ projecteuclid.org | link-loop | 7449
+ www.cairn.info | no-pdf-link | 6992
+ scialert.net | no-pdf-link | 6621
+ www.cairn.info | link-loop | 5870
+ utpjournals.press | no-pdf-link | 5772
+ journals.openedition.org | redirect-loop | 5464
+ www.egms.de | no-pdf-link | 5223
+ archaeologydataservice.ac.uk | no-pdf-link | 4881
+ rs.figshare.com | no-pdf-link | 4773
+ www.degruyter.com | spn2-cdx-lookup-failure | 4763
+ koreascience.or.kr | no-pdf-link | 4487
+ cancerres.aacrjournals.org | no-pdf-link | 4124
+ cms.math.ca | no-pdf-link | 3441
+ volcano.si.edu | no-pdf-link | 3424
+ www.mathnet.ru | no-pdf-link | 3229
+ tidsskriftet.no | no-pdf-link | 3012
+ journals.plos.org | no-pdf-link | 3005
+ tudigit.ulb.tu-darmstadt.de | no-pdf-link | 2796
+ www.cairn.info:80 | link-loop | 2647
+ hammer.figshare.com | no-pdf-link | 2627
+ www.psychosocial.com | no-pdf-link | 2457
+ osf.io | terminal-bad-status | 2388
+ (30 rows)
+
+Should look at link extraction for:
+
+- scialert.net
+- utpjournals.press
+- koreascience.or.kr
+- cancerres.aacrjournals.org
+- cms.math.ca
+- volcano.si.edu
+- www.mathnet.ru
+- www.psychosocial.com
+
+## Re-Ingest
+
+Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ AND ingest_file_result.status = 'no-capture'
+ -- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ ) TO '/grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json';
+ => COPY 579952
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json | pv -l | shuf > /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json
+ => 579k 0:00:22 [25.9k/s]
+
+ cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Resuming progress on this in early December 2020.
+
+Filtered requests to re-crawl:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20')
+ OR (ingest_file_result.updated >= '2020-10-11'))
+ AND ingest_file_result.status != 'success'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json';
+ => COPY 2352614
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | pv -l > /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | jq -r .base_url | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt
+
+ wc -l /grande/snapshots/oa_doi_seedlist_2020-12-08.*.txt
+ 2352614 /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt
+ 481910 /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt
+
+Top DOI prefixes (same old usual suspects):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg "://doi.org/" | cut -f4 -d/ | sort | uniq -c | sort -nr | head -n20
+ 353695 10.5281 zenodo.org
+ 121888 10.6084 figshare.com
+ 115093 10.3917 cairn.info
+ 113252 10.3406 persee.fr
+ 95414 10.1515 degruyter.com
+ 90448 10.4324 taylorfrancis.com
+ 83927 10.1016 elsevier
+ 60303 10.1109 IEEE
+ 48490 10.4000 openedition.org
+ 28498 10.3205 egms.de
+ 23433 10.1163 brill.com
+ 23276 10.17615 cdr.lib.unc.edu
+ 21386 10.1093 oup.com
+ 20783 10.3138 utpjournals.press
+ 19987 10.1201 tandfonline.com
+ 17916 10.34847 cocoon.huma-num.fr
+ 16970 10.1002 wiley.com
+ 15958 10.1097 lww.com (and others?)
+ 15835 10.1017 cambridge.org
+ 15466 10.24355 publikationsserver.tu-braunschweig.de (IR)
+
+Top domains (not doi.org):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg -v "://doi.org/" | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 104148 zenodo.org
+ 85245 www.persee.fr
+ 52931 www.cairn.info
+ 4791 www.jstage.jst.go.jp
+ 4411 archive.monthlyreview.org
+ 4129 osf.io
+ 2841 www.indianjournals.com
+ 2746 www.impan.pl
+ 2620 platform.almanhal.com
+ 2019 www.nomos-elibrary.de
+ 1209 dergipark.org.tr
+ 1027 pubs.geoscienceworld.org
+ 973 www.pdcnet.org
+ 923 www.hanspub.org
+ 914 www.repository.cam.ac.uk
+ 863 mediarep.org
+ 812 www.cartographicperspectives.org
+ 687 www.degruyter.com
+ 578 192.168.7.24
+ 566 journals.eco-vector.com
+
+TODO: infer `publisher_type` and platform from DOI prefix in more cases
+
+## Re-Ingest
+
+Crawl has completed. Starting this bulk ingest on 2020-12-31; roughly 2.3
+million requests. Note these are all `pdf` requests, but crawl was done in an
+HTML-friendly way, so should be able to do domain/journal-specific HTML ingests
+in the future.
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Stats, for this ingest period (fuzzy; will have some daily ingest stuff):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-12-28'
+ AND ingest_request.created <= '2020-12-09'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -----------------------+--------
+ no-pdf-link | 962714
+ success | 539305
+ no-capture | 306590
+ redirect-loop | 192149
+ link-loop | 184797
+ terminal-bad-status | 141721
+ wrong-mimetype | 10362
+ null-body | 10277
+ skip-url-blocklist | 1985
+ wayback-content-error | 1300
+ cdx-error | 869
+ petabox-error | 160
+ bad-redirect | 72
+ wayback-error | 46
+ bad-gzip-encoding | 7
+ timeout | 1
+ max-hops-exceeded | 1
+ (17 rows)
+
diff --git a/notes/ingest/2020-09_reingest.md b/notes/ingest/2020-09_reingest.md
new file mode 100644
index 0000000..ec4e536
--- /dev/null
+++ b/notes/ingest/2020-09_reingest.md
@@ -0,0 +1,197 @@
+
+Goal: re-bulk-ingest some older existing crawls which hung on errors like
+`cdx-error` or `wayback-error`, indicating that ingest might actually succeed
+on retry.
+
+Sources:
+- unpaywall (again)
+- doi (ingest, changelog, etc)
+- mag
+- oai
+
+## DOI
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ status | count
+ -------------------------------------+---------
+ no-pdf-link | 8304582
+ success | 3461708
+ no-capture | 1881269
+ redirect-loop | 1851541
+ gateway-timeout | 355820
+ cdx-error | 341848
+ terminal-bad-status | 328650
+ skip-url-blocklist | 220474
+ spn2-cdx-lookup-failure | 125521
+ link-loop | 109352
+ wayback-error | 101525
+ null-body | 73539
+ wrong-mimetype | 53151
+ spn-error | 13579
+ spn2-error | 6848
+ spn2-error:job-failed | 4381
+ spn-remote-error | 4180
+ other-mimetype | 2305
+ petabox-error | 904
+ timeout | 710
+ spn2-error:soft-time-limit-exceeded | 557
+ spn2-error:proxy-error | 437
+ spn2-error:browser-running-error | 273
+ invalid-host-resolution | 233
+ pending | 116
+ (25 rows)
+
+Bulk:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_doi_errors_2020-09-03.rows.json';
+ => 443421
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+Additional 27,779 success status? Hard to tell because lots of other ingest
+running in parallel.
+
+Live:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_file_result.status = 'spn-error' OR
+ ingest_file_result.status = 'spn2-cdx-lookup-failure' OR
+ ingest_file_result.status = 'spn2-error:job-failed' OR
+ ingest_file_result.status = 'spn2-error:proxy-error'
+ )
+ ) TO '/grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json';
+ => 143984
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## Unpaywall (again)
+
+Bulk:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json';
+ => 43912
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+## MAG
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_mag_errors_2020-09-03.rows.json';
+ => 188175
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_mag_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+## OAI-PMH
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_oai_errors_2020-09-03.rows.json';
+ => 851056
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_oai_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+---------
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+
diff --git a/notes/ingest/2020-09_scielo.md b/notes/ingest/2020-09_scielo.md
new file mode 100644
index 0000000..4ec6fbd
--- /dev/null
+++ b/notes/ingest/2020-09_scielo.md
@@ -0,0 +1,21 @@
+
+As a follow-up to `SCIELO-CRAWL-2020-07`, going to bulk ingest all existing
+fatcat releases with no IA copy and with `publisher_type:scielo`. There are
+200k+ such releases.
+
+It seems like some of these are HTML or XML, eg: https://doi.org/10.4321/s1132-12962011000300008
+
+Could try XML ingest of these!
+
+## Bulk Ingest
+
+Dump ingest requests
+
+ ./fatcat_ingest.py --allow-non-oa query "publisher_type:scielo" | pv -l > /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json
+ Expecting 212529 release objects in search queries
+
+Enqueue
+
+ cat /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done 2020-09-14
+
diff --git a/notes/ingest/2020-10_daily.md b/notes/ingest/2020-10_daily.md
new file mode 100644
index 0000000..d2bb50b
--- /dev/null
+++ b/notes/ingest/2020-10_daily.md
@@ -0,0 +1,193 @@
+
+Quick notes on how daily ingest is going, circa September/October 2020.
+
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_request.created),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created)
+ ORDER BY date(ingest_request.created) DESC;
+
+ ingest_type | date | total | success
+ -------------+------------+-------+---------
+ pdf | 2020-10-10 | 6145 | 1368
+ pdf | 2020-10-09 | 28453 | 6461
+ pdf | 2020-10-08 | 15105 | 3803
+ pdf | 2020-10-07 | 34213 | 10813
+ pdf | 2020-10-06 | 22263 | 8565
+ pdf | 2020-10-05 | 7910 | 3200
+ pdf | 2020-10-04 | 10865 | 4579
+ pdf | 2020-10-03 | 27745 | 10818
+ pdf | 2020-10-02 | 34320 | 13523
+ pdf | 2020-10-01 | 32548 | 13252
+ pdf | 2020-09-30 | 34798 | 14113
+ pdf | 2020-09-29 | 22463 | 8328
+ pdf | 2020-09-28 | 4117 | 1278
+ pdf | 2020-09-27 | 5894 | 1732
+ pdf | 2020-09-26 | 34949 | 13901
+ pdf | 2020-09-25 | 33680 | 10605
+ pdf | 2020-09-24 | 15125 | 5785
+ pdf | 2020-09-23 | 20866 | 6584
+ pdf | 2020-09-22 | 20949 | 7167
+ pdf | 2020-09-21 | 22483 | 7308
+ pdf | 2020-09-20 | 45644 | 16981
+ pdf | 2020-09-19 | 95571 | 31991
+ pdf | 2020-09-18 | 50849 | 15875
+ pdf | 2020-09-17 | 20121 | 3158
+ pdf | 2020-09-16 | 39184 | 12150
+ pdf | 2020-09-15 | 16986 | 7705
+ (26 rows)
+
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+--------
+ pdf | success | 241047
+ pdf | no-pdf-link | 143084
+ pdf | spn2-cdx-lookup-failure | 108311
+ pdf | gateway-timeout | 97250
+ pdf | cdx-error | 61820
+ pdf | link-loop | 31350
+ pdf | wayback-error | 9139
+ pdf | spn2-error:job-failed | 4240
+ pdf | spn2-error | 3893
+ pdf | wrong-mimetype | 1010
+ pdf | no-capture | 851
+ pdf | null-body | 605
+ pdf | redirect-loop | 261
+ pdf | spn2-error:soft-time-limit-exceeded | 126
+ pdf | terminal-bad-status | 120
+ pdf | petabox-error | 105
+ pdf | timeout | 29
+ pdf | spn2-error:no-status | 2
+ pdf | spn2-error:invalid-server-response | 2
+ pdf | bad-gzip-encoding | 1
+ (20 rows)
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+
+ domain | status | count
+ ------------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 52767
+ www.degruyter.com | link-loop | 17666
+ www.degruyter.com | spn2-cdx-lookup-failure | 17597
+ ieeexplore.ieee.org | gateway-timeout | 15290
+ www.sciencedirect.com | no-pdf-link | 14043
+ apps.crossref.org | no-pdf-link | 11531
+ figshare.com | no-pdf-link | 8966
+ tandf.figshare.com | no-pdf-link | 7276
+ zenodo.org | no-capture | 7191
+ springernature.figshare.com | no-pdf-link | 6485
+ www.taylorfrancis.com | link-loop | 6266
+ www.persee.fr | terminal-bad-status | 6031
+ journals.openedition.org | gateway-timeout | 5639
+ www.cairn.info | link-loop | 5618
+ archaeologydataservice.ac.uk | no-pdf-link | 5359
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 4748
+ www.e-periodica.ch | no-pdf-link | 4722
+ osf.io | no-capture | 4247
+ cancerres.aacrjournals.org | no-pdf-link | 4136
+ dlc.library.columbia.edu | no-pdf-link | 4085
+ www.egms.de | no-pdf-link | 3304
+ journals.lww.com | no-pdf-link | 3218
+ journals.plos.org | no-pdf-link | 3005
+ linkinghub.elsevier.com | gateway-timeout | 2833
+ www.egms.de | redirect-loop | 2606
+ (25 rows)
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ --------------------------------------+---------+-------
+ zenodo.org | success | 55549
+ arxiv.org | success | 24450
+ s3-eu-west-1.amazonaws.com | success | 18156
+ res.mdpi.com | success | 13493
+ www.degruyter.com | success | 12009
+ journals.openedition.org | success | 11235
+ www.jstage.jst.go.jp | success | 9460
+ peer.asee.org | success | 9416
+ www.e-periodica.ch | success | 8105
+ ir.canterbury.ac.nz | success | 6381
+ europepmc.org | success | 5670
+ www.repository.cam.ac.uk | success | 4858
+ assets.researchsquare.com | success | 4765
+ fjfsdata01prod.blob.core.windows.net | success | 4130
+ tidsskrift.dk | success | 3964
+ research-journal.org | success | 3127
+ ieeexplore.ieee.org | success | 2947
+ dergipark.org.tr | success | 2892
+ watermark.silverchair.com | success | 2315
+ journals.plos.org | success | 2304
+ journal.fi | success | 1996
+ publications.rwth-aachen.de | success | 1954
+ www.brazilianjournals.com | success | 1637
+ article.sciencepublishinggroup.com | success | 1589
+ revistas.upr.edu | success | 1467
+ (25 rows)
+
+Casual take-aways:
+- wonder what `apps.crossref.org` is
+- sciencedirect crawling broken?
+- figshare might be broken? or just very little success
+- seems like a lot of journals.plos.org failures
diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md
new file mode 100644
index 0000000..a991025
--- /dev/null
+++ b/notes/ingest/2020-10_unpaywall.md
@@ -0,0 +1,286 @@
+
+New snapshot released 2020-10-09. Want to do a mostly straight-forward
+load/ingest/crawl.
+
+Proposed changes this time around:
+
+- have bulk ingest store missing URLs in a new sandcrawler-db for `no-capture`
+ status, and to include those URLs in heritrix3 crawl
+- tweak heritrix3 config for additional PDF URL extraction patterns,
+ particularly to improve OJS yield
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/unpaywall/unpaywall_snapshot_2020-10-09T153852.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json
+ => 28.3M 3:19:03 [2.37k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => 28.3M 1:11:29 [ 6.6k/s]
+ => Worker: Counter({'total': 28298500, 'insert-requests': 4119939, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 28298500, 'pushed': 28298500})
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2020-10-09'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json';
+ => COPY 4216339
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json
+ => 4.22M 0:02:48 [ 25k/s]
+
+Start small, to test no-capture behavior:
+
+ cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | head -n1000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+`no-capture` change looks good. Enqueue the whole batch:
+
+ cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 23661282
+ no-capture | 3015447
+ no-pdf-link | 2302102
+ redirect-loop | 1542566
+ terminal-bad-status | 1044676
+ wrong-mimetype | 114315
+ link-loop | 36358
+ cdx-error | 20150
+ null-body | 14513
+ wayback-error | 13644
+ gateway-timeout | 3776
+ spn2-cdx-lookup-failure | 1260
+ petabox-error | 1171
+ redirects-exceeded | 752
+ invalid-host-resolution | 464
+ spn2-error | 147
+ bad-redirect | 131
+ spn2-error:job-failed | 91
+ wayback-content-error | 45
+ timeout | 19
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ ) t1
+ ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json';
+ => 2,936,404
+
+ # TODO: in the future also exclude "www.archive.org"
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt
+
+ wc -l unpaywall_seedlist_2020-11-02.*.txt
+ 2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt
+ 2713866 unpaywall_seedlist_2020-11-02.url.txt
+
+With things like jsessionid, suspect that crawling just the terminal URLs is
+going to work better than both full and terminal.
+
+Finding that a fraction of `no-capture` results have partial/stub URLs as their `terminal_url`.
+
+TODO: investigate scale of partial/stub `terminal_url` (eg, not HTTP/S or FTP).
+
+
+## Bulk Ingest and Status
+
+Note, removing archive.org links:
+
+ cat /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json | rg -v www.archive.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Overall status (checked 2020-12-08):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 25004559
+ no-pdf-link | 2531841
+ redirect-loop | 1671375
+ terminal-bad-status | 1389463
+ no-capture | 893880
+ wrong-mimetype | 119332
+ link-loop | 66508
+ wayback-content-error | 30339
+ cdx-error | 21790
+ null-body | 20710
+ wayback-error | 13976
+ gateway-timeout | 3775
+ petabox-error | 2420
+ spn2-cdx-lookup-failure | 1218
+ redirects-exceeded | 889
+ invalid-host-resolution | 464
+ bad-redirect | 147
+ spn2-error | 112
+ spn2-error:job-failed | 91
+ timeout | 21
+ (20 rows)
+
+Ingest stats broken down by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+
+ release_stage | status | count
+ ---------------+-------------------------------------+----------
+ accepted | success | 1101090
+ accepted | no-pdf-link | 28590
+ accepted | redirect-loop | 10923
+ accepted | no-capture | 9540
+ accepted | terminal-bad-status | 6339
+ accepted | cdx-error | 952
+ accepted | wrong-mimetype | 447
+ accepted | link-loop | 275
+ accepted | wayback-error | 202
+ accepted | petabox-error | 177
+ accepted | redirects-exceeded | 122
+ accepted | null-body | 27
+ accepted | wayback-content-error | 14
+ accepted | spn2-cdx-lookup-failure | 5
+ accepted | gateway-timeout | 4
+ accepted | bad-redirect | 1
+ published | success | 18595278
+ published | no-pdf-link | 2434935
+ published | redirect-loop | 1364110
+ published | terminal-bad-status | 1185328
+ published | no-capture | 718792
+ published | wrong-mimetype | 112923
+ published | link-loop | 63874
+ published | wayback-content-error | 30268
+ published | cdx-error | 17302
+ published | null-body | 15209
+ published | wayback-error | 10782
+ published | gateway-timeout | 1966
+ published | petabox-error | 1611
+ published | spn2-cdx-lookup-failure | 879
+ published | redirects-exceeded | 760
+ published | invalid-host-resolution | 453
+ published | bad-redirect | 115
+ published | spn2-error:job-failed | 77
+ published | spn2-error | 75
+ published | timeout | 21
+ published | bad-gzip-encoding | 5
+ published | spn2-error:soft-time-limit-exceeded | 4
+ published | spn2-error:pending | 1
+ published | blocked-cookie | 1
+ published | | 1
+ published | pending | 1
+ submitted | success | 5308166
+ submitted | redirect-loop | 296322
+ submitted | terminal-bad-status | 197785
+ submitted | no-capture | 165545
+ submitted | no-pdf-link | 68274
+ submitted | wrong-mimetype | 5962
+ submitted | null-body | 5474
+ submitted | cdx-error | 3536
+ submitted | wayback-error | 2992
+ submitted | link-loop | 2359
+ submitted | gateway-timeout | 1805
+ submitted | petabox-error | 632
+ submitted | spn2-cdx-lookup-failure | 334
+ submitted | wayback-content-error | 57
+ submitted | spn2-error | 37
+ submitted | bad-redirect | 31
+ submitted | spn2-error:job-failed | 14
+ submitted | | 12
+ submitted | invalid-host-resolution | 11
+ submitted | redirects-exceeded | 7
+ submitted | spn2-error:soft-time-limit-exceeded | 5
+ submitted | bad-gzip-encoding | 1
+ submitted | skip-url-blocklist | 1
+ | no-pdf-link | 42
+ | success | 25
+ | redirect-loop | 20
+ | terminal-bad-status | 11
+ | no-capture | 3
+ (70 rows)
diff --git a/notes/ingest/2020-11-04_arxiv.md b/notes/ingest/2020-11-04_arxiv.md
new file mode 100644
index 0000000..f9abe09
--- /dev/null
+++ b/notes/ingest/2020-11-04_arxiv.md
@@ -0,0 +1,12 @@
+
+Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run
+a crawl.
+
+Crawl is now done, so going to ingest, hoping to get the majority of the
+millions of remaining arxiv.org PDFs.
+
+ zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l
+ => 1,288,559
+
+ zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-11_doaj.md b/notes/ingest/2020-11_doaj.md
new file mode 100644
index 0000000..473dd0d
--- /dev/null
+++ b/notes/ingest/2020-11_doaj.md
@@ -0,0 +1,295 @@
+
+This is the first ingest (and crawl) of URLs from DOAJ article-level metadata.
+It will include at least 'pdf' and 'html' ingest requests, not just 'pdf' as in
+the past.
+
+Working off a 2020-11-13 snapshot.
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json
+ => 6.7M 0:24:28 [4.57k/s]
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => ran into an error with blank `base_url`
+
+Second try after patches:
+
+ zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json
+ => 6.7M 0:24:29 [4.56k/s]
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 6703036, 'insert-requests': 163854, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 6703036, 'pushed': 6703036})
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+-------------------------+---------
+ pdf | | 3711532
+ html | | 2429003
+ pdf | success | 454403
+ pdf | redirect-loop | 48587
+ pdf | no-pdf-link | 24901
+ pdf | no-capture | 11569
+ xml | | 9442
+ pdf | link-loop | 8466
+ pdf | terminal-bad-status | 2015
+ pdf | wrong-mimetype | 1441
+ pdf | null-body | 1057
+ pdf | petabox-error | 299
+ pdf | cdx-error | 124
+ pdf | gateway-timeout | 114
+ pdf | wayback-error | 77
+ pdf | spn2-cdx-lookup-failure | 20
+ pdf | invalid-host-resolution | 4
+ pdf | spn2-error | 1
+ (18 rows)
+
+## Dump new URLs, Transform, Bulk Ingest (PDF and XML only)
+
+Dump:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_request.link_source = 'doaj'
+ -- AND date(ingest_request.created) > '2020-12-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/grande/snapshots/doaj_noingest_2020-11-19.rows.json';
+ => COPY 3732543
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_noingest_2020-11-19.rows.json | pv -l | shuf > /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json
+ => 3.73M 0:02:18 [26.9k/s]
+
+Definitely some non-URL strings in there; should try to filter those out
+earlier in the transform process. And/or have a constraint on the URL column in
+the database.
+
+Enqueue the whole batch:
+
+ cat /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Started this batch off at 2020-11-19 18:10 (Pacific time)
+
+Stats after run:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 30;
+
+## Dump Seedlist
+
+After preliminary bulk ingest attempts, dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ AND (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_file_result.status != 'success'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/grande/snapshots/doaj_seedlist_2020-11-19.rows.json';
+ => 1,899,555
+
+TODO: filter for valid URLs
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | pv -l > /grande/snapshots/doaj_crawl_ingest_2020-11-19.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.url.txt
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.terminal_url.txt
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.no_terminal_url.txt
+
+ wc -l doaj_seedlist_2020-11-19.*.txt
+
+## Post-Crawl Ingest
+
+Re-run all ingests, from original batch (pdf, xml, and html), now that DOAJ
+identifiers are all in fatcat:
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ # started 2020-12-23 15:05 (Pacific)
+ # finished around 2020-12-31, after one long/slow partition
+
+Stats again after everything:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 50;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ html | wrong-scope | 1089423
+ html | no-capture | 423917
+ html | redirect-loop | 212910
+ html | unknown-scope | 204069
+ html | html-resource-no-capture | 165587
+ html | success | 122937
+ html | null-body | 100296
+ html | wayback-content-error | 53918
+ html | wrong-mimetype | 18908
+ html | terminal-bad-status | 14059
+ html | petabox-error | 13520
+ html | cdx-error | 6823
+ html | wayback-error | 890
+ html | | 620
+ html | blocked-cookie | 543
+ html | blocked-captcha | 250
+ html | redirects-exceeded | 135
+ html | too-many-resources | 111
+ html | max-hops-exceeded | 84
+ html | bad-redirect | 3
+ pdf | success | 2851324
+ pdf | no-pdf-link | 529914
+ pdf | redirect-loop | 349494
+ pdf | no-capture | 272202
+ pdf | null-body | 129027
+ pdf | terminal-bad-status | 91796
+ pdf | link-loop | 25267
+ pdf | wrong-mimetype | 6504
+ pdf | wayback-error | 2968
+ pdf | | 2068
+ pdf | wayback-content-error | 1548
+ pdf | cdx-error | 1095
+ pdf | petabox-error | 1024
+ pdf | bad-redirect | 203
+ pdf | redirects-exceeded | 135
+ pdf | timeout | 20
+ pdf | max-hops-exceeded | 19
+ pdf | bad-gzip-encoding | 2
+ xml | success | 6897
+ xml | null-body | 2353
+ xml | wrong-mimetype | 184
+ xml | no-capture | 5
+ xml | cdx-error | 3
+ (43 rows)
+
+
+And on filtered subset that we actually crawled:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ AND (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-----------------------+---------
+ pdf | success | 2851286
+ pdf | no-pdf-link | 527495
+ pdf | redirect-loop | 345138
+ pdf | no-capture | 268140
+ pdf | null-body | 129027
+ pdf | terminal-bad-status | 91125
+ pdf | link-loop | 25267
+ pdf | wrong-mimetype | 6504
+ pdf | wayback-error | 2907
+ pdf | petabox-error | 363
+ pdf | wayback-content-error | 242
+ pdf | bad-redirect | 203
+ pdf | redirects-exceeded | 135
+ pdf | max-hops-exceeded | 19
+ pdf | cdx-error | 15
+ pdf | bad-gzip-encoding | 2
+ xml | success | 6897
+ xml | null-body | 2353
+ xml | wrong-mimetype | 184
+ xml | no-capture | 5
+ (20 rows)
+
diff --git a/notes/ingest/2020-12-08_patch_crawl_notes.md b/notes/ingest/2020-12-08_patch_crawl_notes.md
new file mode 100644
index 0000000..5979753
--- /dev/null
+++ b/notes/ingest/2020-12-08_patch_crawl_notes.md
@@ -0,0 +1,111 @@
+
+Notes here about re-ingesting or re-crawling large batches. Goal around end of
+2020 is to generate a broad patch crawl of terminal no-capture attempts for all
+major sources crawled thus far. Have already tried running this process for unpaywall.
+
+For each, want filtered ingest request JSON objects (filtering out platforms
+that don't crawl well, and possibly things like figshare+zenodo), and a broader
+seedlist (including terminal URLs). Will de-dupe all the seedlist URLs and do a
+heritrix crawl with new config, then re-ingest all the requests individually.
+
+Summary of what to do here:
+
+ OA DOI: expecting some 2.4 million seeds
+ OAI-PMH: expecting some 5 million no-capture URLs, plus more where a PDF URL was not found
+ Unpaywall: another ~900k no-capture URLs (maybe filtered?)
+
+For all, re-attempt for these status codes:
+
+ no-capture
+ cdx-error
+ wayback-error
+ petabox-error
+ gateway-timeout (?)
+
+And at least do bulk re-ingest for these, if updated before 2020-11-20 or so:
+
+ no-pdf-link
+
+## OAI-PMH
+
+Need to re-ingest all of the (many!) no-capture and no-pdf-link
+
+TODO: repec-specific URL extraction?
+
+Skip these OAI prefixes:
+
+ kb.dk
+ bnf.fr
+ hispana.mcu.es
+ bdr.oai.bsb-muenchen.de
+ ukm.si
+ hsp.org
+
+Skip these domains:
+
+ www.kb.dk (kb.dk)
+ kb-images.kb.dk (kb.dk)
+ mdz-nbn-resolving.de (TODO: what prefix?)
+ aggr.ukm.um.si (ukm.si)
+
+Check PDF link extraction for these prefixes, or skip them (TODO):
+
+ repec (mixed success)
+ biodiversitylibrary.org
+ juser.fz-juelich.de
+ americanae.aecid.es
+ www.irgrid.ac.cn
+ hal
+ espace.library.uq.edu.au
+ igi.indrastra.com
+ invenio.nusl.cz
+ hypotheses.org
+ t2r2.star.titech.ac.jp
+ quod.lib.umich.edu
+
+ domain: hemerotecadigital.bne.es
+ domain: bib-pubdb1.desy.de
+ domain: publikationen.bibliothek.kit.edu
+ domain: edoc.mpg.de
+ domain: bibliotecadigital.jcyl.es
+ domain: lup.lub.lu.se
+ domain: orbi.uliege.be
+
+TODO:
+- consider deleting ingest requests from skipped prefixes (large database use)
+
+
+## Unpaywall
+
+About 900k `no-capture`, and up to 2.5 million more `no-pdf-link`.
+
+Re-bulk-ingest filtered requests which hit `no-pdf-link` before 2020-11-20:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) < '2020-11-20'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ ) TO '/grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json';
+ => COPY 1309990
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json
+ => 1.31M 0:00:51 [25.6k/s]
+
+ cat /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2021-04_unpaywall.md b/notes/ingest/2021-04_unpaywall.md
new file mode 100644
index 0000000..d7643f4
--- /dev/null
+++ b/notes/ingest/2021-04_unpaywall.md
@@ -0,0 +1,368 @@
+
+New snapshot released 2021-02-18, finally getting around to a crawl two months
+later.
+
+Intend to do same style of crawl as in the past. One change is that
+sandcrawler-db has moved to a focal VM.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json
+ => 30.0M 3:14:59 [2.57k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 30027007, 'insert-requests': 2703999, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 30027007, 'pushed': 30027007})
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json';
+ => COPY 3277484
+
+ # previous, 2020-10 run: COPY 4216339
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json
+ => 3.28M 0:01:42 [32.1k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 26385866
+ no-pdf-link | 2132565
+ no-capture | 2092111
+ redirect-loop | 1732543
+ terminal-bad-status | 1504555
+ wayback-content-error | 357345
+ wrong-mimetype | 126070
+ link-loop | 76808
+ cdx-error | 22756
+ null-body | 22066
+ wayback-error | 13768
+ gateway-timeout | 3804
+ petabox-error | 3608
+ spn2-cdx-lookup-failure | 1225
+ redirects-exceeded | 892
+ invalid-host-resolution | 505
+ bad-redirect | 151
+ spn2-error | 108
+ spn2-error:job-failed | 91
+ bad-gzip-encoding | 27
+ (20 rows)
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-01-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 1348623
+ no-capture | 1231582
+ redirect-loop | 45622
+ no-pdf-link | 37312
+ terminal-bad-status | 24162
+ wrong-mimetype | 6684
+ link-loop | 5757
+ null-body | 1288
+ wayback-content-error | 1123
+ cdx-error | 831
+ petabox-error | 697
+ wayback-error | 185
+ invalid-host-resolution | 41
+ gateway-timeout | 29
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ spn2-cdx-lookup-failure | 7
+ bad-redirect | 4
+ timeout | 3
+ redirects-exceeded | 3
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json';
+ => 2020-10: 2,936,404
+ => 2021-04: 1,805,192
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json
+ => 1.81M 0:01:27 [20.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.*.txt
+ 6 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+ 1668524 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ 1685717 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1,804,211 consumer group lag
+
+## Post-Ingest Stats
+
+Overall status (unpaywall, all time):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 27242251
+ no-pdf-link | 2746237
+ redirect-loop | 1821132
+ terminal-bad-status | 1553441
+ no-capture | 478559
+ wayback-content-error | 357390
+ wrong-mimetype | 127365
+ link-loop | 79389
+ cdx-error | 23170
+ null-body | 23169
+ wayback-error | 13704
+ gateway-timeout | 3803
+ petabox-error | 3642
+ redirects-exceeded | 1427
+ spn2-cdx-lookup-failure | 1214
+ invalid-host-resolution | 505
+ bad-redirect | 153
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ body-too-large | 84
+ (20 rows)
+
+Ingest stats broken down by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------------------+----------
+ accepted | success | 1213335
+ accepted | no-pdf-link | 29292
+ accepted | redirect-loop | 12769
+ accepted | terminal-bad-status | 11264
+ accepted | no-capture | 10187
+ accepted | cdx-error | 1015
+ accepted | wayback-content-error | 757
+ accepted | wrong-mimetype | 501
+ accepted | link-loop | 407
+ accepted | wayback-error | 207
+ accepted | petabox-error | 189
+ accepted | redirects-exceeded | 125
+ accepted | null-body | 34
+ accepted | spn2-cdx-lookup-failure | 5
+ accepted | gateway-timeout | 4
+ accepted | blocked-cookie | 2
+ accepted | bad-redirect | 1
+ accepted | body-too-large | 1
+ published | success | 20196774
+ published | no-pdf-link | 2647969
+ published | redirect-loop | 1477558
+ published | terminal-bad-status | 1320013
+ published | wayback-content-error | 351931
+ published | no-capture | 297603
+ published | wrong-mimetype | 115440
+ published | link-loop | 76431
+ published | cdx-error | 18125
+ published | null-body | 17559
+ published | wayback-error | 10466
+ published | petabox-error | 2684
+ published | gateway-timeout | 1979
+ published | redirects-exceeded | 947
+ published | spn2-cdx-lookup-failure | 877
+ published | invalid-host-resolution | 457
+ published | bad-redirect | 120
+ published | spn2-error:job-failed | 77
+ published | spn2-error | 70
+ published | body-too-large | 39
+ published | bad-gzip-encoding | 24
+ published | timeout | 24
+ published | blocked-cookie | 23
+ published | spn2-error:soft-time-limit-exceeded | 4
+ published | | 2
+ published | pending | 1
+ published | spn2-error:pending | 1
+ published | too-many-redirects | 1
+ submitted | success | 5832117
+ submitted | redirect-loop | 330785
+ submitted | terminal-bad-status | 222152
+ submitted | no-capture | 170766
+ submitted | no-pdf-link | 68934
+ submitted | wrong-mimetype | 11424
+ submitted | null-body | 5576
+ submitted | wayback-content-error | 4702
+ submitted | cdx-error | 4030
+ submitted | wayback-error | 3031
+ submitted | link-loop | 2551
+ submitted | gateway-timeout | 1820
+ submitted | petabox-error | 769
+ submitted | redirects-exceeded | 355
+ submitted | spn2-cdx-lookup-failure | 332
+ submitted | invalid-host-resolution | 48
+ submitted | body-too-large | 44
+ submitted | spn2-error | 37
+ submitted | bad-redirect | 32
+ submitted | spn2-error:job-failed | 14
+ submitted | | 13
+ submitted | spn2-error:soft-time-limit-exceeded | 5
+ submitted | timeout | 4
+ submitted | bad-gzip-encoding | 3
+ submitted | skip-url-blocklist | 1
+ | no-pdf-link | 42
+ | success | 25
+ | redirect-loop | 20
+ | terminal-bad-status | 12
+ | no-capture | 3
+ (76 rows)
+
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2192376
+ no-capture | 152183
+ no-pdf-link | 144174
+ redirect-loop | 125988
+ terminal-bad-status | 67307
+ link-loop | 8292
+ wrong-mimetype | 7942
+ null-body | 2270
+ cdx-error | 1223
+ wayback-content-error | 1147
+ petabox-error | 728
+ wayback-error | 155
+ body-too-large | 82
+ invalid-host-resolution | 41
+ gateway-timeout | 28
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ timeout | 7
+ bad-redirect | 6
+ redirects-exceeded | 4
+ (20 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 2,703,999 raw ingest requests (new URLs total)
+- 1,231,582 (45.5%) of these had not been seen/crawled from any source yet
+- 843,753 (31.2%) success from new heritrix crawling
+- 2,192,376 (81.1%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success)
diff --git a/notes/ingest/2021-05_daily_improvements.md b/notes/ingest/2021-05_daily_improvements.md
new file mode 100644
index 0000000..e8748fa
--- /dev/null
+++ b/notes/ingest/2021-05_daily_improvements.md
@@ -0,0 +1,480 @@
+
+Summary of top large broken domains (2021-04-21 "30 day" snapshot):
+
+## acervus.unicamp.br
+
+ domain | status | count
+---------------------------------------+-------------------------+--------
+ acervus.unicamp.br | | 1967
+ acervus.unicamp.br | no-pdf-link | 1853
+
+select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%acervus.unicamp.br%' and status = 'no-pdf-link' limit 5;
+
+http://acervus.unicamp.br/index.asp?codigo_sophia=963332
+
+seems like many of these were captures with a blank page? or a redirect to
+the homepage?
+
+http://web.archive.org/web/20200129110523/http://acervus.unicamp.br/index.html
+
+messy, going to move on.
+
+
+## apex.ipk-gatersleben.de
+
+apex.ipk-gatersleben.de | | 1253
+apex.ipk-gatersleben.de | no-pdf-link | 1132
+
+select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%apex.ipk-gatersleben.de%' and status = 'no-pdf-link' limit 5;
+
+https://doi.org/10.25642/ipk/rescoll/4886
+https://apex.ipk-gatersleben.de/apex/f?p=PGRDOI:RESOLVE:::NO:RP:DOI:10.25642/IPK/RESCOLL/7331
+
+seem to be datasets/species, not articles.
+
+prefix: 10.25642/ipk
+
+## crossref.org
+
+ apps.crossref.org | | 4693
+ apps.crossref.org | no-pdf-link | 4075
+
+https://doi.org/10.1515/9781501747045-013
+https://apps.crossref.org/coaccess/coaccess.html?doi=10.1515%2F9781501747045-013
+
+Derp, they are doing a dynamic/AJAX thing, so access links are not in the HTML.
+
+## openedition
+
+ books.openedition.org | | 1784
+ books.openedition.org | no-pdf-link | 1466
+
+https://doi.org/10.4000/books.pul.34492
+https://books.openedition.org/pul/34492
+
+these are not actually OA books (or at least, not all are)
+
+## chemrxiv.org (figshare)
+
+ chemrxiv.org | | 857
+ chemrxiv.org | no-pdf-link | 519
+
+https://doi.org/10.26434/chemrxiv.14411081
+https://chemrxiv.org/articles/preprint/Prediction_and_Optimization_of_Ion_Transport_Characteristics_in_Nanoparticle-Based_Electrolytes_Using_Convolutional_Neural_Networks/14411081
+
+these all seem to be *multi-file* entities, thus not good for single file ingest pipeline.
+
+## direct.mit.edu
+
+ direct.mit.edu | | 996
+ direct.mit.edu | no-pdf-link | 869
+
+https://doi.org/10.7551/mitpress/14056.003.0004
+https://direct.mit.edu/books/monograph/5111/chapter-abstract/3060134/Adding-Technology-to-Contact-Tracing?redirectedFrom=fulltext
+
+"not available"
+
+https://doi.org/10.7551/mitpress/12444.003.0004
+
+"not available"
+
+
+## dlc.library.columbia.edu
+
+ dlc.library.columbia.edu | | 4225
+ dlc.library.columbia.edu | no-pdf-link | 2395
+ dlc.library.columbia.edu | spn2-wayback-error | 1568
+
+https://doi.org/10.7916/d8-506w-kk49
+https://dlc.library.columbia.edu/durst/cul:18931zcrk9
+
+document repository.
+this one goes to IA! actually many seem to.
+added extractor, should re-ingest with:
+
+ publisher:"Columbia University" doi_prefix:10.7916 !journal:*
+
+actually, that is like 600k+ results and many are not digitized, so perhaps not.
+
+## doi.ala.org.au
+
+ doi.ala.org.au | | 2570
+ doi.ala.org.au | no-pdf-link | 2153
+
+https://doi.org/10.26197/ala.811d55e3-2ff4-4501-b3e7-e19249507052
+https://doi.ala.org.au/doi/811d55e3-2ff4-4501-b3e7-e19249507052
+
+this is a data repository, with filesets, not papers. datacite metadata is
+incorrect.
+
+## fldeploc.dep.state.fl.us
+
+ fldeploc.dep.state.fl.us | | 774
+ fldeploc.dep.state.fl.us | no-pdf-link | 718
+
+
+https://doi.org/10.35256/ic29
+http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29
+
+re-ingest with:
+
+ # only ~800 works
+ doi_prefix:10.35256 publisher:Florida
+
+## geoscan.nrcan.gc.ca
+
+ geoscan.nrcan.gc.ca | | 2056
+ geoscan.nrcan.gc.ca | no-pdf-link | 2019
+
+https://doi.org/10.4095/295366
+https://geoscan.nrcan.gc.ca/starweb/geoscan/servlet.starweb?path=geoscan/fulle.web&search1=R=295366
+
+this is a geographic repository, not papers.
+
+## kiss.kstudy.com
+
+ kiss.kstudy.com | | 747
+ kiss.kstudy.com | no-pdf-link | 686
+
+https://doi.org/10.22143/hss21.12.1.121
+http://kiss.kstudy.com/thesis/thesis-view.asp?key=3862523
+
+Korean. seems to not actually be theses? can't download.
+
+## linkinghub.elsevier.com
+
+ linkinghub.elsevier.com | | 5079
+ linkinghub.elsevier.com | forbidden | 2226
+ linkinghub.elsevier.com | spn2-wayback-error | 1625
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758
+
+skipping for now, looks like mostly 'forbidden'?
+
+## osf.io
+
+These are important!
+
+ osf.io | | 3139
+ osf.io | not-found | 2288
+ osf.io | spn2-wayback-error | 582
+
+https://doi.org/10.31219/osf.io/jux3w
+https://accounts.osf.io/login?service=https://osf.io/jux3w/download
+
+many of these are 404s by browser as well. what does that mean?
+
+## peerj.com
+
+ peerj.com | | 785
+ peerj.com | no-pdf-link | 552
+
+https://doi.org/10.7287/peerj.11155v0.1/reviews/2
+https://peerj.com/articles/11155/reviews/
+
+these are HTML reviews, not papers
+
+## preprints.jmir.org
+
+ preprints.jmir.org | | 763
+ preprints.jmir.org | no-pdf-link | 611
+
+https://doi.org/10.2196/preprints.22556
+https://preprints.jmir.org/preprint/22556
+
+UGH, looks simple, but javascript.
+
+could try to re-write URL into S3 format? meh.
+
+## psyarxiv.com (OSF?)
+
+ psyarxiv.com | | 641
+ psyarxiv.com | no-pdf-link | 546
+
+https://doi.org/10.31234/osf.io/5jaqg
+https://psyarxiv.com/5jaqg/
+
+Also infuriatingly Javascript, but can do URL hack.
+
+Should reingest, and potentially force-recrawl:
+
+ # about 67k
+ publisher:"Center for Open Science" in_ia:false
+
+## publons.com
+
+ publons.com | | 6998
+ publons.com | no-pdf-link | 6982
+
+https://doi.org/10.1002/jmor.21338/v2/review1
+https://publons.com/publon/40260824/
+
+These are just HTML reviews, not papers.
+
+## saemobilus.sae.org
+
+ saemobilus.sae.org | | 795
+ saemobilus.sae.org | no-pdf-link | 669
+
+https://doi.org/10.4271/as1426c
+https://saemobilus.sae.org/content/as1426c
+
+These seem to be standards, and are not open access (paywall)
+
+## scholar.dkyobobook.co.kr
+
+ scholar.dkyobobook.co.kr | | 1043
+ scholar.dkyobobook.co.kr | no-pdf-link | 915
+
+https://doi.org/10.22471/crisis.2021.6.1.18
+http://scholar.dkyobobook.co.kr/searchDetail.laf?barcode=4010028199536
+
+Korean. complex javascript, skipping.
+
+## unreserved.rba.gov.au
+
+ unreserved.rba.gov.au | | 823
+ unreserved.rba.gov.au | no-pdf-link | 821
+
+https://doi.org/10.47688/rba_archives_2006/04129
+https://unreserved.rba.gov.au/users/login
+
+Don't need to login when I tried in browser? document repo, not papers.
+
+## wayf.switch.ch
+
+ wayf.switch.ch | | 1169
+ wayf.switch.ch | no-pdf-link | 809
+
+https://doi.org/10.24451/arbor.11128
+https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Farbor.bfh.ch%2Fshibboleth&return=https%3A%2F%2Farbor.bfh.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A5056fc0a97aeab16e5007ca63bede254cb5669d94173064d6c74c62a0f88b022
+
+Loginwall
+
+## www.bloomsburycollections.com
+
+ www.bloomsburycollections.com | | 1745
+ www.bloomsburycollections.com | no-pdf-link | 1571
+
+https://doi.org/10.5040/9781849664264.0008
+https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries
+
+These are primarily not OA/available.
+
+## www.emc2020.eu
+
+ www.emc2020.eu | | 791
+ www.emc2020.eu | no-pdf-link | 748
+
+https://doi.org/10.22443/rms.emc2020.146
+https://www.emc2020.eu/abstract/evaluation-of-different-rectangular-scan-strategies-for-hrstem-imaging.html
+
+These are just abstracts, not papers.
+
+## Emerald
+
+ www.emerald.com | | 2420
+ www.emerald.com | no-pdf-link | 1986
+
+https://doi.org/10.1108/ramj-11-2020-0065
+https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html
+
+Note that these URLs are already HTML fulltext, but the PDF is also available and easy to fetch.
+
+re-ingest:
+
+ # only ~3k or so missing
+ doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true
+
+## www.humankineticslibrary.com
+
+ www.humankineticslibrary.com | | 1122
+ www.humankineticslibrary.com | no-pdf-link | 985
+
+https://doi.org/10.5040/9781718206625.ch-002
+https://www.humankineticslibrary.com/encyclopedia-chapter?docid=b-9781718206625&tocid=b-9781718206625-chapter2
+
+paywall
+
+## www.inderscience.com
+
+ www.inderscience.com | | 1532
+ www.inderscience.com | no-pdf-link | 1217
+
+https://doi.org/10.1504/ijdmb.2020.10036342
+https://www.inderscience.com/info/ingeneral/forthcoming.php?jcode=ijdmb
+
+paywall
+
+## www.ingentaconnect.com
+
+ www.ingentaconnect.com | | 885
+ www.ingentaconnect.com | no-pdf-link | 783
+
+https://doi.org/10.15258/sst.2021.49.1.07
+https://www.ingentaconnect.com/content/ista/sst/pre-prints/content-7_sst.2021.49.1_63-71;jsessionid=1joc5mmi1juht.x-ic-live-02
+
+Annoying javascript, but easy to work around.
+
+re-ingest:
+
+ # only a couple hundred; also re-ingest
+ doi_prefix:10.15258 in_ia:false year:>2018
+
+## www.nomos-elibrary.de
+
+ www.nomos-elibrary.de | | 2235
+ www.nomos-elibrary.de | no-pdf-link | 1128
+ www.nomos-elibrary.de | spn2-wayback-error | 559
+
+https://doi.org/10.5771/9783748907084-439
+https://www.nomos-elibrary.de/10.5771/9783748907084-439/verzeichnis-der-autorinnen-und-autoren
+
+Javascript obfuscated download button?
+
+## www.oecd-ilibrary.org
+
+ www.oecd-ilibrary.org | | 3046
+ www.oecd-ilibrary.org | no-pdf-link | 2869
+
+https://doi.org/10.1787/543e84ed-en
+https://www.oecd-ilibrary.org/development/applying-evaluation-criteria-thoughtfully_543e84ed-en
+
+Paywall.
+
+## www.osapublishing.org
+
+ www.osapublishing.org | | 821
+ www.osapublishing.org | no-pdf-link | 615
+
+https://doi.org/10.1364/boe.422199
+https://www.osapublishing.org/boe/abstract.cfm?doi=10.1364/BOE.422199
+
+Some of these are "pre-registered" DOIs, not published yet. Many of the
+remaining are actually HTML articles, and/or have some stuff in the
+`citation_pdf_url`. A core problem is captchas.
+
+Have started adding support to fatcat for HTML crawl type based on container.
+
+re-ingest:
+
+ container_twtpsm6ytje3nhuqfu3pa7ca7u (optica)
+ container_cg4vcsfty5dfvgmat5wm62wgie (optics express)
+
+## www.oxfordscholarlyeditions.com
+
+ www.oxfordscholarlyeditions.com | | 759
+ www.oxfordscholarlyeditions.com | no-pdf-link | 719
+
+https://doi.org/10.1093/oseo/instance.00266789
+https://www.oxfordscholarlyeditions.com/view/10.1093/actrade/9780199593668.book.1/actrade-9780199593668-div1-27
+
+loginwall/paywall
+
+## www.schweizerbart.de
+
+ www.schweizerbart.de | | 730
+ www.schweizerbart.de | no-pdf-link | 653
+
+https://doi.org/10.1127/zfg/40/1996/461
+https://www.schweizerbart.de/papers/zfg/detail/40/97757/Theoretical_model_of_surface_karstic_processes?af=crossref
+
+paywall
+
+## www.sciencedirect.com
+
+ www.sciencedirect.com | | 14757
+ www.sciencedirect.com | no-pdf-link | 12733
+ www.sciencedirect.com | spn2-wayback-error | 1503
+
+https://doi.org/10.1016/j.landurbplan.2021.104104
+https://www.sciencedirect.com/science/article/pii/S0169204621000670
+
+Bunch of crazy new hacks, but seems to be working!
+
+re-ingest:
+
+ # to start! about 50k
+ doi_prefix:10.1016 is_oa:true year:2021
+
+## www.sciendo.com
+
+ www.sciendo.com | | 1955
+ www.sciendo.com | no-pdf-link | 1176
+
+https://doi.org/10.2478/awutm-2019-0012
+https://www.sciendo.com/article/10.2478/awutm-2019-0012
+
+uses lots of javascript, hard to scrape.
+
+
+## Others (for reference)
+
+ | | 725990
+ | no-pdf-link | 209933
+ | success | 206134
+ | spn2-wayback-error | 127015
+ | spn2-cdx-lookup-failure | 53384
+ | blocked-cookie | 35867
+ | link-loop | 25834
+ | too-many-redirects | 16430
+ | redirect-loop | 14648
+ | forbidden | 13794
+ | terminal-bad-status | 8055
+ | not-found | 6399
+ | remote-server-error | 2402
+ | wrong-mimetype | 2011
+ | spn2-error:unauthorized | 912
+ | bad-redirect | 555
+ | read-timeout | 530
+
+## Re-ingests
+
+All the above combined:
+
+ container_twtpsm6ytje3nhuqfu3pa7ca7u (optica)
+ container_cg4vcsfty5dfvgmat5wm62wgie (optics express)
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id twtpsm6ytje3nhuqfu3pa7ca7u
+ => Counter({'ingest_request': 1142, 'elasticsearch_release': 1142, 'estimate': 1142, 'kafka': 1142})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id cg4vcsfty5dfvgmat5wm62wgie
+ => Counter({'elasticsearch_release': 33482, 'estimate': 33482, 'ingest_request': 32864, 'kafka': 32864})
+
+ # only ~800 works
+ doi_prefix:10.35256 publisher:Florida
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query "doi_prefix:10.35256 publisher:Florida"
+ => Counter({'ingest_request': 843, 'elasticsearch_release': 843, 'estimate': 843, 'kafka': 843})
+
+ # only ~3k or so missing
+ doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1108 publisher:emerald"
+ => Counter({'ingest_request': 3812, 'elasticsearch_release': 3812, 'estimate': 3812, 'kafka': 3812})
+
+
+ # only a couple hundred; also re-ingest
+ doi_prefix:10.15258 in_ia:false year:>2018
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa --force-recrawl query "doi_prefix:10.15258 year:>2018"
+ => Counter({'ingest_request': 140, 'elasticsearch_release': 140, 'estimate': 140, 'kafka': 140})
+
+ # to start! about 50k
+ doi_prefix:10.1016 is_oa:true year:2020
+ doi_prefix:10.1016 is_oa:true year:2021
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2020"
+ => Counter({'ingest_request': 75936, 'elasticsearch_release': 75936, 'estimate': 75936, 'kafka': 75936})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2021"
+ => Counter({'ingest_request': 54824, 'elasticsearch_release': 54824, 'estimate': 54824, 'kafka': 54824})
+
+ pmcid:* year:2018
+ pmcid:* year:2019
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2018"
+ => Counter({'ingest_request': 25366, 'elasticsearch_release': 25366, 'estimate': 25366, 'kafka': 25366})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2019"
+ => Counter({'ingest_request': 55658, 'elasticsearch_release': 55658, 'estimate': 55658, 'kafka': 55658})
+
diff --git a/notes/ingest/2021-07_unpaywall.md b/notes/ingest/2021-07_unpaywall.md
new file mode 100644
index 0000000..8b6ac09
--- /dev/null
+++ b/notes/ingest/2021-07_unpaywall.md
@@ -0,0 +1,320 @@
+
+New snapshot released 2021-07-02. Should be "boring" ingest and crawl.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02T151134.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json
+ => 32.2M 3:01:52 [2.95k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 32196260, 'insert-requests': 3325954, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 32196260, 'pushed': 32196260})
+
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json';
+ => COPY 3556146
+
+ # previous, 2020-10 run: COPY 4216339
+ # previous, 2021-04 run: COPY 3277484
+
+Oops, should have run this instead, with the date filter:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json';
+
+But didn't, so processed all instead.
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json
+ => 3.56M 0:01:59 [29.8k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done, on 2021-07-13
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 1831827
+ success | 1343604
+ redirect-loop | 103999
+ terminal-bad-status | 19845
+ no-pdf-link | 17448
+ link-loop | 5027
+ wrong-mimetype | 2270
+ cdx-error | 523
+ body-too-large | 321
+ null-body | 298
+ wayback-content-error | 242
+ petabox-error | 155
+ gateway-timeout | 138
+ invalid-host-resolution | 120
+ wayback-error | 109
+ blocked-cookie | 9
+ timeout | 7
+ | 3
+ bad-redirect | 3
+ spn2-cdx-lookup-failure | 3
+ (20 rows)
+
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json';
+ => COPY 1743186
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json
+ => 1.74M 0:01:33 [18.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.*.txt
+ 1 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt
+ 1643963 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt
+ 1644028 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt
+ 3287992 total
+
+Then run crawl (see `journal-crawls` git repo).
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1.74M 0:01:59 [14.6k/s]
+
+## Post-Ingest Stats
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2690258
+ redirect-loop | 227328
+ no-capture | 157368
+ terminal-bad-status | 118943
+ no-pdf-link | 92698
+ blocked-cookie | 19478
+ link-loop | 9249
+ wrong-mimetype | 4918
+ cdx-error | 1786
+ wayback-error | 1497
+ null-body | 1302
+ body-too-large | 433
+ wayback-content-error | 245
+ petabox-error | 171
+ gateway-timeout | 138
+ invalid-host-resolution | 120
+ timeout | 12
+ bad-redirect | 4
+ | 3
+ spn2-cdx-lookup-failure | 1
+ (20 rows)
+
+Only the recent updates, by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------+---------
+ accepted | success | 103144
+ accepted | no-pdf-link | 53981
+ accepted | terminal-bad-status | 4102
+ accepted | link-loop | 2799
+ accepted | no-capture | 2315
+ accepted | redirect-loop | 2171
+ accepted | blocked-cookie | 234
+ accepted | cdx-error | 140
+ accepted | wayback-error | 101
+ accepted | wrong-mimetype | 38
+ accepted | null-body | 10
+ accepted | petabox-error | 5
+ accepted | wayback-content-error | 4
+ accepted | gateway-timeout | 2
+ accepted | body-too-large | 2
+ published | success | 1919100
+ published | no-capture | 130104
+ published | redirect-loop | 127482
+ published | terminal-bad-status | 43118
+ published | no-pdf-link | 33505
+ published | blocked-cookie | 19034
+ published | link-loop | 6241
+ published | wrong-mimetype | 4163
+ published | null-body | 1195
+ published | cdx-error | 1151
+ published | wayback-error | 1105
+ published | wayback-content-error | 197
+ published | body-too-large | 195
+ published | petabox-error | 118
+ published | gateway-timeout | 35
+ published | invalid-host-resolution | 13
+ published | timeout | 8
+ published | bad-redirect | 2
+ published | spn2-cdx-lookup-failure | 1
+ published | bad-gzip-encoding | 1
+ submitted | success | 668014
+ submitted | redirect-loop | 97675
+ submitted | terminal-bad-status | 71723
+ submitted | no-capture | 24949
+ submitted | no-pdf-link | 5212
+ submitted | wrong-mimetype | 717
+ submitted | cdx-error | 495
+ submitted | wayback-error | 291
+ submitted | body-too-large | 236
+ submitted | blocked-cookie | 210
+ submitted | link-loop | 209
+ submitted | invalid-host-resolution | 107
+ submitted | gateway-timeout | 101
+ submitted | null-body | 97
+ submitted | petabox-error | 48
+ submitted | wayback-content-error | 44
+ submitted | timeout | 4
+ submitted | | 3
+ submitted | bad-redirect | 2
+ submitted | remote-server-error | 1
+ (55 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 3,325,954 raw ingest requests (new URLs total)
+- 1,743,186 (52% of all) of these had not been seen/crawled from any source yet (?), and attempted to crawl
+- 1,346,654 (77% of crawled) success from new heritrix crawling
+- 2,690,258 (81%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success)
+
+## Live Ingest Follow-Up
+
+Will run SPN requests on the ~160k `no-capture` URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json';
+ => COPY 157371
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json
+ => 157k 0:00:04 [31.6k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ => DONE
diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md
new file mode 100644
index 0000000..5f92196
--- /dev/null
+++ b/notes/ingest/2021-08_mag.md
@@ -0,0 +1,400 @@
+
+Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest.
+Also want to re-ingest some old/failed ingests, now that pipeline/code has
+improved.
+
+Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs.
+
+
+## Persist Ingest Requests
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000})
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 22.5M 0:46:00 [8.16k/s]
+ => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585})
+
+Roughly 8.7 million new URLs (8,686,315 `insert-requests`).
+
+## Pre-Crawl Status Counts
+
+Status of combined old and new requests, with some large domains removed:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------+----------
+ success | 26123975
+ | 6664846
+ no-pdf-link | 1859908
+ redirect-loop | 1532405
+ no-capture | 1199126
+ link-loop | 1157010
+ terminal-bad-status | 832362
+ gateway-timeout | 202158
+ spn2-cdx-lookup-failure | 81406
+ wrong-mimetype | 69087
+ invalid-host-resolution | 37262
+ wayback-error | 21340
+ petabox-error | 11237
+ null-body | 9414
+ wayback-content-error | 2199
+ cdx-error | 1893
+ spn2-error | 1741
+ spn2-error:job-failed | 971
+ blocked-cookie | 902
+ spn2-error:invalid-url-syntax | 336
+ (20 rows)
+
+And just the new URLs (note that domain filter shouldn't be required, but
+keeping for consistency):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ | 6664780
+ success | 1957844
+ redirect-loop | 23357
+ terminal-bad-status | 9385
+ no-pdf-link | 8315
+ no-capture | 6892
+ link-loop | 4517
+ wrong-mimetype | 3864
+ cdx-error | 1749
+ blocked-cookie | 842
+ null-body | 747
+ wayback-error | 688
+ wayback-content-error | 570
+ gateway-timeout | 367
+ petabox-error | 340
+ spn2-cdx-lookup-failure | 150
+ read-timeout | 122
+ not-found | 119
+ invalid-host-resolution | 63
+ spn2-error | 23
+ (20 rows)
+
+## Dump Initial Bulk Ingest Requests
+
+Note that this is all-time, not just recent, and will re-process a lot of
+"no-pdf-link":
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-pdf-link'
+ OR ingest_file_result.status = 'cdx-error'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json';
+ => COPY 8526647
+
+Transform to ingest requests:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json
+ => 8.53M 0:03:40
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+Updated stats after running initial bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 5184994
+ no-capture | 3284416
+ redirect-loop | 98685
+ terminal-bad-status | 28733
+ link-loop | 28518
+ blocked-cookie | 22338
+ no-pdf-link | 19073
+ wrong-mimetype | 9122
+ null-body | 2793
+ wayback-error | 2128
+ wayback-content-error | 1233
+ cdx-error | 1198
+ petabox-error | 617
+ gateway-timeout | 395
+ not-found | 130
+ read-timeout | 128
+ | 111
+ invalid-host-resolution | 63
+ spn2-cdx-lookup-failure | 24
+ spn2-error | 20
+ (20 rows)
+
+## Generate Seedlist
+
+For crawling, do a similar (but not identical) dump:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json';
+ => COPY 4599519
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json
+ => 4.60M 0:02:55 [26.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+ cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ => DONE
+
+ wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt
+ 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+
+## Post-Crawl Bulk Re-Ingest
+
+Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by
+hash, URL agnostic).
+
+Enqueue for bulk re-ingest:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => Thu 19 Aug 2021 09:10:59 PM UTC
+
+
+## Post-Ingest Stats
+
+Just the new stuff (compare against above for delta):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 7748241 89.2%
+ no-capture | 429688 4.9%
+ redirect-loop | 172831 2.0%
+ terminal-bad-status | 94029 1.1%
+ no-pdf-link | 86437 1.0%
+ blocked-cookie | 67903 0.8%
+ link-loop | 50622
+ wrong-mimetype | 21064
+ null-body | 6650
+ cdx-error | 3313
+ wayback-error | 2630
+ gateway-timeout | 399
+ petabox-error | 268
+ wayback-content-error | 170
+ not-found | 130
+ read-timeout | 128
+ | 109
+ invalid-host-resolution | 63
+ bad-redirect | 39
+ spn2-error | 20
+ (20 rows)
+
+New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397
+
+Overall success of new batch: 7748241 / 8686315 = 89.2%
+
+And combined (old and new) status again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 31990062
+ redirect-loop | 1704717
+ no-capture | 1263462
+ link-loop | 1218280
+ blocked-cookie | 1213838
+ no-pdf-link | 1096664
+ terminal-bad-status | 960070
+ gateway-timeout | 202190
+ wrong-mimetype | 86557
+ invalid-host-resolution | 37262
+ null-body | 15443
+ wayback-error | 12839
+ cdx-error | 4047
+ spn2-error | 1731
+ spn2-error:job-failed | 962
+ petabox-error | 463
+ wayback-content-error | 379
+ spn2-error:invalid-url-syntax | 336
+ spn2-error:soft-time-limit-exceeded | 203
+ | 175
+ (20 rows)
+
+New success total: 31990062 - 26123975 = 5,866,087
+
+A full 1,263,462 no-capture that could be attempted... though many of those may
+be excluded for a specific reason.
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md
new file mode 100644
index 0000000..ac808dd
--- /dev/null
+++ b/notes/ingest/2021-09-02_oai_pmh_patch.md
@@ -0,0 +1,1578 @@
+
+Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially
+re-crawling content which failed to ingest the first time.
+
+May fold this in with more general patch crawling.
+
+## Basic Counts
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 14145387
+ no-pdf-link | 12063022
+ no-capture | 5485640
+ redirect-loop | 2092705
+ terminal-bad-status | 747372
+ wrong-mimetype | 597219
+ link-loop | 542144
+ null-body | 93566
+ cdx-error | 19798
+ petabox-error | 17943
+ | 15283
+ wayback-error | 13897
+ gateway-timeout | 511
+ skip-url-blocklist | 184
+ wayback-content-error | 146
+ bad-redirect | 137
+ redirects-exceeded | 120
+ bad-gzip-encoding | 116
+ timeout | 80
+ blocked-cookie | 64
+ (20 rows)
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 40;
+
+
+ oai_prefix | success | total
+ ---------------------------+---------+---------
+ repec | 1133175 | 2783448
+ hal | 573218 | 1049607
+ www.irgrid.ac.cn | 18007 | 748828
+ cds.cern.ch | 74078 | 688091
+ americanae.aecid.es | 71310 | 572792
+ juser.fz-juelich.de | 23026 | 518551
+ espace.library.uq.edu.au | 6649 | 508960
+ igi.indrastra.com | 59629 | 478577
+ archive.ugent.be | 65306 | 424014
+ hrcak.srce.hr | 404085 | 414897
+ zir.nsk.hr | 156753 | 397200
+ renati.sunedu.gob.pe | 79362 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7997 | 354529
+ generic.eprints.org | 263566 | 340470
+ invenio.nusl.cz | 6340 | 325867
+ evastar-karlsruhe.de | 62282 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ diva.org | 67917 | 298348
+ t2r2.star.titech.ac.jp | 1085 | 289388
+ edpsciences.org | 139495 | 284972
+ repository.ust.hk | 10245 | 283417
+ revues.org | 151156 | 277497
+ pure.atira.dk | 13492 | 260754
+ bibliotecadigital.jcyl.es | 50606 | 254134
+ escholarship.org/ark | 140835 | 245203
+ ojs.pkp.sfu.ca | 168029 | 229387
+ lup.lub.lu.se | 49358 | 226602
+ library.wur.nl | 15051 | 216738
+ digitalrepository.unm.edu | 111704 | 211749
+ infoscience.tind.io | 60166 | 207299
+ edoc.mpg.de | 0 | 205252
+ erudit.org | 168490 | 197803
+ delibra.bg.polsl.pl | 38666 | 196652
+ n/a | 0 | 193814
+ aleph.bib-bvb.de | 4349 | 186666
+ serval.unil.ch | 41643 | 186372
+ orbi.ulg.ac.be | 2400 | 184551
+ digitalcommons.unl.edu | 144025 | 184372
+ bib-pubdb1.desy.de | 33525 | 182717
+ (40 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 50;
+
+ oai_prefix | status | count
+ ---------------------------+---------------+---------
+ repec | success | 1133175
+ repec | no-pdf-link | 638105
+ hal | success | 573218
+ cds.cern.ch | no-capture | 540380
+ repec | redirect-loop | 516451
+ juser.fz-juelich.de | no-pdf-link | 477881
+ americanae.aecid.es | no-pdf-link | 417766
+ hrcak.srce.hr | success | 404085
+ www.irgrid.ac.cn | no-pdf-link | 370908
+ hal | no-pdf-link | 359252
+ www.irgrid.ac.cn | no-capture | 355532
+ espace.library.uq.edu.au | no-pdf-link | 320479
+ igi.indrastra.com | no-pdf-link | 318242
+ repec | no-capture | 316981
+ invenio.nusl.cz | no-pdf-link | 309802
+ rour.neicon.ru | redirect-loop | 300911
+ hypotheses.org | no-pdf-link | 300251
+ renati.sunedu.gob.pe | no-capture | 282800
+ t2r2.star.titech.ac.jp | no-pdf-link | 272045
+ generic.eprints.org | success | 263566
+ quod.lib.umich.edu | no-pdf-link | 259661
+ archive.ugent.be | no-capture | 256127
+ evastar-karlsruhe.de | no-pdf-link | 248939
+ zir.nsk.hr | link-loop | 226919
+ repository.ust.hk | no-pdf-link | 208569
+ edoc.mpg.de | no-pdf-link | 199758
+ bibliotecadigital.jcyl.es | no-pdf-link | 188433
+ orbi.ulg.ac.be | no-pdf-link | 172373
+ diva.org | no-capture | 171115
+ lup.lub.lu.se | no-pdf-link | 168652
+ erudit.org | success | 168490
+ ojs.pkp.sfu.ca | success | 168029
+ lib.dr.iastate.edu | success | 158494
+ zir.nsk.hr | success | 156753
+ digital.kenyon.edu | success | 154900
+ revues.org | success | 151156
+ books.openedition.org | no-pdf-link | 149607
+ freidok.uni-freiburg.de | no-pdf-link | 146837
+ digitalcommons.unl.edu | success | 144025
+ escholarship.org/ark | success | 140835
+ culeuclid | link-loop | 140291
+ edpsciences.org | success | 139495
+ serval.unil.ch | no-pdf-link | 138644
+ bib-pubdb1.desy.de | no-pdf-link | 133815
+ krm.or.kr | no-pdf-link | 132461
+ pure.atira.dk | no-pdf-link | 132179
+ oai-gms.dimdi.de | redirect-loop | 131409
+ aleph.bib-bvb.de | no-capture | 128261
+ library.wur.nl | no-pdf-link | 124718
+ lirias2repo.kuleuven.be | no-capture | 123106
+ (50 rows)
+
+Note: could we just delete the "excluded" rows, stop harvesting them in the
+future, and filter them out at ingest time (in the transform script)?
+
+
+
+## Investigate no-pdf-link sandcrawler improvements
+
+Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works:
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%'
+ ORDER BY random()
+ LIMIT 10;
+
+Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works):
+
+ \x auto
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ ORDER BY random()
+ LIMIT 30;
+
+### repec (SKIP-PREFIX)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351
+base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115
+base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92
+base_url | http://pz.wz.uw.edu.pl/en
+terminal_url | http://pz.wz.uw.edu.pl:80/en
+-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351
+base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115
+base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412
+-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92
+base_url | http://pz.wz.uw.edu.pl/en
+terminal_url | http://pz.wz.uw.edu.pl:80/en
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35
+base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html
+terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647
+base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf
+terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75
+base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec
+terminal_url | https://www.jstor.org/stable/1884373
+
+Huh! This is just a catalog of other domains. Should probably skip it.
+
+DONE: skip/filter repec
+
+### juser.fz-juelich.de (SCOPE)
+
+-[ RECORD 1 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:132217
+base_url | http://juser.fz-juelich.de/record/132217
+terminal_url | http://juser.fz-juelich.de/record/132217
+
+Poster; no files.
+
+-[ RECORD 2 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:268598
+base_url | http://juser.fz-juelich.de/record/268598
+terminal_url | http://juser.fz-juelich.de/record/268598
+
+Journal.
+
+-[ RECORD 3 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:126613
+base_url | http://juser.fz-juelich.de/record/126613
+terminal_url | http://juser.fz-juelich.de/record/126613
+
+-[ RECORD 4 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:67362
+base_url | http://juser.fz-juelich.de/record/67362
+terminal_url | http://juser.fz-juelich.de/record/67362
+-[ RECORD 5 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:869189
+base_url | http://juser.fz-juelich.de/record/869189
+terminal_url | http://juser.fz-juelich.de/record/869189
+-[ RECORD 6 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:810746
+base_url | http://juser.fz-juelich.de/record/810746
+terminal_url | http://juser.fz-juelich.de/record/810746
+-[ RECORD 7 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:52897
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+-[ RECORD 8 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:114755
+base_url | http://juser.fz-juelich.de/record/114755
+terminal_url | http://juser.fz-juelich.de/record/114755
+-[ RECORD 9 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:58025
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+
+The search URLs seem redundant? Not going to try to handle those.
+
+"Powered by Invenio v1.1.7"
+
+None of these examples seem to be papers. Maybe we can filter these better
+at the harvest or transform stage?
+
+### americanae.aecid.es (MIXED)
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:502896
+base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+
+just a metadata record? links to redalyc
+
+METADATA-ONLY
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:534600
+base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:524567
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+
+NOT-FOUND (404)
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:378914
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+
+Some single-page image archival thing? bespoke, skipping.
+
+SKIP-BESPOKE
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:526142
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+
+NOT-FOUND (404)
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:373408
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+
+NOT-FOUND (404)
+
+### www.irgrid.ac.cn (SKIP-PREFIX)
+
+Chinese Academy of Sciences Institutional Repositories Grid
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1749980
+base_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+
+Can't access
+
+FORBIDDEN
+
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/857397
+base_url | http://www.irgrid.ac.cn/handle/1471x/857397
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397
+
+Just linking to another IR; skip it.
+
+http://ir.ipe.ac.cn/handle/122111/10608
+
+requires login
+
+DONE: '/password-login;jsessionid' as a loginwall URL pattern
+ http://ir.ipe.ac.cn/handle/122111/10608
+ http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf
+
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1060447
+base_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1671377
+base_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1178430
+base_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2488017
+base_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/977147
+base_url | http://www.irgrid.ac.cn/handle/1471x/977147
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2454503
+base_url | http://ir.nwipb.ac.cn/handle/363003/9957
+terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957
+
+this domain is a disappointment :(
+
+should continue crawling, as the metadata is open and good, but won't get fulltext?
+
+### hal (FIXED-PARTIAL)
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00744951v1
+base_url | https://hal.archives-ouvertes.fr/hal-00744951
+terminal_url | https://hal.archives-ouvertes.fr/hal-00744951
+
+Off-site OA link.
+
+FIXED-HAL
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-01065398v1
+base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf
+terminal_url | https://hal.archives-ouvertes.fr/index/index
+-[ RECORD 3 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:lirmm-00371599v1
+base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+
+Links to Elsevier :(
+
+-[ RECORD 4 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00284780v1
+base_url | https://hal.archives-ouvertes.fr/hal-00284780
+terminal_url | https://hal.archives-ouvertes.fr/hal-00284780
+
+METADATA-ONLY
+
+-[ RECORD 5 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00186151v1
+base_url | https://hal.archives-ouvertes.fr/hal-00186151
+terminal_url | https://hal.archives-ouvertes.fr/hal-00186151
+
+METADATA-ONLY
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00399754v1
+base_url | https://hal.archives-ouvertes.fr/hal-00399754
+terminal_url | https://hal.archives-ouvertes.fr/hal-00399754
+
+METADATA-ONLY
+
+
+### espace.library.uq.edu.au (SKIP)
+
+-[ RECORD 1 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:136497
+base_url | https://espace.library.uq.edu.au/view/UQ:136497
+terminal_url | https://espace.library.uq.edu.au/view/UQ:136497
+-[ RECORD 2 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:411389
+base_url | https://espace.library.uq.edu.au/view/UQ:411389
+terminal_url | https://espace.library.uq.edu.au/view/UQ:411389
+-[ RECORD 3 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:401773
+base_url | https://espace.library.uq.edu.au/view/UQ:401773
+terminal_url | https://espace.library.uq.edu.au/view/UQ:401773
+-[ RECORD 4 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:675334
+base_url | https://espace.library.uq.edu.au/view/UQ:675334
+terminal_url | https://espace.library.uq.edu.au/view/UQ:675334
+-[ RECORD 5 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:312311
+base_url | https://espace.library.uq.edu.au/view/UQ:312311
+terminal_url | https://espace.library.uq.edu.au/view/UQ:312311
+-[ RECORD 6 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:209401
+base_url | https://espace.library.uq.edu.au/view/UQ:209401
+terminal_url | https://espace.library.uq.edu.au/view/UQ:209401
+-[ RECORD 7 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:327188
+base_url | https://espace.library.uq.edu.au/view/UQ:327188
+terminal_url | https://espace.library.uq.edu.au/view/UQ:327188
+
+Very JavaScript-heavy (skeletal HTML), and just links to fulltext on publisher
+sites.
+
+### igi.indrastra.com (METADATA-ONLY)
+
+-[ RECORD 1 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:267221
+base_url | http://igi.indrastra.com/items/show/267221
+terminal_url | http://igi.indrastra.com/items/show/267221
+-[ RECORD 2 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:181799
+base_url | http://igi.indrastra.com/items/show/181799
+terminal_url | http://igi.indrastra.com/items/show/181799
+-[ RECORD 3 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:125382
+base_url | http://igi.indrastra.com/items/show/125382
+terminal_url | http://igi.indrastra.com/items/show/125382
+-[ RECORD 4 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:47266
+base_url | http://igi.indrastra.com/items/show/47266
+terminal_url | http://igi.indrastra.com/items/show/47266
+-[ RECORD 5 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:12872
+base_url | http://igi.indrastra.com/items/show/12872
+terminal_url | http://igi.indrastra.com/items/show/12872
+-[ RECORD 6 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:231620
+base_url | http://igi.indrastra.com/items/show/231620
+terminal_url | http://igi.indrastra.com/items/show/231620
+
+"Proudly powered by Omeka"
+
+### invenio.nusl.cz (METADATA-ONLY)
+
+ oai_id | base_url | terminal_url
+----------------------------+------------------------------------+--------------------------------------
+ oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409
+ oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783
+ oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961
+ oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | http://invenio.nusl.cz/record/318800
+ oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695
+ oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393
+ oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987
+ oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396
+ oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512
+ oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631
+
+Metadata only (at least this set)
+
+### hypotheses.org
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:mittelalter/9529
+base_url | http://mittelalter.hypotheses.org/9529
+terminal_url | https://mittelalter.hypotheses.org/9529
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/18638
+base_url | http://archivalia.hypotheses.org/18638
+terminal_url | https://archivalia.hypotheses.org/18638
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/13614
+base_url | http://archivalia.hypotheses.org/13614
+terminal_url | https://archivalia.hypotheses.org/13614
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:teteschercheuses/2785
+base_url | http://teteschercheuses.hypotheses.org/2785
+terminal_url | https://teteschercheuses.hypotheses.org/2785
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:altervsego/608
+base_url | http://altervsego.hypotheses.org/608
+terminal_url | http://altervsego.hypotheses.org/608
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivewk1/21905
+base_url | http://archivewk1.hypotheses.org/21905
+terminal_url | https://archivewk1.hypotheses.org/21905
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:slkdiaspo/3321
+base_url | http://slkdiaspo.hypotheses.org/3321
+terminal_url | https://slkdiaspo.hypotheses.org/3321
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:diga/280
+base_url | http://diga.hypotheses.org/280
+terminal_url | https://diga.hypotheses.org/280
+
+These are all a big mix... basically blogs. Should continue crawling, but expect no yield.
+
+### t2r2.star.titech.ac.jp (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00105099
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00101346
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50161100
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00232407
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50120040
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50321440
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50235666
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+
+
+### quod.lib.umich.edu
+
+-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2
+base_url | http://name.umdl.umich.edu/acf2679.0015.003
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:b14970.0001.001
+base_url | http://name.umdl.umich.edu/B14970.0001.001
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3
+base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43
+base_url | http://name.umdl.umich.edu/acg2248.1-16.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9
+base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9
+base_url | http://name.umdl.umich.edu/acg1336.1-24.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a
+base_url | http://name.umdl.umich.edu/africanamer.0002.32a
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a
+
+These are... issues of journals? Should continue to crawl, but not expect much.
+
+### evastar-karlsruhe.de (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:270011444
+base_url | https://publikationen.bibliothek.kit.edu/270011444
+terminal_url | https://publikationen.bibliothek.kit.edu/270011444
+-[ RECORD 2 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000050117
+base_url | https://publikationen.bibliothek.kit.edu/1000050117
+terminal_url | https://publikationen.bibliothek.kit.edu/1000050117
+-[ RECORD 3 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:362296
+base_url | https://publikationen.bibliothek.kit.edu/362296
+terminal_url | https://publikationen.bibliothek.kit.edu/362296
+-[ RECORD 4 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:23042000
+base_url | https://publikationen.bibliothek.kit.edu/23042000
+terminal_url | https://publikationen.bibliothek.kit.edu/23042000
+-[ RECORD 5 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000069945
+base_url | https://publikationen.bibliothek.kit.edu/1000069945
+terminal_url | https://publikationen.bibliothek.kit.edu/1000069945
+
+
+### repository.ust.hk
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-67233
+base_url | http://repository.ust.hk/ir/Record/1783.1-67233
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233
+-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-63232
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017
+terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-2891
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103
+terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-56231
+base_url | http://repository.ust.hk/ir/Record/1783.1-56231
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231
+
+[...]
+
+-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-24872
+base_url | http://repository.ust.hk/ir/Record/1783.1-24872
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872
+-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-3457
+base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+terminal_url | http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-73215
+base_url | http://repository.ust.hk/ir/Record/1783.1-73215
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215
+
+DONE: gateway.isiknowledge.com is bogus/blocking?
+
+
+### edoc.mpg.de (SKIP-DEPRECATED)
+
+ oai_id | base_url | terminal_url
+------------------------+---------------------------+---------------------------
+ oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650
+ oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195
+ oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655
+ oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179
+ oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141
+ oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412
+ oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531
+ oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047
+ oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650
+ oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852
+
+This whole instance seems to have been replaced
+
+### bibliotecadigital.jcyl.es (SKIP-DIGITIZED)
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000039962
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+-[ RECORD 2 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14075
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+-[ RECORD 3 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:4842
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+-[ RECORD 4 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14799
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+-[ RECORD 5 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:821
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+
+Digitized images as pages; too much to deal with for now.
+
+### orbi.ulg.ac.be
+
+-[ RECORD 1 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/128079
+base_url | https://orbi.uliege.be/handle/2268/128079
+terminal_url | https://orbi.uliege.be/handle/2268/128079
+-[ RECORD 2 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/67659
+base_url | https://orbi.uliege.be/handle/2268/67659
+terminal_url | https://orbi.uliege.be/handle/2268/67659
+-[ RECORD 3 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/35521
+base_url | https://orbi.uliege.be/handle/2268/35521
+terminal_url | https://orbi.uliege.be/handle/2268/35521
+-[ RECORD 4 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/107922
+base_url | https://orbi.uliege.be/handle/2268/107922
+terminal_url | https://orbi.uliege.be/handle/2268/107922
+-[ RECORD 5 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/215694
+base_url | https://orbi.uliege.be/handle/2268/215694
+terminal_url | https://orbi.uliege.be/handle/2268/215694
+
+Described below.
+
+### library.wur.nl (FIXED-BESPOKE)
+
+ oai_id | base_url | terminal_url
+ -----------------------------------+------------------------------------------------+------------------------------------------------
+ oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939
+ oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707
+ oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208
+ oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378
+ oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416
+ oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930
+ oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076
+ oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109
+ oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146
+ oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922
+ (10 rows)
+
+Seems like a one-off site? But added a pattern.
+
+### pure.atira.dk
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38
+base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694
+terminal_url | https://www.tandfonline.com/action/cookieAbsent
+-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+
+Metadata only
+
+DONE: /cookieAbsent is cookie block
+ https://www.tandfonline.com/action/cookieAbsent
+
+### bib-pubdb1.desy.de (FIXED-INVENIO)
+
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:96756
+base_url | http://bib-pubdb1.desy.de/record/96756
+terminal_url | http://bib-pubdb1.desy.de/record/96756
+
+Metadata only.
+
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:416556
+base_url | http://bib-pubdb1.desy.de/record/416556
+terminal_url | http://bib-pubdb1.desy.de/record/416556
+
+Fixed!
+
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:414545
+base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:170169
+base_url | http://bib-pubdb1.desy.de/record/170169
+terminal_url | http://bib-pubdb1.desy.de/record/170169
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:191154
+base_url | http://bib-pubdb1.desy.de/record/191154
+terminal_url | http://bib-pubdb1.desy.de/record/191154
+
+Metadata only
+
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:155092
+base_url | http://bib-pubdb1.desy.de/record/155092
+terminal_url | http://bib-pubdb1.desy.de/record/155092
+
+Fixed!
+
+-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:97158
+base_url | http://bib-pubdb1.desy.de/record/97158
+terminal_url | http://bib-pubdb1.desy.de/record/97158
+
+Metadata only
+
+"Powered by Invenio v1.1.7"
+
+Can/should skip the "search" URLs
+
+### serval.unil.ch
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_60346fc75171
+base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_4db47fc4b593
+base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_57aac24fe115
+base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_deabae6baf6c
+base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_a5ec0df1370f
+base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_080300c2e23c
+base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_de777dd2b07f
+base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F
+-[ RECORD 8 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_5e824e244c27
+base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27
+
+Metadata only? See elsewhere.
+
+### Random Links
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dbc.wroc.pl:41031
+base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+
+This is some platform/package thing. PDF is in an iframe. Platform is "DLibra".
+FIXED-DLIBRA
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/174291
+base_url | https://orbi.uliege.be/handle/2268/174291
+terminal_url | https://orbi.uliege.be/handle/2268/174291
+
+DSpace platform. There are multiple files, and little to "select" on.
+
+https://orbi.uliege.be/handle/2268/174200 has only single PDF and easier to work with
+
+PARTIAL-DSPACE
+
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.tue.nl:664163
+base_url | http://repository.tue.nl/664163
+terminal_url | http://repository.tue.nl/664163
+
+Ah, this is the Pure platform from Elsevier.
+Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance
+
+FIXED-PURE
+
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:49579
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+
+(handled above)
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/97937
+base_url | https://orcid.org/0000-0002-2066-2082
+terminal_url | https://orcid.org/0000-0002-2066-2082
+
+ORCID! Skip it.
+
+DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time.
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:edoc.mpg.de:360269
+base_url | http://edoc.mpg.de/360269
+terminal_url | http://edoc.mpg.de/360269
+
+Seems like this whole repo has disappeared, or been replaced by... pure? maybe a different pure?
+
+DONE: edoc.mpg.de -> pure.mpg.de
+
+-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:books.openedition.org:msha/17716
+base_url | http://books.openedition.org/msha/17716
+terminal_url | https://books.openedition.org/msha/17716
+
+Open edition is free to read HTML, but not PDF (or epub, etc).
+
+TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest)
+
+HTML-WORKED
+
+-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epub.oeaw.ac.at:0x003aba48
+base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+
+requires login
+
+FORBIDDEN
+
+-[ RECORD 9 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/88986
+base_url | https://orcid.org/0000-0002-4147-2560
+terminal_url | https://orcid.org/0000-0002-4147-2560
+
+DONE: skip orcids
+
+-[ RECORD 10 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-28786
+base_url | http://repository.ust.hk/ir/Record/1783.1-28786
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786
+
+Generator: VuFind 5.1.1
+just a metadata record
+
+METADATA-ONLY
+
+-[ RECORD 11 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:rcin.org.pl:50797
+base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+
+Seems like a software platform? not sure.
+
+METADATA-ONLY
+
+-[ RECORD 12 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dea.lib.unideb.hu:2437/69641
+base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+
+-[ RECORD 13 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871
+base_url | http://handle.unsw.edu.au/1959.4/64871
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L
+
+-[ RECORD 14 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.wbc.poznan.pl:225930
+base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+
+SOFT-404
+
+-[ RECORD 15 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.erciyes.edu.tr:105
+base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105
+terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105
+
+GONE (domain not registered)
+
+-[ RECORD 16 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:37500
+base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+
+Seems like a bespoke site
+
+SKIP-BESPOKE
+
+-[ RECORD 17 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50401364
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+
+METADATA-ONLY
+
+-[ RECORD 18 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epubs.cclrc.ac.uk:work/4714
+base_url | http://purl.org/net/epubs/work/4714
+terminal_url | https://epubs.stfc.ac.uk/work/4714
+
+It's got a purl! haha.
+
+METADATA-ONLY
+
+------
+
+Another batch! With some repeat domains removed.
+
+-[ RECORD 1 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc
+base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc
+terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov
+
+SKIP
+
+-[ RECORD 2 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-05302014-183910
+base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+
+Some software platform? Pretty basic/bespoke
+
+FIXED-PARTIAL
+
+-[ RECORD 3 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000098246
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+
+SKIP (see elsewhere)
+
+-[ RECORD 7 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:elektra.cdaea.es:documento.29259
+base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+
+Photo.
+
+SKIP-SCOPE
+
+-[ RECORD 9 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829
+base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L
+
+METADATA-ONLY
+
+-[ RECORD 12 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a
+base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+terminal_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+
+unsure
+
+-[ RECORD 16 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/369344
+base_url | https://library.wur.nl/WebQuery/wurpubs/369344
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344
+
+this specific record not OA (but site is fine/fixed)
+
+-[ RECORD 17 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:escholarship.umassmed.edu:oapubs-2146
+base_url | https://escholarship.umassmed.edu/oapubs/1147
+terminal_url | http://escholarship.umassmed.edu/oapubs/1147/
+
+just links to publisher (no content in repo)
+
+-[ RECORD 18 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010
+base_url | https://digitalcommons.usu.edu/wild_facpub/11
+terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/
+
+also just links to publisher (no content in repo)
+
+-[ RECORD 25 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:igi.indrastra.com:306768
+base_url | http://igi.indrastra.com/items/show/306768
+terminal_url | http://igi.indrastra.com/items/show/306768
+
+(see elsewhere)
+
+-[ RECORD 26 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:fau.digital.flvc.org:fau_9804
+base_url | http://purl.flvc.org/fcla/dt/12932
+terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804
+
+Islandora.
+
+-[ RECORD 27 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.lu.lv:7/16019
+base_url | https://dspace.lu.lv/dspace/handle/7/16019
+terminal_url | https://dspace.lu.lv/dspace/handle/7/16019
+
+LOGINWALL
+
+-[ RECORD 28 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:zir.nsk.hr:umas_218
+base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+
+REMOVED
+
+
+-[ RECORD 29 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:36390
+base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+
+Book, with chapters, not an individual work.
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:krm.or.kr:10056135m201r
+base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y
+terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135
+
+research results repository; keep crawling
+
+SKIP-SCOPE
+
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.db-thueringen.de:dbt_mods_00005191
+base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+
+powered by "MyCoRe"
+
+FIXED-MYCORE
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405
+base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+
+seems to be a general purpose regional library? not research-specific
+
+SKIP-UNSURE
+
+-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-02272019-123644
+base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+
+This specific URL is not available (FORBIDDEN)
+
+others have multiple files, not just a single PDF:
+https://etd.adm.unipi.it/t/etd-09102013-124430/
+
+SKIP-UNSURE
+
+-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:commons.ln.edu.hk:sw_master-5408
+base_url | https://commons.ln.edu.hk/sw_master/4408
+terminal_url | https://commons.ln.edu.hk/sw_master/4408/
+
+worth crawling I guess
+
+METADATA-ONLY
+
+-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:mouseion.jax.org:ssbb1976-1224
+base_url | https://mouseion.jax.org/ssbb1976/225
+terminal_url | https://mouseion.jax.org/ssbb1976/225/
+
+METADATA-ONLY
+
+-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aleph.bib-bvb.de:bvb01-016604343
+base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer
+terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true
+
+SOFT-404 / FORBIDDEN (cookie timeout)
+
+-[ RECORD 14 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bivaldi.gva.es:11740
+base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+
+
+-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/443282
+base_url | https://library.wur.nl/WebQuery/wurpubs/443282
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282
+
+DIGIBIS platform (like some others)
+
+FIXED-PARTIAL
+
+-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:hal:in2p3-00414135v1
+base_url | http://hal.in2p3.fr/in2p3-00414135
+terminal_url | http://hal.in2p3.fr:80/in2p3-00414135
+
+METADATA-ONLY
+
+-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aaltodoc.aalto.fi:123456789/13201
+base_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+
+This specific record is not accessible.
+Another: https://aaltodoc.aalto.fi/handle/123456789/38002
+
+DSpace 5.4
+
+Worked (from recent changes)
+
+
+-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:sedici.unlp.edu.ar:10915/40144
+base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+
+This is a journal! Cool. Plone software platform.
+
+FIXED
+
+## Top no-capture Domains
+
+Top terminal no-capture domains:
+
+ SELECT domain, COUNT(domain)
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | count
+ -----------------------------------+-------
+ digitalrepository.unm.edu | 94087
+ escholarship.org | 80632
+ ir.opt.ac.cn | 70504
+ idus.us.es | 67908
+ www.cambridge.org | 56376
+ www.ssoar.info | 52534
+ rep.bntu.by | 52127
+ scholarworks.umt.edu | 48546
+ publikationen.ub.uni-frankfurt.de | 46987
+ dk.um.si | 45753
+ repositorio.uladech.edu.pe | 37028
+ uu.diva-portal.org | 34929
+ digitalcommons.law.byu.edu | 31732
+ sedici.unlp.edu.ar | 31233
+ elib.sfu-kras.ru | 29131
+ jyx.jyu.fi | 28144
+ www.repository.cam.ac.uk | 27728
+ nagoya.repo.nii.ac.jp | 26673
+ www.duo.uio.no | 25258
+ www.persee.fr | 24968
+ www2.senado.leg.br | 24426
+ tesis.ucsm.edu.pe | 24049
+ digitalcommons.unl.edu | 21974
+ www.degruyter.com | 21940
+ www.igi-global.com | 20736
+ thekeep.eiu.edu | 20712
+ docs.lib.purdue.edu | 20538
+ repositorio.cepal.org | 20280
+ elib.bsu.by | 19620
+ minds.wisconsin.edu | 19473
+ (30 rows)
+
+These all seem worth crawling. A couple publishers (cambridge.org), and
+persee.fr will probably fail, but not too many URLs.
+
+## Summary of Filtered Prefixes and Domains (OAI-PMH)
+
+oai:kb.dk:
+ too large and generic
+oai:bdr.oai.bsb-muenchen.de:
+ too large and generic
+oai:hispana.mcu.es:
+ too large and generic
+oai:bnf.fr:
+ too large and generic
+oai:ukm.si:
+ too large and generic
+oai:biodiversitylibrary.org:
+ redundant with other ingest and archive.org content
+oai:hsp.org:
+ large; historical content only
+oai:repec:
+ large; mostly (entirely?) links to publisher sites
+oai:n/a:
+ meta?
+oai:quod.lib.umich.edu:
+ entire issues? hard to crawl so skip for now
+oai:hypotheses.org:
+ HTML, not PDF
+oai:americanae.aecid.es:
+ large, complex. skip for now
+oai:www.irgrid.ac.cn:
+ aggregator of other IRs
+oai:espace.library.uq.edu.au:
+ large; metadata only; javascript heavy (poor heritrix crawling)
+oai:edoc.mpg.de:
+ deprecated domain, with no redirects
+oai:bibliotecadigital.jcyl.es:
+ digitized historical docs; hard to crawl, skip for now
+oai:repository.erciyes.edu.tr:
+ gone (domain lapsed)
+oai:krm.or.kr:
+ "research results repository" (metadata only)
+
+www.kb.dk
+ large, general purpose, scope
+kb-images.kb.dk
+ deprecated
+mdz-nbn-resolving.de
+ multiple prefixes end up here. historical docs, scope
+aggr.ukm.um.si
+ large, out of scope
+edoc.mpg.de
+ deprecated domain
+doaj.org
+ index (metadata only)
+orcid.org
+ out of scope
+gateway.isiknowledge.com
+ clarivate login/paywall (skipping in ingest)
+
+Needs filtering to a subset of records (by 'set' or other filtering?):
+
+oai:igi.indrastra.com:
+oai:invenio.nusl.cz:
+oai:t2r2.star.titech.ac.jp:
+oai:evastar-karlsruhe.de:
+oai:repository.ust.hk:
+oai:serval.unil.ch:
+oai:pure.atira.dk:
+
+Filters in SQL syntax:
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu.au:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+
+and in some contexts (PDFs; switch to HTML):
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+## Overall Summary of OAI-PMH Stuff
+
+Big picture is that the majority of `no-pdf-link` crawl statuses are because of
+repository scope, record scope, or content format issues. That being said,
+there was a sizable fraction of sites which were platforms (like DSpace) which
+were not ingesting well.
+
+A significant fraction of records are "metadata only" (of papers), or non-paper
+entity types (like persons, grants, or journal titles), and a growing fraction
+(?) are metadata plus link to OA publisher fulltext (offsite). Might be
+possible to detect these at ingest time, or earlier at OAI-PMH
+harvest/transform time and filter them out.
+
+It may be worthwhile to attempt ingest of multiple existing captures
+(timestamps) in the ingest pipeline. Eg, instead of choosing a single "best"
+capture, if there are multiple HTTP 200 status captures, try ingest with each
+(or at least a couple). This is because repository software gets upgraded, so
+old "no-capture" or "not found" or "link loop" type captures may work when
+recrawled.
+
+New summary with additional filters:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 12872279
+ no-pdf-link | 9329602
+ no-capture | 4696362
+ redirect-loop | 1541458
+ terminal-bad-status | 660418
+ link-loop | 452831
+ wrong-mimetype | 434868
+ null-body | 71065
+ cdx-error | 17005
+ | 15275
+ petabox-error | 12743
+ wayback-error | 11759
+ skip-url-blocklist | 182
+ gateway-timeout | 122
+ redirects-exceeded | 120
+ bad-redirect | 117
+ bad-gzip-encoding | 111
+ wayback-content-error | 102
+ timeout | 72
+ blocked-cookie | 62
+ (20 rows)
+
diff --git a/notes/ingest/2021-09-03_daily_improvements.md b/notes/ingest/2021-09-03_daily_improvements.md
new file mode 100644
index 0000000..a0bb0c5
--- /dev/null
+++ b/notes/ingest/2021-09-03_daily_improvements.md
@@ -0,0 +1,1021 @@
+
+Periodic check-in of daily crawling/ingest.
+
+Overall ingest status, past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------+--------
+ pdf | no-pdf-link | 158474
+ pdf | spn2-cdx-lookup-failure | 135344
+ pdf | success | 127938
+ pdf | spn2-error | 65411
+ pdf | gateway-timeout | 63112
+ pdf | blocked-cookie | 26338
+ pdf | terminal-bad-status | 24853
+ pdf | link-loop | 15699
+ pdf | spn2-error:job-failed | 13862
+ pdf | redirect-loop | 11432
+ pdf | cdx-error | 2376
+ pdf | too-many-redirects | 2186
+ pdf | wrong-mimetype | 2142
+ pdf | forbidden | 1758
+ pdf | spn2-error:no-status | 972
+ pdf | not-found | 820
+ pdf | bad-redirect | 536
+ pdf | read-timeout | 392
+ pdf | wayback-error | 251
+ pdf | remote-server-error | 220
+ (20 rows)
+
+Hrm, that is a healthy fraction of `no-pdf-link`.
+
+Broken domains, past 30 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 39678
+ osf.io | gateway-timeout | 29809
+ acervus.unicamp.br | no-pdf-link | 21978
+ osf.io | terminal-bad-status | 18727
+ zenodo.org | spn2-cdx-lookup-failure | 17008
+ doi.org | spn2-cdx-lookup-failure | 15503
+ www.degruyter.com | no-pdf-link | 15122
+ ieeexplore.ieee.org | spn2-error:job-failed | 12921
+ osf.io | spn2-cdx-lookup-failure | 11123
+ www.tandfonline.com | blocked-cookie | 8096
+ www.morressier.com | no-pdf-link | 4655
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 4580
+ pubs.acs.org | blocked-cookie | 4415
+ www.frontiersin.org | no-pdf-link | 4163
+ www.degruyter.com | spn2-cdx-lookup-failure | 3788
+ www.taylorfrancis.com | no-pdf-link | 3568
+ www.sciencedirect.com | no-pdf-link | 3128
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 3116
+ acervus.unicamp.br | spn2-cdx-lookup-failure | 2797
+ www.mdpi.com | spn2-cdx-lookup-failure | 2719
+ brill.com | link-loop | 2681
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 2657
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 2546
+ apps.crossref.org | no-pdf-link | 2537
+ onlinelibrary.wiley.com | blocked-cookie | 2528
+ (25 rows)
+
+Summary of significant domains and status, past 30 days, minus spn2-cdx-lookup-failure:
+
+ SELECT domain, status, count
+ FROM (
+ SELECT domain, status, COUNT((domain, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status != 'spn2-cdx-lookup-failure'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY CUBE (domain, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY domain ASC , count DESC;
+
+
+ domain | status | count
+ -----------------------------------------------------------------+-----------------------+--------
+ academic.oup.com | | 2405
+ academic.oup.com | no-pdf-link | 1240
+ academic.oup.com | link-loop | 1010
+ acervus.unicamp.br | | 21980
+ acervus.unicamp.br | no-pdf-link | 21978 **
+ aclanthology.org | | 208
+ acp.copernicus.org | | 365
+ acp.copernicus.org | success | 356
+ aip.scitation.org | | 1071
+ aip.scitation.org | blocked-cookie | 843
+ aip.scitation.org | redirect-loop | 227
+ apps.crossref.org | | 2537
+ apps.crossref.org | no-pdf-link | 2537
+ arxiv.org | | 17817
+ arxiv.org | success | 17370
+ arxiv.org | terminal-bad-status | 320
+ asmedigitalcollection.asme.org | | 401
+ asmedigitalcollection.asme.org | link-loop | 364
+ assets.researchsquare.com | | 3706
+ assets.researchsquare.com | success | 3706
+ avmj.journals.ekb.eg | | 605
+ avmj.journals.ekb.eg | success | 595
+ bfa.journals.ekb.eg | | 224
+ bfa.journals.ekb.eg | success | 214
+ biorxiv.org | redirect-loop | 895
+ biorxiv.org | | 895
+ birdsoftheworld.org | | 286
+ birdsoftheworld.org | no-pdf-link | 285
+ bmjopen.bmj.com | success | 232
+ bmjopen.bmj.com | | 232
+ books.openedition.org | | 396
+ books.openedition.org | no-pdf-link | 396
+ brill.com | | 4272
+ brill.com | link-loop | 2681
+ brill.com | no-pdf-link | 1410
+ cas.columbia.edu | | 1038
+ cas.columbia.edu | no-pdf-link | 1038 **
+ cdr.lib.unc.edu | | 513
+ cdr.lib.unc.edu | success | 469
+ chemrxiv.org | | 278
+ chemrxiv.org | success | 275
+ classiques-garnier.com | | 531
+ classiques-garnier.com | no-pdf-link | 487 *
+ content.iospress.com | | 275
+ content.iospress.com | link-loop | 230
+ cris.maastrichtuniversity.nl | | 318
+ cris.maastrichtuniversity.nl | success | 284
+ cyberleninka.ru | | 1165
+ cyberleninka.ru | success | 1134
+ deepblue.lib.umich.edu | | 289
+ dergipark.org.tr | | 1185
+ dergipark.org.tr | success | 774
+ dergipark.org.tr | no-pdf-link | 320
+ didaktorika.gr | | 688
+ didaktorika.gr | redirect-loop | 688
+ digi.ub.uni-heidelberg.de | | 292
+ digi.ub.uni-heidelberg.de | no-pdf-link | 292
+ direct.mit.edu | | 236
+ direct.mit.edu | no-pdf-link | 207 *
+ dl.acm.org | | 2319
+ dl.acm.org | blocked-cookie | 2230
+ dmtcs.episciences.org | | 733
+ dmtcs.episciences.org | success | 730
+ doi.ala.org.au | no-pdf-link | 2373 **
+ doi.ala.org.au | | 2373
+ doi.org | | 732
+ doi.org | terminal-bad-status | 673
+ downloads.hindawi.com | success | 1452
+ downloads.hindawi.com | | 1452
+ drive.google.com | | 216
+ drive.google.com | no-pdf-link | 211
+ dtb.bmj.com | | 674
+ dtb.bmj.com | link-loop | 669
+ easy.dans.knaw.nl | no-pdf-link | 261 *
+ easy.dans.knaw.nl | | 261
+ ebooks.marilia.unesp.br | | 688
+ ebooks.marilia.unesp.br | no-pdf-link | 688 *
+ ehp.niehs.nih.gov | | 766
+ ehp.niehs.nih.gov | blocked-cookie | 765
+ ejournal.mandalanursa.org | | 307
+ ejournal.mandalanursa.org | success | 305
+ elib.spbstu.ru | | 264
+ elib.spbstu.ru | redirect-loop | 257
+ elibrary.ru | | 1367
+ elibrary.ru | redirect-loop | 1169
+ elibrary.vdi-verlag.de | | 1251
+ elibrary.vdi-verlag.de | no-pdf-link | 646
+ elibrary.vdi-verlag.de | link-loop | 537
+ elifesciences.org | | 328
+ elifesciences.org | success | 323
+ figshare.com | | 803
+ figshare.com | no-pdf-link | 714 *
+ files.osf.io | | 745
+ files.osf.io | success | 614
+ hammer.purdue.edu | | 244
+ hammer.purdue.edu | no-pdf-link | 243
+ heiup.uni-heidelberg.de | | 277
+ heiup.uni-heidelberg.de | no-pdf-link | 268
+ hkvalidate.perfdrive.com | no-pdf-link | 370 *
+ hkvalidate.perfdrive.com | | 370
+ ieeexplore.ieee.org | | 16675
+ ieeexplore.ieee.org | spn2-error:job-failed | 12927
+ ieeexplore.ieee.org | success | 1952
+ ieeexplore.ieee.org | too-many-redirects | 1193
+ ieeexplore.ieee.org | no-pdf-link | 419
+ jamanetwork.com | | 339
+ jamanetwork.com | success | 216
+ jmstt.ntou.edu.tw | | 244
+ jmstt.ntou.edu.tw | success | 241
+ journal.ipb.ac.id | | 229
+ journal.ipb.ac.id | success | 206
+ journal.nafe.org | | 221
+ journals.aps.org | | 614
+ journals.aps.org | gateway-timeout | 495
+ journals.asm.org | | 463
+ journals.asm.org | blocked-cookie | 435
+ journals.flvc.org | | 230
+ journals.lww.com | | 1300
+ journals.lww.com | link-loop | 1284
+ journals.openedition.org | | 543
+ journals.openedition.org | success | 311
+ journals.ub.uni-heidelberg.de | | 357
+ journals.ub.uni-heidelberg.de | success | 311
+ jov.arvojournals.org | | 431
+ jov.arvojournals.org | no-pdf-link | 422 *
+ kiss.kstudy.com | | 303
+ kiss.kstudy.com | no-pdf-link | 303 *
+ library.iated.org | | 364
+ library.iated.org | redirect-loop | 264
+ library.seg.org | blocked-cookie | 301
+ library.seg.org | | 301
+ link.aps.org | redirect-loop | 442
+ link.aps.org | | 442
+ linkinghub.elsevier.com | | 515
+ linkinghub.elsevier.com | gateway-timeout | 392
+ mc.sbm.org.br | | 224
+ mc.sbm.org.br | success | 224
+ mdpi-res.com | | 742
+ mdpi-res.com | success | 742
+ mdsoar.org | | 220
+ mediarep.org | | 269
+ mediarep.org | success | 264
+ medrxiv.org | redirect-loop | 290
+ medrxiv.org | | 290
+ muse.jhu.edu | | 429
+ muse.jhu.edu | terminal-bad-status | 391
+ mvmj.journals.ekb.eg | | 306
+ oapub.org | | 292
+ oapub.org | success | 289
+ onepetro.org | | 426
+ onepetro.org | link-loop | 406
+ onlinelibrary.wiley.com | | 2835
+ onlinelibrary.wiley.com | blocked-cookie | 2531
+ onlinelibrary.wiley.com | redirect-loop | 264
+ open.library.ubc.ca | | 569
+ open.library.ubc.ca | no-pdf-link | 425 *
+ opendata.uni-halle.de | | 407
+ opendata.uni-halle.de | success | 263
+ osf.io | | 49022
+ osf.io | gateway-timeout | 29810
+ osf.io | terminal-bad-status | 18731
+ osf.io | spn2-error | 247
+ osf.io | not-found | 205
+ oxford.universitypressscholarship.com | | 392
+ oxford.universitypressscholarship.com | link-loop | 233
+ panor.ru | no-pdf-link | 433 *
+ panor.ru | | 433
+ papers.ssrn.com | | 1630
+ papers.ssrn.com | link-loop | 1598
+ pdf.sciencedirectassets.com | | 3063
+ pdf.sciencedirectassets.com | success | 3063
+ peerj.com | | 464
+ peerj.com | no-pdf-link | 303 *
+ periodicos.ufpe.br | | 245
+ periodicos.ufpe.br | success | 232
+ periodicos.unb.br | | 230
+ periodicos.unb.br | success | 221
+ preprints.jmir.org | | 548
+ preprints.jmir.org | cdx-error | 499
+ publications.rwth-aachen.de | | 213
+ publikationen.bibliothek.kit.edu | | 346
+ publikationen.bibliothek.kit.edu | success | 314
+ publikationen.uni-tuebingen.de | | 623
+ publikationen.uni-tuebingen.de | no-pdf-link | 522 *
+ publons.com | no-pdf-link | 934 *
+ publons.com | | 934
+ pubs.acs.org | | 4507
+ pubs.acs.org | blocked-cookie | 4406
+ pubs.rsc.org | | 1638
+ pubs.rsc.org | link-loop | 1054
+ pubs.rsc.org | redirect-loop | 343
+ pubs.rsc.org | success | 201
+ repositorio.ufu.br | | 637
+ repositorio.ufu.br | success | 607
+ repository.dri.ie | | 1852
+ repository.dri.ie | no-pdf-link | 1852 **
+ repository.library.brown.edu | | 293
+ repository.library.brown.edu | no-pdf-link | 291 *
+ res.mdpi.com | | 10367
+ res.mdpi.com | success | 10360
+ retrovirology.biomedcentral.com | | 230
+ revistas.ufrj.br | | 284
+ revistas.ufrj.br | success | 283
+ revistas.uptc.edu.co | | 385
+ revistas.uptc.edu.co | success | 344
+ royalsocietypublishing.org | | 231
+ rsdjournal.org | | 347
+ rsdjournal.org | success | 343
+ s3-ap-southeast-2.amazonaws.com | | 400
+ s3-ap-southeast-2.amazonaws.com | success | 392
+ s3-eu-west-1.amazonaws.com | | 2096
+ s3-eu-west-1.amazonaws.com | success | 2091
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 289
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 286
+ s3.ca-central-1.amazonaws.com | | 202
+ sage.figshare.com | | 242
+ sage.figshare.com | no-pdf-link | 241
+ sajeb.org | | 246
+ sajeb.org | no-pdf-link | 243
+ scholar.dkyobobook.co.kr | | 332
+ scholar.dkyobobook.co.kr | no-pdf-link | 328 *
+ search.mandumah.com | | 735
+ search.mandumah.com | redirect-loop | 726
+ secure.jbs.elsevierhealth.com | | 1112
+ secure.jbs.elsevierhealth.com | blocked-cookie | 1108
+ stm.bookpi.org | no-pdf-link | 468 *
+ stm.bookpi.org | | 468
+ storage.googleapis.com | | 1012
+ storage.googleapis.com | success | 1012
+ tandf.figshare.com | | 469
+ tandf.figshare.com | no-pdf-link | 466
+ teses.usp.br | | 739
+ teses.usp.br | success | 730
+ tidsskrift.dk | | 360
+ tidsskrift.dk | success | 346
+ tiedejaedistys.journal.fi | | 224
+ tind-customer-agecon.s3.amazonaws.com | success | 332
+ tind-customer-agecon.s3.amazonaws.com | | 332
+ valep.vc.univie.ac.at | no-pdf-link | 280
+ valep.vc.univie.ac.at | | 280
+ watermark.silverchair.com | | 1729
+ watermark.silverchair.com | success | 1719
+ www.academia.edu | | 387
+ www.academia.edu | no-pdf-link | 386
+ www.ahajournals.org | | 430
+ www.ahajournals.org | blocked-cookie | 413
+ www.atenaeditora.com.br | | 572
+ www.atenaeditora.com.br | terminal-bad-status | 513
+ www.atlantis-press.com | success | 722
+ www.atlantis-press.com | | 722
+ www.aup-online.com | | 419
+ www.aup-online.com | no-pdf-link | 419 *
+ www.beck-elibrary.de | | 269
+ www.beck-elibrary.de | no-pdf-link | 268 *
+ www.biodiversitylibrary.org | no-pdf-link | 528 *
+ www.biodiversitylibrary.org | | 528
+ www.bloomsburycollections.com | | 623
+ www.bloomsburycollections.com | no-pdf-link | 605 *
+ www.cabi.org | | 2191
+ www.cabi.org | no-pdf-link | 2186 *
+ www.cairn.info | | 1283
+ www.cairn.info | no-pdf-link | 713
+ www.cairn.info | link-loop | 345
+ www.cambridge.org | | 4128
+ www.cambridge.org | no-pdf-link | 1531
+ www.cambridge.org | success | 1441
+ www.cambridge.org | link-loop | 971
+ www.cureus.com | no-pdf-link | 526 *
+ www.cureus.com | | 526
+ www.dbpia.co.kr | | 637
+ www.dbpia.co.kr | redirect-loop | 631
+ www.deboni.he.com.br | | 382
+ www.deboni.he.com.br | success | 381
+ www.degruyter.com | | 17783
+ www.degruyter.com | no-pdf-link | 15102
+ www.degruyter.com | success | 2584
+ www.dovepress.com | | 480
+ www.dovepress.com | success | 472
+ www.e-manuscripta.ch | | 1350
+ www.e-manuscripta.ch | no-pdf-link | 1350 *
+ www.e-periodica.ch | | 1276
+ www.e-periodica.ch | no-pdf-link | 1275
+ www.e-rara.ch | | 202
+ www.e-rara.ch | no-pdf-link | 202
+ www.elgaronline.com | | 495
+ www.elgaronline.com | link-loop | 290
+ www.elibrary.ru | | 922
+ www.elibrary.ru | no-pdf-link | 904
+ www.emerald.com | | 2155
+ www.emerald.com | no-pdf-link | 1936 *
+ www.emerald.com | success | 219
+ www.eurekaselect.com | | 518
+ www.eurekaselect.com | no-pdf-link | 516 *
+ www.frontiersin.org | | 4163
+ www.frontiersin.org | no-pdf-link | 4162 **
+ www.hanser-elibrary.com | | 444
+ www.hanser-elibrary.com | blocked-cookie | 444
+ www.hanspub.org | | 334
+ www.hanspub.org | no-pdf-link | 314
+ www.idunn.no | | 1736
+ www.idunn.no | link-loop | 596
+ www.idunn.no | success | 577
+ www.idunn.no | no-pdf-link | 539
+ www.igi-global.com | terminal-bad-status | 458
+ www.igi-global.com | | 458
+ www.ijcai.org | | 533
+ www.ijcai.org | success | 532
+ www.ijraset.com | success | 385
+ www.ijraset.com | | 385
+ www.inderscience.com | | 712
+ www.inderscience.com | no-pdf-link | 605 *
+ www.ingentaconnect.com | | 456
+ www.ingentaconnect.com | no-pdf-link | 413 *
+ www.internationaljournalssrg.org | | 305
+ www.internationaljournalssrg.org | no-pdf-link | 305 *
+ www.isca-speech.org | | 2392
+ www.isca-speech.org | no-pdf-link | 2391 **
+ www.journals.uchicago.edu | | 228
+ www.journals.uchicago.edu | blocked-cookie | 227
+ www.jstage.jst.go.jp | | 1492
+ www.jstage.jst.go.jp | success | 1185
+ www.jstage.jst.go.jp | no-pdf-link | 289
+ www.jstor.org | | 301
+ www.jurology.com | | 887
+ www.jurology.com | redirect-loop | 887
+ www.karger.com | | 318
+ www.liebertpub.com | | 507
+ www.liebertpub.com | blocked-cookie | 496
+ www.morressier.com | | 4781
+ www.morressier.com | no-pdf-link | 4655 **
+ www.ncl.ecu.edu | | 413
+ www.ncl.ecu.edu | success | 413
+ www.nomos-elibrary.de | | 526
+ www.nomos-elibrary.de | no-pdf-link | 391
+ www.oecd-ilibrary.org | no-pdf-link | 1170 **
+ www.oecd-ilibrary.org | | 1170
+ www.openagrar.de | no-pdf-link | 221
+ www.openagrar.de | | 221
+ www.osapublishing.org | | 900
+ www.osapublishing.org | link-loop | 615
+ www.osapublishing.org | no-pdf-link | 269
+ www.osti.gov | | 630
+ www.osti.gov | link-loop | 573
+ www.oxfordlawtrove.com | no-pdf-link | 476 *
+ www.oxfordlawtrove.com | | 476
+ www.pdcnet.org | | 298
+ www.pdcnet.org | terminal-bad-status | 262
+ www.pedocs.de | | 203
+ www.pnas.org | | 222
+ www.preprints.org | | 372
+ www.preprints.org | success | 366
+ www.repository.cam.ac.uk | | 801
+ www.repository.cam.ac.uk | success | 359
+ www.repository.cam.ac.uk | no-pdf-link | 239
+ www.research-collection.ethz.ch | | 276
+ www.research-collection.ethz.ch | terminal-bad-status | 274
+ www.revistas.usp.br | | 207
+ www.revistas.usp.br | success | 204
+ www.rina.org.uk | no-pdf-link | 1009 **
+ www.rina.org.uk | | 1009
+ www.schweizerbart.de | no-pdf-link | 202
+ www.schweizerbart.de | | 202
+ www.scielo.br | | 544
+ www.scielo.br | redirect-loop | 526
+ www.sciencedirect.com | | 3901
+ www.sciencedirect.com | no-pdf-link | 3127 **
+ www.sciencedirect.com | link-loop | 701
+ www.sciendo.com | | 384
+ www.sciendo.com | success | 363
+ www.sciengine.com | | 225
+ www.scirp.org | | 209
+ www.spandidos-publications.com | | 205
+ www.tandfonline.com | | 8925
+ www.tandfonline.com | blocked-cookie | 8099
+ www.tandfonline.com | terminal-bad-status | 477
+ www.tandfonline.com | redirect-loop | 322
+ www.taylorfrancis.com | | 6119
+ www.taylorfrancis.com | no-pdf-link | 3567
+ www.taylorfrancis.com | link-loop | 2169
+ www.taylorfrancis.com | terminal-bad-status | 353
+ www.thieme-connect.de | | 1047
+ www.thieme-connect.de | redirect-loop | 472
+ www.thieme-connect.de | spn2-error:job-failed | 343
+ www.tib.eu | | 206
+ www.trp.org.in | | 311
+ www.trp.org.in | success | 311
+ www.un-ilibrary.org | no-pdf-link | 597 *
+ www.un-ilibrary.org | | 597
+ www.vr-elibrary.de | | 775
+ www.vr-elibrary.de | blocked-cookie | 774
+ www.wjgnet.com | | 204
+ www.wjgnet.com | no-pdf-link | 204
+ www.worldscientific.com | | 974
+ www.worldscientific.com | blocked-cookie | 971
+ www.worldwidejournals.com | | 242
+ www.worldwidejournals.com | no-pdf-link | 203
+ www.wto-ilibrary.org | no-pdf-link | 295
+ www.wto-ilibrary.org | | 295
+ www.zora.uzh.ch | | 222
+ zenodo.org | | 49460
+ zenodo.org | no-pdf-link | 39721
+ zenodo.org | success | 8954
+ zenodo.org | wrong-mimetype | 562
+ | | 445919
+ | no-pdf-link | 168035
+ | success | 140875
+ | gateway-timeout | 31809
+ | blocked-cookie | 26431
+ | terminal-bad-status | 25625
+ | link-loop | 19006
+ | spn2-error:job-failed | 13962
+ | redirect-loop | 12512
+ | wrong-mimetype | 2302
+ | spn2-error | 1689
+ | too-many-redirects | 1203
+ | bad-redirect | 732
+ | cdx-error | 539
+ | not-found | 420
+ | spn2-error:no-status | 256
+ (419 rows)
+
+Get random subsets by terminal domain (substitute the domain of interest for `DOMAIN`):
+
+ \x auto
+ SELECT
+ ingest_request.link_source_id AS link_source_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%//DOMAIN/%'
+ ORDER BY random()
+ LIMIT 5;
+
+## acervus.unicamp.br
+
+Previously flagged as messy (2021-05_daily_improvements.md)
+
+## cas.columbia.edu
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-2ety-qm51
+base_url | https://doi.org/10.7916/d8-2ety-qm51
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-0zf6-d167
+base_url | https://doi.org/10.7916/d8-0zf6-d167
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-k6ha-sn43
+base_url | https://doi.org/10.7916/d8-k6ha-sn43
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-bj6t-eb07
+base_url | https://doi.org/10.7916/d8-bj6t-eb07
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-xjac-j502
+base_url | https://doi.org/10.7916/d8-xjac-j502
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+
+These are not public (login walls).
+
+DONE: '/login?TARGET=' as a login wall pattern
+
+## doi.ala.org.au
+
+Previously flagged as dataset repository; datacite metadata is wrong. (2021-05_daily_improvements.md)
+
+NOTE: look at ingesting datasets
+
+## www.isca-speech.org
+
+-[ RECORD 1 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2014-84
+base_url | https://doi.org/10.21437/interspeech.2014-84
+terminal_url | https://www.isca-speech.org/archive/interspeech_2014/li14b_interspeech.html
+-[ RECORD 2 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2004-319
+base_url | https://doi.org/10.21437/interspeech.2004-319
+terminal_url | https://www.isca-speech.org/archive/interspeech_2004/delcroix04_interspeech.html
+-[ RECORD 3 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-372
+base_url | https://doi.org/10.21437/interspeech.2006-372
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/lei06c_interspeech.html
+-[ RECORD 4 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2015-588
+base_url | https://doi.org/10.21437/interspeech.2015-588
+terminal_url | https://www.isca-speech.org/archive/interspeech_2015/polzehl15b_interspeech.html
+-[ RECORD 5 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-468
+base_url | https://doi.org/10.21437/interspeech.2006-468
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html
+
+Bespoke site. Added rule to sandcrawler.
+
+NOTE: re-ingest/recrawl all isca-speech.org no-pdf-link terminal URLs (fatcat-ingest?)
+
+## www.morressier.com
+
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0002858v
+base_url | https://doi.org/10.1115/1.0002858v
+terminal_url | https://www.morressier.com/article/development-new-single-highdensity-heatflux-gauges-unsteady-heat-transfer-measurements-rotating-transonic-turbine/60f162805d86378f03b49af5
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0003896v
+base_url | https://doi.org/10.1115/1.0003896v
+terminal_url | https://www.morressier.com/article/experimental-investigation-proton-exchange-membrane-fuel-cell-platinum-nafion-along-inplane-direction/60f16d555d86378f03b50038
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0004476v
+base_url | https://doi.org/10.1115/1.0004476v
+terminal_url | https://www.morressier.com/article/effect-air-release-agents-performance-results-fabric-lined-bushings/60f16d585d86378f03b502d5
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0001286v
+base_url | https://doi.org/10.1115/1.0001286v
+terminal_url | https://www.morressier.com/article/development-verification-modelling-practice-cfd-calculations-obtain-current-loads-fpso/60f15d3fe537565438d70ece
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0000315v
+base_url | https://doi.org/10.1115/1.0000315v
+terminal_url | https://www.morressier.com/article/fire-event-analysis-fire-frequency-estimation-japanese-nuclear-power-plant/60f15a6f5d86378f03b43874
+
+Many of these seem to be presentations, available as both video and slides. The PDFs seem broken, though.
+
+NOTE: add to list of interesting rich media to crawl/preserve (video+slides+data)
+
+## www.oecd-ilibrary.org
+
+Paywall (2021-05_daily_improvements.md)
+
+## www.rina.org.uk
+
+-[ RECORD 1 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.ws.2002.10
+base_url | https://doi.org/10.3940/rina.ws.2002.10
+terminal_url | https://www.rina.org.uk/showproducts.html?product=4116
+-[ RECORD 2 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.pass.2003.16
+base_url | https://doi.org/10.3940/rina.pass.2003.16
+terminal_url | https://www.rina.org.uk/showproducts.html?product=3566
+-[ RECORD 3 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin.2013.15
+base_url | https://doi.org/10.3940/rina.icsotin.2013.15
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8017
+-[ RECORD 4 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.wfa.2010.23
+base_url | https://doi.org/10.3940/rina.wfa.2010.23
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8177
+-[ RECORD 5 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin15.2015.01
+base_url | https://doi.org/10.3940/rina.icsotin15.2015.01
+terminal_url | https://www.rina.org.uk/showproducts.html?product=7883
+
+Site is broken in some way
+
+## www.sciencedirect.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.jhlste.2021.100332
+base_url | https://doi.org/10.1016/j.jhlste.2021.100332
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S1473837621000332
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.hazadv.2021.100006
+base_url | https://doi.org/10.1016/j.hazadv.2021.100006
+terminal_url | https://www.sciencedirect.com/science/article/pii/S2772416621000061/pdfft?md5=e51bfd495bb53073c7a379d25cb11a32&pid=1-s2.0-S2772416621000061-main.pdf
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-12-822844-9.00009-8
+base_url | https://doi.org/10.1016/b978-0-12-822844-9.00009-8
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780128228449000098
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.colcom.2021.100490
+base_url | https://doi.org/10.1016/j.colcom.2021.100490
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S2215038221001308
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-323-85245-6.00012-6
+base_url | https://doi.org/10.1016/b978-0-323-85245-6.00012-6
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780323852456000126
+
+These `no-pdf-link` ones seem to just be not OA, which is expected for much of the
+domain.
+
+## repository.dri.ie
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+---------------------------------------------
+ 10.7486/dri.t148v5941 | https://doi.org/10.7486/dri.t148v5941 | https://repository.dri.ie/catalog/t148v5941
+ 10.7486/dri.2z119c98f | https://doi.org/10.7486/dri.2z119c98f | https://repository.dri.ie/catalog/2z119c98f
+ 10.7486/dri.qf8621102 | https://doi.org/10.7486/dri.qf8621102 | https://repository.dri.ie/catalog/qf8621102
+ 10.7486/dri.js95m457t | https://doi.org/10.7486/dri.js95m457t | https://repository.dri.ie/catalog/js95m457t
+ 10.7486/dri.c534vb726 | https://doi.org/10.7486/dri.c534vb726 | https://repository.dri.ie/catalog/c534vb726
+
+"Digital repository of Ireland"
+
+Historical scanned content. Bespoke site. Fixed.
+
+NOTE: recrawl/retry this domain
+
+## www.frontiersin.org
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/978-2-88971-147-5
+base_url | https://doi.org/10.3389/978-2-88971-147-5
+terminal_url | https://www.frontiersin.org/research-topics/9081/neuroimaging-approaches-to-the-study-of-tinnitus-and-hyperacusis
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fnins.2021.722592
+base_url | https://doi.org/10.3389/fnins.2021.722592
+terminal_url | https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fcell.2021.683209
+base_url | https://doi.org/10.3389/fcell.2021.683209
+terminal_url | https://www.frontiersin.org/articles/10.3389/fcell.2021.683209/full
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fmicb.2021.692474
+base_url | https://doi.org/10.3389/fmicb.2021.692474
+terminal_url | https://www.frontiersin.org/articles/10.3389/fmicb.2021.692474/full
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fneur.2021.676527
+base_url | https://doi.org/10.3389/fneur.2021.676527
+terminal_url | https://www.frontiersin.org/articles/10.3389/fneur.2021.676527/full
+
+All the `/research-topics/` URLs are out of scope.
+
+NOTE: recrawl missing frontiersin.org articles for PDFs
+NOTE: recrawl missing frontiersin.org articles for XML (?)
+
+-------
+
+## direct.mit.edu
+
+Previously "not available" (2021-05_daily_improvements.md)
+
+## figshare.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15052236.v6
+base_url | https://doi.org/10.6084/m9.figshare.15052236.v6
+terminal_url | https://figshare.com/articles/software/RCL-tree_rar/15052236/6
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.14907846.v5
+base_url | https://doi.org/10.6084/m9.figshare.14907846.v5
+terminal_url | https://figshare.com/articles/book/Conservation_of_Limestone_Ecosystems_of_Malaysia_Part_I_Acknowledgements_Methodology_Overview_of_limestone_outcrops_in_Malaysia_References_Detailed_information_on_limestone_outcrops_of_the_states_Johor_Negeri_Sembilan_Terengganu_Selangor_Pe/14907846/5
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15157614.v1
+base_url | https://doi.org/10.6084/m9.figshare.15157614.v1
+terminal_url | https://figshare.com/articles/software/code_for_NN-A72265C/15157614/1
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15172926.v1
+base_url | https://doi.org/10.6084/m9.figshare.15172926.v1
+terminal_url | https://figshare.com/articles/preprint/History_of_the_internet/15172926/1
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.16532574.v1
+base_url | https://doi.org/10.6084/m9.figshare.16532574.v1
+terminal_url | https://figshare.com/articles/media/Helen_McConnell_How_many_trees_do_you_think_you_have_planted_/16532574/1
+
+NOTE: can determine the content type (software, book, preprint, media, etc) from the redirect URL path, I guess. This is helpful for ingest!
+Could also potentially correct fatcat release_type using this info.
+
+We seem to be getting the ones we can (eg, papers) just fine
+
+## hkvalidate.perfdrive.com
+
+Should be skipping/bailing on this domain, but we are not for some reason.
+
+-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05cc
+base_url | https://doi.org/10.3847/1538-4357/ac05cc
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=1716a049-aeaa-4a89-8f82-bd733adaa2e7&ssb=43981203877&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05cc&ssi=0774dd12-8427-4e27-a2ac-759c8cc2ec0e&ssk=support@shieldsquare.com&ssm=07370915269044035109047683305266&ssn=e69c743cc3d66619f960f924b562160d637e8d7f1b0f-d3bb-44d4-b075ed&sso=75a8bd85-4a097fb40f99bfb9c97b0a4ca0a38fd6d79513a466e82cc7&ssp=92054607321628531005162856888275586&ssq=33809984098158010864140981653938424553916&ssr=MjA3LjI0MS4yMjUuMTM5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 2 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac0429
+base_url | https://doi.org/10.3847/1538-4357/ac0429
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=12bca70d-0af4-4241-9c9b-384befd96a88&ssb=92559232428&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac0429&ssi=cff72ab0-8427-4acd-a0e7-db1b04cf7ce7&ssk=support@shieldsquare.com&ssm=27895673282814430105287068829605&ssn=9af36a8e10efd239c9367a2f31dde500f7455c4d5f45-bf11-4b99-ad29ea&sso=26bd22d2-b23e1bd9558f2fd9ed0768ef1acecb24715d1d463328a229&ssp=16502500621628222613162823304820671&ssq=11469693950387070477339503456478590533604&ssr=MjA3LjI0MS4yMjUuMTYw&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 3 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1149/1945-7111/ac1a85
+base_url | https://doi.org/10.1149/1945-7111/ac1a85
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=b0fef51a-0f44-476e-b951-3341bde6aa67&ssb=84929220393&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1149%2F1945-7111%2Fac1a85&ssi=48c05577-8427-4421-acd3-735ca29a46e6&ssk=support@shieldsquare.com&ssm=81129482524077974103852241068134&ssn=cf6c261d2b20d518b2ebe57e40ffaec9ab4cd1955dcb-7877-4f5b-bc3b1e&sso=1d196cae-6850f1ed8143e460f2bfbb61a8ae15cfe6b53d3bcdc528ca&ssp=99289867941628195224162819241830491&ssq=16897595632212421273956322948987630170313&ssr=MjA3LjI0MS4yMjUuMjM2&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 4 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.35848/1882-0786/ac1b0d
+base_url | https://doi.org/10.35848/1882-0786/ac1b0d
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=6debdd23-c46b-4b40-b73c-d5540f04454e&ssb=95627212532&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.35848%2F1882-0786%2Fac1b0d&ssi=78b34ff9-8427-4d07-a0db-78a3aa2c7332&ssk=support@shieldsquare.com&ssm=54055111549093989106852695053789&ssn=cb51949e15a02cb99a8d0b57c4d06327b72e8d5c87a8-d006-4ffa-939ffb&sso=1b7fd62d-8107746fe28fca252fd45ffa403937e272bf75b452b68d4a&ssp=77377533171628212164162820021422494&ssq=02679025218797637682252187852000657274192&ssr=MjA3LjI0MS4yMzMuMTIx&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 5 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05ba
+base_url | https://doi.org/10.3847/1538-4357/ac05ba
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=f127eb3d-6a05-459d-97f2-499715c04b13&ssb=06802230353&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05ba&ssi=8d087719-8427-4046-91fb-5e96af401560&ssk=support@shieldsquare.com&ssm=21056861072205974105064006574997&ssn=d05a73cff6d9af57acd6e2c366e716176752e1164d39-b9a7-408c-837d11&sso=d3f38d1e-a562a19195042d7e471a5e4fab03b6ca16ff1711c7c61804&ssp=68781137401628744693162877909483738&ssq=79454859841502433261398415426689546750534&ssr=MjA3LjI0MS4yMzIuMTg5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+
+Was failing to check against blocklist again at the end of attempts.
+
+Could retry all these to update status, but probably not worth it.
+
+## jov.arvojournals.org
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+-------------------------------------------------------------
+ 10.1167/jov.21.9.1933 | https://doi.org/10.1167/jov.21.9.1933 | https://jov.arvojournals.org/article.aspx?articleid=2777021
+ 10.1167/jov.21.9.2910 | https://doi.org/10.1167/jov.21.9.2910 | https://jov.arvojournals.org/article.aspx?articleid=2777561
+ 10.1167/jov.21.9.1895 | https://doi.org/10.1167/jov.21.9.1895 | https://jov.arvojournals.org/article.aspx?articleid=2777057
+ 10.1167/jov.21.9.2662 | https://doi.org/10.1167/jov.21.9.2662 | https://jov.arvojournals.org/article.aspx?articleid=2777793
+ 10.1167/jov.21.9.2246 | https://doi.org/10.1167/jov.21.9.2246 | https://jov.arvojournals.org/article.aspx?articleid=2777441
+
+These seem to just not be published/available yet.
+
+But they also use watermark.silverchair.com
+
+NOTE: re-crawl (force-retry?) all non-recent papers with fatcat-ingest
+NOTE: for watermark.silverchair.com terminal bad-status, re-crawl from initial URL (base_url) using heritrix
+
+## kiss.kstudy.com
+
+Previously unable to download (2021-05_daily_improvements.md)
+
+## open.library.ubc.ca
+
+ link_source_id | base_url | terminal_url
+--------------------+------------------------------------+----------------------------------------------------------------------------------
+ 10.14288/1.0400664 | https://doi.org/10.14288/1.0400664 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400664
+ 10.14288/1.0401189 | https://doi.org/10.14288/1.0401189 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401189
+ 10.14288/1.0401487 | https://doi.org/10.14288/1.0401487 | https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+ 10.14288/1.0400994 | https://doi.org/10.14288/1.0400994 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400994
+ 10.14288/1.0401312 | https://doi.org/10.14288/1.0401312 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401312
+
+Historical newspapers, out of scope?
+
+Video content:
+https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+
+Another video: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+
+NOTE: add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+NOTE: handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+
+
+## panor.ru
+
+ link_source_id | base_url | terminal_url
+-------------------------+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 10.33920/med-14-2108-06 | https://doi.org/10.33920/med-14-2108-06 | https://panor.ru/articles/otsenka-dinamiki-pokazateley-morfofunktsionalnykh-kharakteristik-kozhi-upatsientov-s-spr-pod-vliyaniem-kompleksnoy-fototerapii/66351.html
+ 10.33920/nik-02-2105-01 | https://doi.org/10.33920/nik-02-2105-01 | https://panor.ru/articles/innovatsionnost-obrazovatelnykh-tekhnologiy-kak-istoricheski-oposredovannyy-fenomen/65995.html
+ 10.33920/pro-1-2101-10 | https://doi.org/10.33920/pro-1-2101-10 | https://panor.ru/articles/obespechenie-bezopasnosti-na-promyshlennykh-predpriyatiyakh-s-pomoshchyu-sredstv-individualnoy-zashchity/66299.html
+ 10.33920/sel-4-2008-04 | https://doi.org/10.33920/sel-4-2008-04 | https://panor.ru/articles/osobennosti-regulirovaniya-zemelnykh-otnosheniy-na-prigranichnykh-territoriyakh-rossiyskoy-federatsii/66541.html
+ 10.33920/pro-2-2104-03 | https://doi.org/10.33920/pro-2-2104-03 | https://panor.ru/articles/organizatsiya-samorazvivayushchegosya-proizvodstva-v-realnykh-usloviyakh/65054.html
+
+"The full version of the article is available only to subscribers of the journal"
+
+Paywall
+
+## peerj.com
+
+Previously: this is HTML of reviews (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope
+
+## publons.com
+
+Previously: this is HTML (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope (length of works)
+
+## stm.bookpi.org
+
+ link_source_id | base_url | terminal_url
+-----------------------------+---------------------------------------------+----------------------------------------------------
+ 10.9734/bpi/nfmmr/v7/11547d | https://doi.org/10.9734/bpi/nfmmr/v7/11547d | https://stm.bookpi.org/NFMMR-V7/article/view/3231
+ 10.9734/bpi/ecafs/v1/9773d | https://doi.org/10.9734/bpi/ecafs/v1/9773d | https://stm.bookpi.org/ECAFS-V1/article/view/3096
+ 10.9734/bpi/mpebm/v5/3391f | https://doi.org/10.9734/bpi/mpebm/v5/3391f | https://stm.bookpi.org/MPEBM-V5/article/view/3330
+ 10.9734/bpi/castr/v13/3282f | https://doi.org/10.9734/bpi/castr/v13/3282f | https://stm.bookpi.org/CASTR-V13/article/view/2810
+ 10.9734/bpi/hmms/v13 | https://doi.org/10.9734/bpi/hmms/v13 | https://stm.bookpi.org/HMMS-V13/issue/view/274
+
+These are... just abstracts of articles within a book? Weird. Maybe sketchy? DOIs via Crossref
+
+## www.cabi.org
+
+ link_source_id | base_url | terminal_url
+--------------------------+------------------------------------------+----------------------------------------------------
+ 10.1079/dfb/20133414742 | https://doi.org/10.1079/dfb/20133414742 | https://www.cabi.org/cabreviews/review/20133414742
+ 10.1079/dmpd/20056500471 | https://doi.org/10.1079/dmpd/20056500471 | https://www.cabi.org/cabreviews/review/20056500471
+ 10.1079/dmpp/20056600544 | https://doi.org/10.1079/dmpp/20056600544 | https://www.cabi.org/cabreviews/review/20056600544
+ 10.1079/dmpd/20056500117 | https://doi.org/10.1079/dmpd/20056500117 | https://www.cabi.org/cabreviews/review/20056500117
+ 10.1079/dmpp20056600337 | https://doi.org/10.1079/dmpp20056600337 | https://www.cabi.org/cabreviews/review/20056600337
+
+Reviews? but just abstracts?
+
+## www.cureus.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17547
+base_url | https://doi.org/10.7759/cureus.17547
+terminal_url | https://www.cureus.com/articles/69542-tramadol-induced-jerks
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16867
+base_url | https://doi.org/10.7759/cureus.16867
+terminal_url | https://www.cureus.com/articles/66793-advanced-squamous-cell-carcinoma-of-gall-bladder-masquerading-as-liver-abscess-with-review-of-literature-review-on-advanced-biliary-tract-cancer
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17425
+base_url | https://doi.org/10.7759/cureus.17425
+terminal_url | https://www.cureus.com/articles/67438-attitudes-and-knowledge-of-medical-students-towards-healthcare-for-lesbian-gay-bisexual-and-transgender-seniors-impact-of-a-case-based-discussion-with-facilitators-from-the-community
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17313
+base_url | https://doi.org/10.7759/cureus.17313
+terminal_url | https://www.cureus.com/articles/67258-utilizing-google-trends-to-track-online-interest-in-elective-hand-surgery-during-the-covid-19-pandemic
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16943
+base_url | https://doi.org/10.7759/cureus.16943
+terminal_url | https://www.cureus.com/articles/19364-small-bowel-obstruction-a-rare-presentation-of-the-inferior-pancreaticoduodenal-artery-pseudoaneurysm-bleed
+
+Ugh, stupid "email to get PDF". but ingest seems to work anyways?
+
+NOTE: re-crawl/re-ingest all (eg, fatcat-ingest or similar)
+
+## www.e-manuscripta.ch
+
+ link_source_id | base_url | terminal_url
+------------------------------+----------------------------------------------+-------------------------------------------------------------------
+ 10.7891/e-manuscripta-114031 | https://doi.org/10.7891/e-manuscripta-114031 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114031
+ 10.7891/e-manuscripta-112064 | https://doi.org/10.7891/e-manuscripta-112064 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112064
+ 10.7891/e-manuscripta-112176 | https://doi.org/10.7891/e-manuscripta-112176 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176
+ 10.7891/e-manuscripta-115200 | https://doi.org/10.7891/e-manuscripta-115200 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-115200
+ 10.7891/e-manuscripta-114008 | https://doi.org/10.7891/e-manuscripta-114008 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114008
+
+Historical docs, single pages, but do have full PDF downloads.
+
+NOTE: re-ingest
+
+## www.inderscience.com
+
+Previously: paywall (2021-05_daily_improvements.md)
+
+## www.un-ilibrary.org
+
+ link_source_id | base_url | terminal_url
+----------------------------+--------------------------------------------+-------------------------------------------------------------
+ 10.18356/9789210550307 | https://doi.org/10.18356/9789210550307 | https://www.un-ilibrary.org/content/books/9789210550307
+ 10.18356/9789210586719c011 | https://doi.org/10.18356/9789210586719c011 | https://www.un-ilibrary.org/content/books/9789210586719c011
+ 10.18356/9789210058575c014 | https://doi.org/10.18356/9789210058575c014 | https://www.un-ilibrary.org/content/books/9789210058575c014
+ 10.18356/9789210550307c020 | https://doi.org/10.18356/9789210550307c020 | https://www.un-ilibrary.org/content/books/9789210550307c020
+ 10.18356/9789213631423c005 | https://doi.org/10.18356/9789213631423c005 | https://www.un-ilibrary.org/content/books/9789213631423c005
+
+Books and chapters. Doesn't seem to have actual download ability?
+
+# Re-Ingest / Re-Crawl
+
+Using fatcat-ingest helper tool.
+
+- www.isca-speech.org doi_prefix:10.21437
+ doi:* doi_prefix:10.21437 in_ia:false
+ 9,233
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.21437' > /srv/fatcat/tasks/2021-09-03_ingest_isca.json
+ => Counter({'ingest_request': 9221, 'elasticsearch_release': 9221, 'estimate': 9221})
+- repository.dri.ie doi_prefix:10.7486
+ doi:* in_ia:false doi_prefix:10.7486
+ 56,532
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.7486' > /srv/fatcat/tasks/2021-09-03_ingest_dri.json
+ => Counter({'ingest_request': 56532, 'elasticsearch_release': 56532, 'estimate': 56532})
+- *.arvojournals.org doi_prefix:10.1167 (force recrawl if no-pdf-link)
+ 25,598
+ many are meeting abstracts
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.1167 > /srv/fatcat/tasks/2021-09-03_ingest_arvo.json
+ => Counter({'ingest_request': 25598, 'elasticsearch_release': 25598, 'estimate': 25598})
+- www.cureus.com doi_prefix:10.7759
+ 1,537
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.7759 > /srv/fatcat/tasks/2021-09-03_ingest_cureus.json
+ => Counter({'ingest_request': 1535, 'elasticsearch_release': 1535, 'estimate': 1535})
+- www.e-manuscripta.ch doi_prefix:10.7891 10.7891/e-manuscripta
+ 110,945
+ TODO: all are marked 'unpublished', but that is actually probably right?
+- www.frontiersin.org doi_prefix:10.3389 (both PDF and XML!)
+ doi:* in_ia:false doi_prefix:10.3389
+ 212,370
+ doi:10.3389/conf.* => most seem to be just abstracts? how many like this?
+ container_id:kecnf6vtpngn7j2avgfpdyw5ym => "topics" (2.2k)
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 191k
+ but many might be components? this is actually kind of a mess
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 19.2k
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' | rg -v 10.3389/conf > /srv/fatcat/tasks/2021-09-03_frontiers.json
+
+# Remaining Tasks / Domains (TODO)
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md
new file mode 100644
index 0000000..d36f427
--- /dev/null
+++ b/notes/ingest/2021-09-03_patch_crawl.md
@@ -0,0 +1,678 @@
+
+Going to run a combined crawl for `no-capture`, `no-pdf-link` and similar URL
+statuses.
+
+As a reminder, significant refactor of PDF URL extraction happened around
+Oct/Nov 2020, so things not re-ingested since then should be retried.
+
+1. first bulk re-process `no-pdf-link` statuses from past OAI-PMH and OA DOI crawls
+2. then heritrix crawl of old URLs from all sources (see status codes below)
+3. bulk ingest specific sources and statuses (see below)
+
+Status codes to crawl, potentially split into separate batches:
+
+ no-capture
+ IA errors
+ cdx-error
+ wayback-error
+ wayback-content-error
+ petabox-error
+ spn2-cdx-lookup-failure
+ gateway-timeout
+
+Then, bulk ingest from these sources matching the above patterns, in this order:
+
+- OA DOI (fatcat-ingest or fatcat-changelog source; will result in import)
+- unpaywall (will result in import)
+- OAI-PMH
+- MAG
+
+Current combined domain skip list (SQL filter syntax), for which we don't want
+to bother retrying:
+
+ '%journals.sagepub.com%'
+ '%pubs.acs.org%'
+ '%ahajournals.org%'
+ '%www.journal.csj.jp%'
+ '%aip.scitation.org%'
+ '%academic.oup.com%'
+ '%tandfonline.com%'
+ '%://orcid.org/%'
+ '%://doaj.org/%'
+ '%://archive.org/%'
+ '%://web.archive.org/%'
+ '%://www.archive.org/%'
+
+## DOI Ingest Status (2021-09-08)
+
+Recently did some analysis of OAI-PMH overall status, so can re-do comparisons
+there easily. What about overall DOI ingest? Would like counts so we can
+compare before/after.
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------+----------
+ no-pdf-link | 10516478
+ success | 5690862
+ redirect-loop | 1827192
+ no-capture | 1215179
+ terminal-bad-status | 650104
+ link-loop | 610251
+ blocked-cookie | 353681
+ gateway-timeout | 341319
+ too-many-redirects | 307895
+ forbidden | 306710
+ spn2-cdx-lookup-failure | 282955
+ not-found | 273667
+ cdx-error | 269082
+ skip-url-blocklist | 265689
+ spn2-error | 87759
+ wrong-mimetype | 68993
+ spn2-error:too-many-redirects | 58064
+ wayback-error | 54152
+ spn2-wayback-error | 51752
+ remote-server-error | 45683
+ (20 rows)
+
+## `no-pdf-link` re-try bulk ingest
+
+Specifically for past OAI-PMH and OA DOI crawls.
+
+What are top terminal domains that would be retried? So that we can filter out
+large ones we don't want to bother retrying.
+
+ SELECT domain, COUNT(domain)
+ FROM (
+ SELECT
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 40;
+
+ domain | count
+ ---------------------------------------+--------
+ ssl.fao.org | 862277
+ www.e-periodica.ch | 828110
+ zenodo.org | 686701
+ plutof.ut.ee | 685440
+ www.gbif.org | 669727
+ dlc.library.columbia.edu | 536018
+ figshare.com | 383181
+ juser.fz-juelich.de | 351519
+ statisticaldatasets.data-planet.com | 320415
+ espace.library.uq.edu.au | 310767
+ invenio.nusl.cz | 309731
+ doi.pangaea.de | 306311
+ igi.indrastra.com | 297872
+ bib-pubdb1.desy.de | 273565
+ t2r2.star.titech.ac.jp | 271907
+ digi.ub.uni-heidelberg.de | 265519
+ www.sciencedirect.com | 263847
+ publikationen.bibliothek.kit.edu | 229960
+ www.plate-archive.org | 209231
+ www.degruyter.com | 189776
+ spectradspace.lib.imperial.ac.uk:8443 | 187086
+ hal.archives-ouvertes.fr | 185513
+ open.library.ubc.ca | 172821
+ lup.lub.lu.se | 170063
+ books.openedition.org | 169501
+ orbi.uliege.be | 161443
+ freidok.uni-freiburg.de | 150310
+ library.wur.nl | 124318
+ digital.library.pitt.edu | 116406
+ www.research.manchester.ac.uk | 115869
+ www.bibliotecavirtualdeandalucia.es | 114527
+ repository.tue.nl | 112157
+ www.google.com | 111569
+ easy.dans.knaw.nl | 109608
+ springernature.figshare.com | 108597
+ nbn-resolving.org | 107544
+ scholarbank.nus.edu.sg | 107299
+ bibliotecavirtualdefensa.es | 105501
+ biblio.ugent.be | 100854
+ ruj.uj.edu.pl | 99500
+ (40 rows)
+
+For a number of these domains, we do not expect any PDFs to be found, but are
+going to re-ingest anyways so they get marked as 'blocked-*' in result table:
+
+- ssl.fao.org
+- plutof.ut.ee
+- www.gbif.org
+
+But some we are just going to skip anyways, because there *could* be PDFs, but
+probably *aren't*:
+
+- zenodo.org
+- t2r2.star.titech.ac.jp
+- www.google.com
+- figshare.com
+- springernature.figshare.com
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json';
+ => COPY 18040676
+
+Transform and start ingest:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json
+ => 18.0M 0:06:45 [44.5k/s]
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ => DONE
+
+## Progress Check
+
+OAI-PMH query:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 13258356
+ no-pdf-link | 8685519
+ no-capture | 4765663
+ redirect-loop | 1557731
+ terminal-bad-status | 803373
+ link-loop | 453999
+ wrong-mimetype | 440230
+ null-body | 71457
+ cdx-error | 18426
+ | 15275
+ petabox-error | 13408
+ wayback-error | 11845
+ blocked-cookie | 11580
+ skip-url-blocklist | 7761
+ wayback-content-error | 383
+ spn2-cdx-lookup-failure | 362
+ gateway-timeout | 320
+ body-too-large | 207
+ spn2-error:job-failed | 191
+ redirects-exceeded | 120
+ (20 rows)
+
+OAI-PMH compared to a couple weeks ago:
+
+ 13258356-12872279 = +386,077 success
+ 8685519-9329602 = -644,083 no-pdf-link
+ 4765663-4696362 = +69,301 no-capture
+ 803373-660418 = +142,955 terminal-bad-status
+
+OA DOI ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------+---------
+ no-pdf-link | 6693547
+ success | 5979016
+ skip-url-blocklist | 3080986
+ no-capture | 1876914
+ redirect-loop | 1872817
+ terminal-bad-status | 656674
+ link-loop | 624290
+ blocked-cookie | 448001
+ gateway-timeout | 351896
+ too-many-redirects | 307895
+ forbidden | 306710
+ spn2-cdx-lookup-failure | 301312
+ cdx-error | 279766
+ not-found | 273667
+ wrong-mimetype | 83289
+ spn2-error | 76806
+ spn2-error:too-many-redirects | 58064
+ wayback-error | 54278
+ spn2-wayback-error | 51768
+ remote-server-error | 45683
+ (20 rows)
+
+OA DOI changes:
+
+ 5979016-5690862 = +288,154 success
+ 6693547-10516478 = -3,822,931 no-pdf-link (still many!)
+ 1876914-1215179 = +661,735 no-capture
+ 3080986-265689 = +2,815,297 skip-url-blocklist
+
+Overall about 670k new 'success' (OAI-PMH plus OA DOI), pretty good. About 730k
+new no-capture for crawling.
+
+## Seedlist Dumps
+
+Note that this is just seedlists, not full ingest requests.
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ ) TO '/srv/sandcrawler/tasks/patch_2021-09-16_terminal_seedlist.txt';
+ => 6,354,365
+
+Then run the actual patch crawl!
+
+## Ingest Requests for Bulk Retry (2022-01-06)
+
+Crawl has just about completed, so running another round of bulk ingest
+requests, slightly updated to allow `https://doi.org/10*` in terminal URL:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.updated <= '2022-01-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json';
+ => 4,488,193
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json
+ => DONE
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => TIMEDOUT
+ => (probably due to re-assignment)
+ => DONE
+
+## Stats Again (just OAI-PMH)
+
+OAI-PMH query:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+On 2022-02-08:
+
+ status | count
+ -----------------------+----------
+ success | 13505143
+ no-pdf-link | 8741007
+ no-capture | 4429986
+ redirect-loop | 1566611
+ terminal-bad-status | 816162
+ link-loop | 459006
+ wrong-mimetype | 448983
+ null-body | 71871
+ cdx-error | 19055
+ | 15275
+ petabox-error | 11713
+ blocked-cookie | 11664
+ wayback-error | 8745
+ skip-url-blocklist | 7828
+ max-hops-exceeded | 2031
+ wayback-content-error | 338
+ body-too-large | 280
+ spn2-error:job-failed | 191
+ bad-redirect | 134
+ redirects-exceeded | 120
+ (20 rows)
+
+
+On 2022-02-28, after bulk ingest completed:
+
+ status | count
+ -----------------------+----------
+ success | 14668123
+ no-pdf-link | 8822460
+ no-capture | 2987565
+ redirect-loop | 1629015
+ terminal-bad-status | 917851
+ wrong-mimetype | 466512
+ link-loop | 460941
+ null-body | 71457
+ cdx-error | 19636
+ petabox-error | 16198
+ | 15275
+ blocked-cookie | 11885
+ wayback-error | 8779
+ skip-url-blocklist | 7838
+ empty-blob | 5906
+ max-hops-exceeded | 5563
+ wayback-content-error | 355
+ body-too-large | 329
+ spn2-error:job-failed | 191
+ bad-redirect | 137
+ (20 rows)
+
+
+Comparing to a couple months ago:
+
+ 14668123-13258356 = +1,409,767 success
+ 8822460-8685519 = + 136,941 no-pdf-link
+ 2987565-4765663 = -1,778,098 no-capture
+ 917851-803373 = + 114,478 terminal-bad-status
+
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md
new file mode 100644
index 0000000..786c3b2
--- /dev/null
+++ b/notes/ingest/2021-12-13_datasets.md
@@ -0,0 +1,504 @@
+
+First round of production dataset ingest. Aiming to get one or two small
+repositories entirely covered, and a few thousand datasets from all supported
+platforms.
+
+Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up
+to a TByte of content locally (on spinning disk). For successful output, will
+run through fatcat import; for a subset of unsuccessful, will start a small
+heritrix crawl.
+
+
+## Ingest Generation
+
+Summary:
+
+ wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json
+ 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+ 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+All the below ingest requests were combined into a single large file:
+
+ cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz
+ # 24.7k 0:00:00 [91.9k/s]
+
+### Figshare
+
+- sample 10k datasets (not other types)
+- want only "versioned" DOIs; use regex on DOI to ensure
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \
+ | rg '10\.6084/m9\.figshare\.\d+.v\d+' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000})
+
+### Zenodo
+
+- has DOIs (of course)
+- want only "versioned" DOIs? how to skip?
+- sample 10k
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \
+ | rg '10\.5281/zenodo' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+### Goettingen Research Online
+
+- <https://data.goettingen-research-online.de/>
+- Dataverse instance, not harvard-hosted
+- ~1,400 datasets, ~10,500 files
+- has DOIs
+- `doi_prefix:10.25625`, then filter to only one slash
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \
+ | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \
+ | shuf \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s]
+
+### Harvard Dataverse
+
+- main harvard dataverse instance, many "sub-dataverses"
+- ~137,000 datasets, ~1,400,000 files
+- 10k sample
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \
+ | rg '10\.7910/dvn/[a-z0-9]{6}' \
+ | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s]
+
+Note that this was fewer than expected, but moving on anyways.
+
+### archive.org
+
+A couple hand-filtered items.
+
+"CAT" dataset
+- item: <https://archive.org/details/CAT_DATASET>
+- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui`
+
+"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing"
+- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62
+- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper)
+
+
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/CAT_DATASET",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "36vy7s5gtba67fmyxlmijpsaui",
+ "work_ident": "ycqtbhnfmzamheq2amztiwbsri"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "36vy7s5gtba67fmyxlmijpsaui"
+ }
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu",
+ "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu"
+ }
+
+ # paste and then Ctrl-D:
+ cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+
+
+## Ingest Command
+
+On `wbgrp-svc263`.
+
+In the current version of the tool, `skip_cleanup_local_files=True` by default,
+so files will stick around.
+
+Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output.
+
+
+ # first a small sample
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | head -n5 \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json
+
+ # ok, run the whole batch through
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json
+
+Got an error:
+
+ internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`?
+
+Did a hot patch to try to have the uploads happen under a session, with config from ENV, but it didn't work:
+
+ AttributeError: 'ArchiveSession' object has no attribute 'upload'
+
+Going to hack with config in homedir for now.
+
+Extract URLs for crawling:
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt
+
+### Exceptions Encountered
+
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process
+ internetarchive.upload
+ [...]
+ ConnectionResetError: [Errno 104] Connection reset by peer
+ urllib3.exceptions.ProtocolError
+ requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5')
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process
+ r.raise_for_status()
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status
+ raise HTTPError(http_error_msg, response=self)
+ requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201
+
+Downloads sometimes just slowly time out, e.g. after a day or more.
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path
+ mimetype = magic.Magic(mime=True).from_file(path)
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 111, in from_file
+ with _real_open(filename):
+ FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz'
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request
+ obj_latest = obj["data"]["latestVersion"]
+ KeyError: 'latestVersion'
+
+Fixed the above, trying again:
+
+ git log | head -n1
+ # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c
+
+ Wed Dec 15 21:57:42 UTC 2021
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json
+
+Zenodo seems really slow, let's try filtering those out:
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json
+ # 3.76k 15:12:53 [68.7m/s]
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json
+
+## Fatcat Import
+
+ wc -l ingest_dataset_combined_results*.json
+ 126 ingest_dataset_combined_results2.json
+ 153 ingest_dataset_combined_results3.json
+ 275 ingest_dataset_combined_results4.json
+ 3762 ingest_dataset_combined_results5.json
+ 7736 ingest_dataset_combined_results6.json
+ 182 ingest_dataset_combined_results.json
+ 5 ingest_dataset_combined_results.ramp.json
+ 12239 total
+
+ cat ingest_dataset_combined_results*.json \
+ | rg '^\{' \
+ | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \
+ | sort \
+ | uniq --check-chars 26 \
+ | cut -f2 \
+ | rg -v '\\\\' \
+ | pv -l \
+ > uniq_ingest_dataset_combined_results.json
+ # 9.48k 0:00:06 [1.54k/s]
+
+ cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr
+ 7941 no-capture
+ 374 platform-404
+ 369 terminal-bad-status
+ 348 success-file
+ 172 success
+ 79 platform-scope
+ 77 error-platform-download
+ 47 empty-manifest
+ 27 platform-restricted
+ 20 too-many-files
+ 12 redirect-loop
+ 6 error-archiveorg-upload
+ 3 too-large-size
+ 3 mismatch
+ 1 no-platform-match
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success") | .' -c \
+ > uniq_ingest_dataset_combined_results.success.json
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success-file") | .' -c \
+ > uniq_ingest_dataset_combined_results.success-file.json
+
+On fatcat QA instance:
+
+ git log | head -n1
+ # commit cca680e2cc4768a4d45e199f6256a433b25b4075
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+Need to update fatcat file worker to support single-file filesets... was that the plan?
+
+ head /tmp/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0})
+
+Trying again 2022-03-23:
+
+ git log | head -n1
+ # commit 134cb050988be2c545af89e0a67c4998307bb819
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0})
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0})
+
+Fixed a small logic error in insert path.
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0})
+
+archive.org datasets are *not* getting uploaded with the correct path: path
+directory prefixes are getting clobbered.
+
+## Summary
+
+As follow-up, it may be worth doing another manual round of ingest requests.
+After that, would be good to fill in "glue" code so that this can be done with
+kafka workers, and do re-tries/dumps using sandcrawler SQL database. Then can
+start scaling up more ingest, using ingest tool, "bulk mode" processing,
+heritrix crawls from `no-capture` dumps, etc, similar to bulk file ingest
+process.
+
+For scaling, let's do a "full" ingest request generation of all datasets, and
+crawl the base URL with heritrix, in fast/direct mode. Expect this to be tens
+of millions of mostly DOIs (doi.org URLs), should crawl quickly.
+
+Then, do bulk downloading with ingest worker, perhaps on misc-vm or aitio,
+uploading large datasets to archive.org, but not doing SPN web requests. Feed
+the resulting huge file seedlist into a heritrix crawl to download web files.
+
+Will need to add support for more specific platforms.
+
+
+### Huge Bulk Ingest Prep
+
+On prod instance:
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz
+ # Expecting 11264787 release objects in search queries
+ # TIMEOUT ERROR
+ # 6.07M 19:13:02 [87.7 /s] (partial)
+
+As follow-up, should do a full batch (not partial). For now search index is too
+unreliable (read timeouts).
+
+ zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \
+ | jq .base_url -r \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > ingest_dataset_bulk.2022-01-05.partial.schedule
+
+## Retries (2022-01-12)
+
+This is after having done a bunch of crawling.
+
+ cat ingest_dataset_combined_results6.json \
+ | rg '"no-capture"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | jq .request -c \
+ | pv -l \
+ > ingest_dataset_retry.json
+ => 6.51k 0:00:01 [3.55k/s]
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json
+
+## Retries (2022-02)
+
+Finally got things to complete end to end for this batch!
+
+ cat ingest_dataset_retry_results5.json | jq .status -r | sort | uniq -c | sort -nr
+ 3220 terminal-bad-status
+ 2120 no-capture
+ 380 empty-manifest
+ 264 success-file
+ 251 success
+ 126 success-existing
+ 39 mismatch
+ 28 error-platform-download
+ 24 too-many-files
+ 20 platform-scope
+ 13 platform-restricted
+ 13 mismatch-size
+ 6 too-large-size
+ 3 transfer-encoding-error
+ 2 no-platform-match
+ 2 error-archiveorg-upload
+ 1 redirect-loop
+ 1 empty-blob
+
+Some more URLs to crawl:
+
+ cat ingest_dataset_retry_results5.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.base_url.txt
+ # 1.00
+ # just a single DOI that failed to crawl, for whatever reason
+
+ cat ingest_dataset_retry_results5.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt
+
+These are ready to crawl, in the existing dataset crawl.
+
+ cat /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule
+
+## Running Uploads Again
+
+Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a
+big bummer! Will need to download many of these over again.
+
+ # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316
+ # skip_cleanup_local_files=True is still default
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json
+
+ # filter out zenodo, very slow:
+ # rg -v 10.5281 \
diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md
new file mode 100644
index 0000000..941519f
--- /dev/null
+++ b/notes/ingest/2022-01-06_patch_crawl.md
@@ -0,0 +1,398 @@
+
+Starting another paper fulltext patch crawl, targeting recent OA content which
+has failed to ingest, and platforms (arxiv, etc).
+
+Specifically:
+
+- "daily" changelog ingest requests from all time, which failed with various status codes
+- pdf no-capture
+- SPN errors
+- terminal-bad-status with 5xx, 429
+- gateway-timeout
+- html no-capture
+- html-resource-no-capture
+
+Most of these are dumped in a single complex query (below).
+
+TODO: html-resource-no-capture (from error message? or do SPN requests separately?)
+
+
+## Initial 'no-capture' Seedlist
+
+Dump terminal URLs (will do ingest requests later, using similar command):
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt';
+ => COPY 6389683
+
+TODO: filter out archive.org/www.archive.org
+
+ cat patch_terminal_url.2022-01-12.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-01-12.uniq.txt
+ => 5.73M 0:00:47 [ 120k/s]
+
+ # note: tweaked and re-ran the above after inspecting this output
+ cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 799045 doi.org
+ 317557 linkinghub.elsevier.com
+ 211091 arxiv.org
+ 204334 iopscience.iop.org
+ 139758 dialnet.unirioja.es
+ 130331 www.scielo.br
+ 124626 www.persee.fr
+ 85764 digitalrepository.unm.edu
+ 83913 www.mdpi.com
+ 79662 www.degruyter.com
+ 75703 www.e-periodica.ch
+ 72206 dx.doi.org
+ 69068 escholarship.org
+ 67848 idus.us.es
+ 57907 zenodo.org
+ 56624 ir.opt.ac.cn
+ 54983 projecteuclid.org
+ 52226 rep.bntu.by
+ 48376 osf.io
+ 48009 pubs.rsc.org
+ 46947 publikationen.ub.uni-frankfurt.de
+ 45564 www.research-collection.ethz.ch
+ 45153 dk.um.si
+ 43313 www.ssoar.info
+ 40543 scholarworks.umt.edu
+
+TODO: cleanup ingest request table in sandcrawler-db:
+- remove filtered OAI-PMH prefixes
+- remove any invalid `base_url` (?)
+
+## More Seedlist (2022-02-08)
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt';
+ => COPY 444764
+
+ cat patch_terminal_url.2022-02-08.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-02-08.uniq.txt
+ => 426k 0:00:04 [ 103k/s]
+
+ cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 60123 www.degruyter.com
+ 59314 arxiv.org
+ 43674 zenodo.org
+ 17771 doi.org
+ 9501 linkinghub.elsevier.com
+ 9379 www.mdpi.com
+ 5691 opendata.uni-halle.de
+ 5578 scholarlypublishingcollective.org
+ 5451 era.library.ualberta.ca
+ 4982 www.cairn.info
+ 4306 www.taylorfrancis.com
+ 4189 papers.ssrn.com
+ 4157 apps.crossref.org
+ 4089 www.sciencedirect.com
+ 4033 mdpi-res.com
+ 3763 dlc.mpg.de
+ 3408 osf.io
+ 2603 www.frontiersin.org
+ 2594 watermark.silverchair.com
+ 2569 journals.lww.com
+ 1787 underline.io
+ 1680 archiviostorico.fondazione1563.it
+ 1658 www.jstage.jst.go.jp
+ 1611 cyberleninka.ru
+ 1535 www.schoeningh.de
+
+ cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule
+ => Done
+
+Copied to crawler svc206 and added to frontier.
+
+
+## Bulk Ingest Requests (2022-02-28)
+
+Note that we are skipping OAI-PMH here, because we just did a separate ingest
+for those.
+
+This is going to dump many duplicate lines (same `base_url`, multiple
+requests), but that is fine. Expecting something like 7 million rows.
+
+ COPY (
+ -- SELECT ingest_file_result.terminal_url
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND ingest_file_result.updated <= '2022-02-08'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ -- ingest_request.link_source = 'oai'
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json';
+ # COPY 3053219
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json
+ => DONE
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md
new file mode 100644
index 0000000..a6f08dd
--- /dev/null
+++ b/notes/ingest/2022-01-13_doi_crawl.md
@@ -0,0 +1,248 @@
+
+Could roll this in to current patch crawl instead of starting a new crawl from scratch.
+
+This file is misnamed; these are mostly non-DOI-specific small updates.
+
+## KBART "almost complete" experimentation
+
+Random 10 releases:
+
+ cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}'
+ https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone
+ https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed
+ https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works
+ https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern)
+ https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. force re-crawl worked for one copy
+ https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success
+ https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref
+ https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success
+ https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success
+ https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed
+
+Try some more!
+
+ https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success
+ https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success?
+ https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry
+ https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site
+ https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI
+ https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success
+ https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success
+ https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken
+ https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub)
+ https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success
+
+
+## Seeds: fixed OJS URLs
+
+Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like:
+
+- `no-pdf-link` with terminal URL like `/article/view/`
+- `redirect-loop` with terminal URL like `/article/view/`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json';
+ => COPY 326577
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json
+ cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Done/running.
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'link-loop'
+ )
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt';
+ => COPY 342415
+
+ cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule
+
+Done/seeded.
+
+## Seeds: scitemed.com
+
+Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%/article/view/%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json';
+ # SKIPPED
+
+Actually there are very few of these.
+
+## Seeds: non-OA paper DOIs
+
+There are many DOIs out there which are likely to be from small publishers, on
+the web, and would ingest just fine (eg, in OJS).
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count
+ 30,938,106
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count
+ 6,664,347
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count
+ 8,258,111
+
+Do the 8 million first, then maybe try the 30.9 million later? Do sampling to
+see how many are actually accessible? From experience with KBART generation,
+many of these are likely to crawl successfully.
+
+ ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz
+ # re-running 2022-02-08 after this VM was upgraded
+ # Expecting 8321448 release objects in search queries
+ # DONE
+
+This is large enough that it will probably be a bulk ingest, and then probably
+a follow-up crawl.
+
+## Seeds: HTML and XML links from HTML biblio
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \
+ | pv -l \
+ | rg '"(html|xml)_fulltext_url"' \
+ | rg '"no-pdf-link"' \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.json.gz
+
+ # cut this off at some point? gzip is terminated weird
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz | wc -l
+ # gzip: ingest_file_result_fulltext_urls.2022-01-13.json.gz: unexpected end of file
+ # 2,538,433
+
+Prepare seedlists (to include in heritrix patch crawl):
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+ | jq .html_biblio.xml_fulltext_url -r \
+ | rg '://' \
+ | sort -u -S 4G \
+ | pv -l \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz
+ # 1.24M 0:01:35 [12.9k/s]
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+ | jq .html_biblio.html_fulltext_url -r \
+ | rg '://' \
+ | sort -u -S 4G \
+ | pv -l \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz
+ # 549k 0:01:27 [6.31k/s]
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+ | cut -f3 -d/ \
+ | sort -S 4G \
+ | uniq -c \
+ | sort -nr \
+ | head -n20
+
+ 534005 dlc.library.columbia.edu
+ 355319 www.degruyter.com
+ 196421 zenodo.org
+ 101450 serval.unil.ch
+ 100631 biblio.ugent.be
+ 47986 digi.ub.uni-heidelberg.de
+ 39187 www.emerald.com
+ 33195 www.cairn.info
+ 25703 boris.unibe.ch
+ 19516 journals.openedition.org
+ 15911 academic.oup.com
+ 11091 repository.dl.itc.u-tokyo.ac.jp
+ 9847 oxfordworldsclassics.com
+ 9698 www.thieme-connect.de
+ 9552 www.idunn.no
+ 9265 www.zora.uzh.ch
+ 8030 www.scielo.br
+ 6543 www.hanspub.org
+ 6229 asmedigitalcollection.asme.org
+ 5651 brill.com
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+ | awk '{print "F+ " $1}' \
+ > ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+ wc -l ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+ 1785901 ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+Added to `JOURNALS-PATCH-CRAWL-2022-01`
+
+## Seeds: most doi.org terminal non-success
+
+Unless it is a 404, should retry.
+
+TODO: generate this list
+
+## Non-OA DOI Bulk Ingest
+
+Had previously run:
+
+ cat ingest_nonoa_doi.json.gz \
+ | rg -v "doi.org/10.2139/" \
+ | rg -v "doi.org/10.1021/" \
+ | rg -v "doi.org/10.1121/" \
+ | rg -v "doi.org/10.1515/" \
+ | rg -v "doi.org/10.1093/" \
+ | rg -v "europepmc.org" \
+ | pv -l \
+ | gzip \
+ > nonoa_doi.filtered.ingests.json.gz
+ # 7.35M 0:01:13 [99.8k/s]
+
+Starting a bulk ingest of these on 2022-03-18, which is *before* the crawl has
+entirely finished, but after almost all queues (domains) have been done for
+several days.
+
+ zcat nonoa_doi.filtered.ingests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Looks like many jstage `no-capture` status; these are still (slowly) crawling.
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md
new file mode 100644
index 0000000..9722459
--- /dev/null
+++ b/notes/ingest/2022-03_doaj.md
@@ -0,0 +1,278 @@
+
+plan:
+- usual setup and dump ingest requests
+- filter ingest requests to targeted ccTLDs, and add those to crawl first
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz'
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz
+ # 9.08M 0:37:38 [4.02k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373})
+
+
+## Check Pre-Crawl Status
+
+2022-03-09, before the above load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 2919808
+ html | wrong-scope | 1098998
+ pdf | no-pdf-link | 481532
+ pdf | redirect-loop | 429006
+ html | success | 342501
+ html | unknown-scope | 225390
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187762
+ html | no-capture | 185418
+ pdf | no-capture | 171273
+ pdf | null-body | 129028
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91551
+ pdf | link-loop | 25447
+ html | wrong-mimetype | 22640
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ pdf | wrong-mimetype | 7688
+ xml | success | 6897
+ html | petabox-error | 5529
+ pdf | wayback-error | 2706
+ xml | null-body | 2353
+ pdf | | 2063
+ pdf | wayback-content-error | 1349
+ html | cdx-error | 1169
+ pdf | cdx-error | 1130
+ pdf | petabox-error | 679
+ html | | 620
+ pdf | empty-blob | 562
+ html | blocked-cookie | 545
+ (30 rows)
+
+After the above load:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3036457
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108132
+ pdf | no-pdf-link | 485703
+ pdf | redirect-loop | 436085
+ html | success | 342594
+ html | unknown-scope | 225412
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187999
+ html | no-capture | 187310
+ pdf | no-capture | 172033
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91799
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6897
+ html | petabox-error | 5530
+ pdf | wayback-error | 2707
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 771
+ pdf | empty-blob | 562
+ (30 rows)
+
+Dump ingest requests for crawling (or bulk ingest first?):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json';
+ => COPY 353819
+
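+Not that many! The filters matter: `terminal_url NOT LIKE ...` is NULL (so the row is dropped) when there is no `ingest_file_result` row, which excludes all never-attempted (`status IS NULL`) requests and leaves only the `no-capture` ones. Unfiltered count for comparison: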
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ );
+ => 3202164
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json
+ => 353k 0:00:16 [21.0k/s]
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Dump seeds again (for crawling):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json';
+ # COPY 350661
+
+And stats again:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3037059
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108476
+ pdf | no-pdf-link | 485705
+ pdf | redirect-loop | 436850
+ html | success | 342762
+ html | unknown-scope | 225412
+ html | redirect-loop | 224683
+ html | html-resource-no-capture | 188058
+ html | no-capture | 185734
+ pdf | no-capture | 170452
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91875
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19042
+ html | terminal-bad-status | 13333
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6898
+ html | petabox-error | 5535
+ pdf | wayback-error | 2711
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 772
+ html | blocked-cookie | 769
+ (30 rows)
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json
+
+Create seedlist:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | jq -r .base_url \
+ | sort -u -S 4G \
+ > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt
+
+Sent off and added to the `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl; will
+re-ingest when that completes (a week or two?).
+
+
+## Bulk Ingest
+
+After `TARGETED-ARTICLE-CRAWL-2022-03` wrap-up.
+
+ # 2022-03-22
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md
new file mode 100644
index 0000000..d2a8d71
--- /dev/null
+++ b/notes/ingest/2022-03_oaipmh.md
@@ -0,0 +1,40 @@
+
+Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl.
+
+Note that Martin excluded many Indonesian endpoints, will need to follow up on
+those.
+
+## Prep
+
+Fetch metadata snapshot:
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst
+
+Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large):
+
+ zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \
+ | rg -v 'oai:kb.dk:' \
+ | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \
+ | rg -v 'oai:hispana.mcu.es:' \
+ | rg -v 'oai:bnf.fr:' \
+ | rg -v 'oai:ukm.si:' \
+ | rg -v 'oai:biodiversitylibrary.org:' \
+ | rg -v 'oai:hsp.org:' \
+ | rg -v 'oai:repec:' \
+ | rg -v 'oai:n/a:' \
+ | rg -v 'oai:quod.lib.umich.edu:' \
+ | rg -v 'oai:americanae.aecid.es:' \
+ | rg -v 'oai:www.irgrid.ac.cn:' \
+ | rg -v 'oai:espace.library.uq.edu:' \
+ | rg -v 'oai:edoc.mpg.de:' \
+ | rg -v 'oai:bibliotecadigital.jcyl.es:' \
+ | rg -v 'oai:repository.erciyes.edu.tr:' \
+ | rg -v 'oai:krm.or.kr:' \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz
+
+These failed to transform in the expected way; a change in JSON schema from last time?
diff --git a/notes/ingest/2022-04_targeted.md b/notes/ingest/2022-04_targeted.md
new file mode 100644
index 0000000..23fd35f
--- /dev/null
+++ b/notes/ingest/2022-04_targeted.md
@@ -0,0 +1,144 @@
+
+Want to do a crawl similar to recent "patch" crawls, where we run heritrix
+crawls to "fill in" missing (`no-capture`) and failed daily ingests (aka,
+those requests coming from fatcat-changelog).
+
+ export PATCHDATE=2022-04-20
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-04
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-04-20.rows.json';
+ # COPY 4842749
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v www.archive.org \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ # 4.75M 0:01:44 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 1515829 www.jstage.jst.go.jp
+ 1052953 doi.org
+ 241704 arxiv.org
+ 219543 www.sciencedirect.com
+ 178562 www.persee.fr
+ 84947 zenodo.org
+ 67397 www.mdpi.com
+ 65775 journals.lww.com
+ 58216 opg.optica.org
+ 50673 osf.io
+ 45776 www.degruyter.com
+ 36664 www.indianjournals.com
+ 35287 pubs.rsc.org
+ 33495 www.bmj.com
+ 33320 www.research-collection.ethz.ch
+ 29728 www.e-periodica.ch
+ 28338 iopscience.iop.org
+ 26364 www.cambridge.org
+ 23840 onlinelibrary.wiley.com
+ 23641 platform.almanhal.com
+ 22660 brill.com
+ 20288 www.osapublishing.org
+ 18561 cgscholar.com
+ 18539 doi.nrct.go.th
+ 15677 www.frontiersin.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+TODO: starting with the "quarterly retry" script/query might make more sense?
+TODO: are there any cases where we do a bulk ingest request, fail, and `terminal_url` is not set?
+
+## Bulk Ingest Requests (post-crawl)
+
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json
+ => 4.84M 0:03:14 [24.9k/s]
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => started 2022-05-11
diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md
new file mode 100644
index 0000000..bc78998
--- /dev/null
+++ b/notes/ingest/2022-04_unpaywall.md
@@ -0,0 +1,278 @@
+
+New unpaywall snapshot from `2022-03-09`.
+
+This will probably be the last unpaywall crawl? Will switch to openalex in the
+future, because we can automate that ingest process, and run it on our own
+schedule.
+
+ export SNAPSHOT=2022-03-09
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=UNPAYWALL-CRAWL-2022-04
+
+## Download and Archive
+
+ wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz'
+ # 2022-04-09 22:31:43 (98.9 KB/s) - ‘unpaywall_snapshot_2022-03-09T083001.jsonl.gz’ saved [29470830470/29470830470]
+
+ export SNAPSHOT=2022-03-09
+ ia upload unpaywall_snapshot_$SNAPSHOT unpaywall_snapshot_$SNAPSHOT*.jsonl.gz -m title:"Unpaywall Metadata Snapshot ($SNAPSHOT)" -m collection:ia_biblio_metadata -m creator:creator -m date:$SNAPSHOT
+
+ # if needed
+ scp unpaywall_snapshot_$SNAPSHOT*.jsonl.gz wbgrp-svc506.us.archive.org:/srv/sandcrawler/tasks
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv shell
+
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT*.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json
+ # 34.9M 3:02:32 [3.19k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ # 34.9M 5:23:15 [1.80k/s]
+ # Worker: Counter({'total': 34908779, 'insert-requests': 6129630, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 34908779, 'pushed': 34908779})
+
+So about 6.1M new ingest request rows.
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- take "all time" instead of just this recent capture
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2022-03-09.rows.json';
+ => COPY 6025671
+
+ # transform
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json
+ # 6.03M 0:03:26 [29.1k/s]
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3330232
+ success | 2455102
+ redirect-loop | 197117
+ terminal-bad-status | 82618
+ no-pdf-link | 33046
+ blocked-cookie | 16078
+ link-loop | 6745
+ wrong-mimetype | 3416
+ wayback-error | 1385
+ empty-blob | 1142
+ cdx-error | 820
+ body-too-large | 292
+ bad-gzip-encoding | 281
+ wayback-content-error | 267
+ | 253
+ petabox-error | 215
+ skip-url-blocklist | 185
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+After prior "TARGETED" crawl and bulk ingest finished:
+
+ status | count
+ -------------------------+---------
+ no-capture | 3330055
+ success | 2455279
+ redirect-loop | 197117
+ terminal-bad-status | 82618
+ no-pdf-link | 33046
+ blocked-cookie | 16079
+ link-loop | 6745
+ wrong-mimetype | 3416
+ wayback-error | 1385
+ empty-blob | 1142
+ cdx-error | 820
+ body-too-large | 292
+ bad-gzip-encoding | 281
+ wayback-content-error | 267
+ | 253
+ petabox-error | 215
+ skip-url-blocklist | 185
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+Almost no change, which makes sense because of the `ingest_request.created`
+filter.
+
+
+## Dump Seedlist
+
+Dump rows for crawling:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ -- AND date(ingest_request.created) > '2022-04-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json';
+ => before ingest and arxiv.org DOI exclusion: COPY 3309091
+ => COPY 3308914
+
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json
+ => 3.31M 0:02:22 [23.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT*
+ 15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt
+ 3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json
+ 3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt
+ 3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt
+
+Inject seedlist into crawler:
+
+ scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+Top domains?
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c | sort -nr | head -n20
+ 158497 www.scielo.br
+ 144732 onlinelibrary.wiley.com
+ 129349 www.researchsquare.com
+ 94923 hal.archives-ouvertes.fr
+ 69293 openresearchlibrary.org
+ 64584 www.cell.com
+ 60033 link.springer.com
+ 50528 www.degruyter.com
+ 49737 projecteuclid.org
+ 45841 www.jstage.jst.go.jp
+ 44819 www.mdpi.com
+ 44325 ieeexplore.ieee.org
+ 38091 dr.lib.iastate.edu
+ 31030 www.nature.com
+ 30300 discovery.ucl.ac.uk
+ 27692 ntrs.nasa.gov
+ 24215 orca.cardiff.ac.uk
+ 23653 www.frontiersin.org
+ 23474 pure.rug.nl
+ 22660 www.sciencedirect.com
+
+
+## Post-Crawl bulk ingest
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # done: 2022-07-06
+
+## Post-Crawl, Post-Ingest Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 4784948 => +2,329,669 ~77%
+ redirect-loop | 485270 => + 288,153 ~10%
+ no-capture | 317598 => -3,012,457
+ terminal-bad-status | 267853 => + 185,235 ~ 6%
+ no-pdf-link | 118303 => + 85,257
+ blocked-cookie | 111373 => + 95,294
+ skip-url-blocklist | 19368
+ link-loop | 9091
+ wrong-mimetype | 7163
+ cdx-error | 2516
+ empty-blob | 1961
+ wayback-error | 1922
+ body-too-large | 509
+ petabox-error | 416
+ wayback-content-error | 341
+ bad-gzip-encoding | 281
+ | 253
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+Groovy!
diff --git a/notes/ingest/2022-07-15_ingest_fixes.md b/notes/ingest/2022-07-15_ingest_fixes.md
new file mode 100644
index 0000000..ec31a7d
--- /dev/null
+++ b/notes/ingest/2022-07-15_ingest_fixes.md
@@ -0,0 +1,831 @@
+
+## HTML `html-resource-no-capture` Fixes
+
+Tracing down some `html-resource-no-capture` issues. Eg, `javascript:` resources causing errors.
+
+SQL query:
+
+ select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' limit 100;
+ select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' order by random() limit 100;
+
+ select count(*) from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture';
+ => 210,528
+
+http://agroengineering.it/index.php/jae/article/view/568/609
+- old capture, from `20171017204935`
+- missing .css file; seems like an actual case of missing content?
+- TODO: re-crawl/re-ingest when CDX is old
+
+https://www.karger.com/Article/FullText/484130
+- missing: https://www.karger.com/WebMaterial/ShowThumbnail/895999?imgType=2
+- resource is live
+- this was from DOI-LANDING crawl, no resources captured
+- TODO: re-crawl
+
+https://www.mdpi.com/1996-1073/13/21/5563/htm
+- missing: https://www.mdpi.com/1996-1073/13/21/5563/htm
+- common crawl capture; no/few resources?
+- TODO: re-crawl
+
+http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-736X2013000500011&lng=en&tlng=en
+- missing: http://www.scielo.br/img/revistas/pvb/v33n5/a11tab01.jpg
+ not on live web
+- old (2013) wide crawl
+- TODO: re-crawl
+
+http://g3journal.org/lookup/doi/10.1534/g3.116.027730
+- missing: http://www.g3journal.org/sites/default/files/highwire/ggg/6/8/2553/embed/mml-math-4.gif
+- old 2018 landing crawl (no resources)
+- TODO: re-crawl
+
+https://www.frontiersin.org/articles/10.3389/fimmu.2020.576134/full
+- "error_message": "revisit record missing URI and/or DT: warc:abc.net.au-news-20220328-130654/IA-FOC-abc.net.au-news-20220618135308-00003.warc.gz offset:768320762"
+- specific URL: https://www.frontiersin.org/areas/articles/js/app?v=uC9Es8wJ9fbTy8Rj4KipiyIXvhx7XEVhCTHvIrM4ShA1
+- archiveteam crawl
+- seems like a weird corner case. look at more 'frontiersin' articles, and re-crawl this page
+
+https://www.frontiersin.org/articles/10.3389/fonc.2020.01386/full
+- WORKING
+
+https://doi.org/10.4000/trajectoires.2317
+- redirect: https://journals.openedition.org/trajectoires/2317
+- missing: "https://journals.openedition.org/trajectoires/Ce fichier n'existe pas" (note spaces)
+- FIXED
+
+http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S1413-81232002000200008&lng=en&tlng=en
+- WORKING
+
+https://f1000research.com/articles/9-571/v2
+- petabox-error on 'https://www.recaptcha.net/recaptcha/api.js'
+- added recaptcha.net to blocklist
+- still needs a re-crawl
+- SPN capture, from 2020, but images were missing?
+- re-capture has images (though JS still wonky)
+- TODO: re-crawl with SPN2
+
+http://bio.biologists.org/content/4/9/1163
+- DOI LANDING crawl, no sub-resources
+- TODO: recrawl
+
+http://err.ersjournals.com/content/26/145/170039.full
+- missing: http://err.ersjournals.com/sites/default/files/highwire/errev/26/145/170039/embed/graphic-5.gif
+ on live web
+- 2017 targeted heritrix crawl
+- TODO: recrawl
+
+http://www.dovepress.com/synthesis-characterization-and-antimicrobial-activity-of-an-ampicillin-peer-reviewed-article-IJN
+- missing: https://www.dovepress.com/cr_data/article_fulltext/s61000/61143/img/IJN-61143-F02-Thumb.jpg
+- recent archiveteam crawl
+- TODO: recrawl
+
+http://journals.ed.ac.uk/lithicstudies/article/view/1444
+- missing: http://journals.ed.ac.uk/lithicstudies/article/download/1444/2078/6081
+- common crawl
+- TODO: recrawl
+
+http://medisan.sld.cu/index.php/san/article/view/495
+- missing: http://ftp.scu.sld.cu/galen/medisan/logos/redib.jpg
+- this single resource is legit missing
+
+seems like it probably isn't a bad idea to just re-crawl all of these with fresh SPNv2 requests
+
+request sources:
+- fatcat-changelog (doi)
+- fatcat-ingest (doi)
+- doaj
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'html'
+ AND ingest_file_result.status = 'html-resource-no-capture'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json';
+ => COPY 210749
+
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json
+
+Try a sample of 300:
+
+ shuf -n300 /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Seeing a bunch of:
+
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fphys.2020.00454/full","https://www.frontiersin.org/articles/10.3389/fphys.2020.00454/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"]
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fmicb.2019.02507/full","https://www.frontiersin.org/articles/10.3389/fmicb.2019.02507/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"]
+ ["doaj","wayback-content-error","https://www.mdpi.com/2218-1989/10/9/366","https://www.mdpi.com/2218-1989/10/9/366/htm","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:964129887"]
+
+ "error_message": "revisit record missing URI and/or DT: warc:online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz offset:751923069",
+
+
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fnins.2020.00724/full","https://www.frontiersin.org/articles/10.3389/fnins.2020.00724/full","wayback payload sha1hex mismatch: 20220715222216 https://static.frontiersin.org/areas/articles/js/app?v=DfnFHSIgqDJBKQy2bbQ2S8vWyHe2dEMZ1Lg9o6vSS1g1"]
+
+These seem to be transfer encoding issues; fixed?
+
+ ["doaj","html-resource-no-capture","http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S0021-25712013000400003&lng=en&tlng=en","https://scielosp.org/article/aiss/2013.v49n4/336-339/en/","HTML sub-resource not found: https://ssm.scielo.org/media/assets/css/scielo-print.css"]
+
+Full batch:
+
+ # TODO: cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Not running the full batch for now, because these are almost all `wayback-content-error` issues.
+
+ cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | wc -l
+ 114935
+
+ cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+
+## Redirect Loops
+
+Seems like there might have been a bug in how the ingest pipeline dealt with
+multiple redirects (eg, 301 to 302 or vice-versa), due to how CDX lookups and
+normalization were happening.
+
+This could be a really big deal because we have over 11 million such ingest
+requests! and may even have stopped crawling domains on the basis of redirect
+looping.
+
+ select * from ingest_file_result where ingest_type = 'pdf' and status = 'redirect-loop' limit 50;
+
+http://ieeexplore.ieee.org/iel7/7259950/7275573/07275755.pdf
+- 'skip-url-blocklist'
+- paywall on live web
+
+http://www.redjournal.org/article/S0360301616308276/pdf
+- redirect to 'secure.jbs.elsevierhealth.com'
+- ... but re-crawling with SPNv2 worked
+- TODO: reingest this entire journal with SPNv2
+
+http://www.jmirs.org/article/S1939865415001551/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+http://www.cell.com/article/S0006349510026147/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- TODO: try SPNv2?
+- RECRAWL: success
+
+http://infoscience.epfl.ch/record/256431/files/SPL_2018.pdf
+- FIXED: success
+
+http://www.nature.com/articles/hdy1994143.pdf
+- blocked-cookie (idp.nature.com / cookies_not_supported)
+- RECRAWL: gateway-timeout
+
+http://www.thelancet.com/article/S0140673619327606/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+https://pure.mpg.de/pubman/item/item_2065970_2/component/file_2065971/Haase_2014.pdf
+- FIXED: success
+
+http://hdl.handle.net/21.11116/0000-0001-B1A2-F
+- FIXED: success
+
+http://repositorio.ufba.br/ri/bitstream/ri/6072/1/%2858%29v21n6a03.pdf
+- FIXED: success
+
+http://www.jto.org/article/S1556086416329999/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL spn2: success
+
+http://www.jahonline.org/article/S1054139X16303020/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL spn2: success
+
+So, wow wow wow, a few things to do here:
+
+- just re-try all these redirect-loop attempts to update status
+- re-ingest all these elsevierhealth blocked crawls with SPNv2. this could take a long time!
+
+Possibly the elsevierhealth stuff will require some deeper fiddling to crawl
+correctly.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'redirect-loop'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json';
+ => COPY 6611342
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json
+
+Start with a sample:
+
+ shuf -n200 /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Wow that is a lot of ingest! And a healthy fraction of 'success', almost all
+via unpaywall (maybe should have done DOAJ/DOI only first). Let's do this full
+batch:
+
+ cat /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+TODO: repeat with broader query (eg, OAI-PMH, MAG, etc).
+
+## Other
+
+Revist resolution failed: \"Didn't get exact CDX url/datetime match. url:https://www.cairn.info/static/images//logo/logo-cairn-negatif.png dt:20220430145322 got:CdxRow(surt='info,cairn)/static/images/logo/logo-cairn-negatif.png', datetime='20220430145322', url='https://www.cairn.info/static/images/logo/logo-cairn-negatif.png', mimetype='image/png', status_code=200, sha1b32='Y3VQOPO2NFUR2EUWNXLYGYGNZPZLQYHU', sha1hex='c6eb073dda69691d12966dd78360cdcbf2b860f4', warc_csize=10875, warc_offset=2315284914, warc_path='archiveteam_archivebot_go_20220430212134_59230631/old.worldurbancampaign.org-inf-20220430-140628-acnq5-00000.warc.gz')\""
+
+ https://www.cairn.info/static/images//logo/logo-cairn-negatif.png 20220430145322
+ https://www.cairn.info/static/images/logo/logo-cairn-negatif.png 20220430145322
+
+Fixed!
+
+
+## Broken WARC Record?
+
+cdx line:
+
+ net,cloudfront,d1bxh8uas1mnw7)/assets/embed.js 20220716084026 https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js warc/revisit - U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB - - 660 751923069 online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz
+
+download WARC and run:
+
+ zcat IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz | rg d1bxh8uas1mnw7.cloudfront.net/assets/embed.js -a -C 20
+
+the WARC record:
+
+ WARC/1.0
+ WARC-Type: revisit
+ WARC-Target-URI: https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js
+ WARC-Date: 2022-07-16T08:40:26Z
+ WARC-Payload-Digest: sha1:U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB
+ WARC-IP-Address: 13.227.21.220
+ WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+ WARC-Truncated: length
+ WARC-Record-ID: <urn:uuid:cc79139e-d43f-4b43-9b9e-f923610344d0>
+ Content-Type: application/http; msgtype=response
+ Content-Length: 493
+
+ HTTP/1.1 200 OK
+ Content-Type: application/javascript
+ Content-Length: 512
+ Connection: close
+ Last-Modified: Fri, 22 Apr 2022 08:45:38 GMT
+ Accept-Ranges: bytes
+ Server: AmazonS3
+ Date: Fri, 15 Jul 2022 16:36:08 GMT
+ ETag: "1c28db48d4012f0221b63224a3bb7137"
+ Vary: Accept-Encoding
+ X-Cache: Hit from cloudfront
+ Via: 1.1 5b475307685b5cecdd0df414286f5438.cloudfront.net (CloudFront)
+ X-Amz-Cf-Pop: SFO20-C1
+ X-Amz-Cf-Id: SIRR_1LT8mkp3QVaiGYttPuomxyDfJ-vB6dh0Slg_qqyW0_WwnA1eg==
+ Age: 57859
+
+where are the `WARC-Refers-To-Target-URI` and `WARC-Refers-To-Date` lines?
+
+## osf.io
+
+ select status, terminal_status_code, count(*) from ingest_file_result where base_url LIKE 'https://doi.org/10.17605/osf.io/%' and ingest_type = 'pdf' group by status, terminal_status_code order by count(*) desc limit 30;
+
+ status | terminal_status_code | count
+ -------------------------+----------------------+-------
+ terminal-bad-status | 404 | 92110
+ no-pdf-link | 200 | 46932
+ not-found | 200 | 20212
+ no-capture | | 8599
+ success | 200 | 7604
+ redirect-loop | 301 | 2125
+ terminal-bad-status | 503 | 1657
+ cdx-error | | 1301
+ wrong-mimetype | 200 | 901
+ terminal-bad-status | 410 | 364
+ read-timeout | | 167
+ wayback-error | | 142
+ gateway-timeout | | 139
+ terminal-bad-status | 500 | 76
+ spn2-error | | 63
+ spn2-backoff | | 42
+ petabox-error | | 39
+ spn2-backoff | 200 | 27
+ redirect-loop | 302 | 19
+ terminal-bad-status | 400 | 15
+ terminal-bad-status | 401 | 15
+ remote-server-error | | 14
+ timeout | | 11
+ terminal-bad-status | | 11
+ petabox-error | 200 | 10
+ empty-blob | 200 | 8
+ null-body | 200 | 6
+ spn2-error:unknown | | 5
+ redirect-loop | 308 | 4
+ spn2-cdx-lookup-failure | | 4
+ (30 rows)
+
+Many of these are now non-existent, or datasets/registrations not articles.
+Hrm.
+
+
+## Large DOAJ no-pdf-link Domains
+
+ SELECT
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain,
+ COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_request.base_url = ingest_file_result.base_url
+ WHERE
+ ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source = 'doaj'
+ GROUP BY
+ domain
+ ORDER BY
+ COUNT(*) DESC
+ LIMIT 50;
+
+ domain | count
+ -------------------------------------------------------+--------
+ www.sciencedirect.com | 211090
+ auth.openedition.org | 20741
+ journal.frontiersin.org:80 | 11368
+ journal.frontiersin.org | 6494
+ ejde.math.txstate.edu | 4301
+ www.arkat-usa.org | 4001
+ www.scielo.br | 3736
+ www.lcgdbzz.org | 2892
+ revistas.uniandes.edu.co | 2715
+ scielo.sld.cu | 2612
+ www.egms.de | 2488
+ journals.lww.com | 2415
+ ter-arkhiv.ru | 2239
+ www.kitlv-journals.nl | 2076
+ www.degruyter.com | 2061
+ jwcn-eurasipjournals.springeropen.com | 1929
+ www.cjcnn.org | 1908
+ www.aimspress.com | 1885
+ vsp.spr-journal.ru | 1873
+ dx.doi.org | 1648
+ www.dlib.si | 1582
+ aprendeenlinea.udea.edu.co | 1548
+ www.math.u-szeged.hu | 1448
+ dergipark.org.tr | 1444
+ revistas.uexternado.edu.co | 1429
+ learning-analytics.info | 1419
+ drive.google.com | 1399
+ www.scielo.cl | 1326
+ www.economics-ejournal.org | 1267
+ www.jssm.org | 1240
+ html.rhhz.net | 1232
+ journalofinequalitiesandapplications.springeropen.com | 1214
+ revistamedicina.net | 1197
+ filclass.ru | 1154
+ ceramicayvidrio.revistas.csic.es | 1152
+ gynecology.orscience.ru | 1126
+ www.tobaccoinduceddiseases.org | 1090
+ www.tandfonline.com | 1046
+ www.querelles-net.de | 1038
+ www.swjpcc.com | 1032
+ microbiologyjournal.org | 1028
+ revistas.usal.es | 1027
+ www.medwave.cl | 1023
+ ijtech.eng.ui.ac.id | 1023
+ www.scielo.sa.cr | 1021
+ vestnik.szd.si | 986
+ www.biomedcentral.com:80 | 984
+ scielo.isciii.es | 983
+ bid.ub.edu | 970
+ www.meirongtv.com | 959
+ (50 rows)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ejde.math.txstate.edu%' limit 5;
+ http://ejde.math.txstate.edu/Volumes/2018/30/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2012/137/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2016/268/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2015/194/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2014/43/abstr.html
+ # plain HTML, not really parse-able
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.arkat-usa.org%' limit 5;
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0013.909
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0007.717
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.p008.158
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0014.216
+ # fixed (embed PDF)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.scielo.br%' limit 5;
+ https://doi.org/10.5935/0034-7280.20200075
+ https://doi.org/10.5935/0004-2749.20200071
+ https://doi.org/10.5935/0034-7280.20200035
+ http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1516-44461999000400014
+ https://doi.org/10.5935/0034-7280.20200047
+ # need recrawls?
+ # then success
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.lcgdbzz.org%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://revistas.uniandes.edu.co%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://scielo.sld.cu%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.egms.de%' limit 5;
+ https://doi.org/10.3205/16dgnc020
+ http://nbn-resolving.de/urn:nbn:de:0183-19degam1126
+ http://www.egms.de/en/meetings/dgpraec2019/19dgpraec032.shtml
+ http://www.egms.de/en/meetings/dkou2019/19dkou070.shtml
+ http://nbn-resolving.de/urn:nbn:de:0183-20nrwgu625
+ # mostly abstracts, don't have PDF versions
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ter-arkhiv.ru%' limit 5;
+ https://doi.org/10.26442/terarkh201890114-47
+ https://doi.org/10.26442/00403660.2019.12.000206
+ https://journals.eco-vector.com/0040-3660/article/download/32246/pdf
+ https://journals.eco-vector.com/0040-3660/article/download/33578/pdf
+ https://doi.org/10.26442/00403660.2019.12.000163
+ # working, needed recrawls (some force re-crawls)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.kitlv-journals.nl%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.cjcnn.org%' limit 5;
+
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.dlib.si%' limit 5;
+ https://srl.si/ojs/srl/article/view/2910
+ https://srl.si/ojs/srl/article/view/3640
+ https://srl.si/ojs/srl/article/view/2746
+ https://srl.si/ojs/srl/article/view/2557
+ https://srl.si/ojs/srl/article/view/2583
+ # fixed? (dlib.si)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.jssm.org%' limit 5;
+ http://www.jssm.org/vol4/n4/8/v4n4-8text.php
+ http://www.jssm.org/vol7/n1/19/v7n1-19text.php
+ http://www.jssm.org/vol9/n3/10/v9n3-10text.php
+ http://www.jssm.org/abstresearcha.php?id=jssm-14-347.xml
+ http://www.jssm.org/vol7/n2/11/v7n2-11text.php
+ # works as an HTML document? otherwise hard to select on PDF link
+
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://filclass.ru%' limit 5;
+ https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism
+ https://filclass.ru/en/archive/2015/42/training-as-an-effective-form-of-preparation-for-the-final-essay
+ https://filclass.ru/en/archive/2020/vol-25-3/didaktizatsiya-literatury-rossijskikh-nemtsev-zanyatie-po-poeme-viktora-klyajna-jungengesprach
+ https://filclass.ru/en/archive/2015/40/the-communicative-behaviour-of-the-russian-intelligentsia-and-its-reflection-in-reviews-as-a-genre-published-in-online-literary-journals-abroad
+ https://filclass.ru/en/archive/2016/46/discoursive-means-of-implication-of-instructive-components-within-the-anti-utopia-genre
+ # fixed
+ # TODO: XXX: re-crawl/ingest
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://microbiologyjournal.org%' limit 5;
+ https://microbiologyjournal.org/the-relationship-between-the-type-of-infection-and-antibiotic-resistance/
+ https://microbiologyjournal.org/antimicrobial-resistant-shiga-toxin-producing-escherichia-coli-isolated-from-ready-to-eat-meat-products-and-fermented-milk-sold-in-the-formal-and-informal-sectors-in-harare-zimbabwe/
+ https://microbiologyjournal.org/emerging-antibiotic-resistance-in-mycoplasma-microorganisms-designing-effective-and-novel-drugs-therapeutic-targets-current-knowledge-and-futuristic-prospects/
+ https://microbiologyjournal.org/microbiological-and-physicochemicalpropertiesofraw-milkproduced-from-milking-to-delivery-to-milk-plant/
+ https://microbiologyjournal.org/association-of-insulin-based-insulin-resistance-with-liver-biomarkers-in-type-2-diabetes-mellitus/
+ # HTML article, no PDF
+ # ... but only sometimes
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.medwave.cl%' limit 5;
+ http://www.medwave.cl/link.cgi/Medwave/Perspectivas/Cartas/6878
+ https://www.medwave.cl/link.cgi/Medwave/Revisiones/RevisionClinica/8037.act
+ http://dx.doi.org/10.5867/medwave.2012.03.5332
+ https://www.medwave.cl/link.cgi/Medwave/Estudios/Casos/7683.act
+ http://www.medwave.cl/link.cgi/Medwave/Revisiones/CAT/5964
+ # HTML article, no PDF
+
+Re-ingest HTML:
+
+ https://fatcat.wiki/container/mafob4ewkzczviwipyul7knndu (DONE)
+ https://fatcat.wiki/container/6rgnsrp3rnexdoks3bxcmbleda (DONE)
+
+Re-ingest PDF:
+
+ doi_prefix:10.5935 (DONE)
+ doi_prefix:10.26442
+
+## More Scielo
+
+More scielo? `doi_prefix:10.5935 in_ia:false`
+
+ http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873
+ # OJS? fixed
+
+ https://revistas.unicentro.br/index.php/repaa/article/view/2667/2240
+ # working, but needed re-crawl
+
+ http://www.rbcp.org.br/details/2804/piezoelectric-preservative-rhinoplasty--an-alternative-approach-for-treating-bifid-nose-in-tessier-no--0-facial-cleft
+
+A few others, mostly now working
+
+## Recent OA DOIs
+
+ fatcat-cli search release 'is_oa:true (type:article-journal OR type:article OR type:paper-conference) !doi_prefix:10.5281 !doi_prefix:10.6084 !doi_prefix:10.48550 !doi_prefix:10.25446 !doi_prefix:10.25384 doi:* date:>2022-06-15 date:<2022-07-15 in_ia:false !publisher_type:big5' --index-json --limit 0 | pv -l > recent_missing_oa.json
+
+ wc -l recent_missing_oa.json
+ 24433
+
+ cat recent_missing_oa.json | jq .doi_prefix -r | sort | uniq -c | sort -nr | head
+ 4968 10.3390
+ 1261 10.1080
+ 687 10.23668
+ 663 10.1021
+ 472 10.1088
+ 468 10.4000
+ 367 10.3917
+ 357 10.1364
+ 308 10.4230
+ 303 10.17863
+
+ cat recent_missing_oa.json | jq .doi_registrar -r | sort | uniq -c | sort -nr
+ 19496 crossref
+ 4836 datacite
+ 101 null
+
+ cat recent_missing_oa.json | jq .publisher_type -r | sort | uniq -c | sort -nr
+ 9575 longtail
+ 8419 null
+ 3861 society
+ 822 unipress
+ 449 oa
+ 448 scielo
+ 430 commercial
+ 400 repository
+ 22 other
+ 7 archive
+
+ cat recent_missing_oa.json | jq .publisher -r | sort | uniq -c | sort -nr | head
+ 4871 MDPI AG
+ 1107 Informa UK (Taylor & Francis)
+ 665 EAG-Publikationen
+ 631 American Chemical Society
+ 451 IOP Publishing
+ 357 The Optical Society
+ 347 OpenEdition
+ 309 CAIRN
+ 308 Schloss Dagstuhl - Leibniz-Zentrum für Informatik
+ 303 Apollo - University of Cambridge Repository
+
+ cat recent_missing_oa.json | jq .container_name -r | sort | uniq -c | sort -nr | head
+ 4908 null
+ 378 Sustainability
+ 327 ACS Omega
+ 289 Optics Express
+ 271 International Journal of Environmental Research and Public Health
+ 270 International Journal of Health Sciences
+ 238 Sensors
+ 223 International Journal of Molecular Sciences
+ 207 Molecules
+ 193 Proceedings of the National Academy of Sciences of the United States of America
+
+ cat recent_missing_oa.json \
+ | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \
+ | wc -l
+ 16558
+
+ cat recent_missing_oa.json | rg -i mdpi | shuf -n10 | jq .doi -r
+ 10.3390/molecules27144419
+ => was a 404
+ => recrawl was successful
+ 10.3390/math10142398
+ => was a 404
+ 10.3390/smartcities5030039
+ => was a 404
+
+Huh, we need to re-try/re-crawl MDPI URLs every week or so? Or special-case this situation.
+Could be just a fatcat script, or a sandcrawler query.
+
+ cat recent_missing_oa.json \
+ | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \
+ | shuf -n10 | jq .doi -r
+
+ https://doi.org/10.18452/24860
+ => success (just needed quarterly retry?)
+ => b8c6c86aebd6cd2d85515441bbce052bcff033f2 (not in fatcat.wiki)
+ => current status is "bad-redirect"
+ https://doi.org/10.26181/20099540.v1
+ => success
+ => 3f9b1ff2a09f3ea9051dbbef277579e8a0b4df30
+ => this is figshare, and versioned. PDF was already attached to another DOI: https://doi.org/10.26181/20099540
+ https://doi.org/10.4230/lipics.sea.2022.22
+ => there is a bug resulting in trailing slash in `citation_pdf_url`
+ => fixed as a quirks mode
+ => emailed to report
+ https://doi.org/10.3897/aca.5.e89679
+ => success
+ => e6fd1e066c8a323dc56246631748202d5fb48808
+ => current status is 'bad-redirect'
+ https://doi.org/10.1103/physrevd.105.115035
+ => was 404
+ => success after force-recrawl of the terminal URL (not base URL)
+ https://doi.org/10.1155/2022/4649660
+ => was 404
+ => success after force-recrawl (of base_url)
+ https://doi.org/10.1090/spmj/1719
+ => paywall (not actually OA)
+ => https://fatcat.wiki/container/x6jfhegb3fbv3bcbqn2i3espiu is on Szczepanski list, but isn't all OA?
+ https://doi.org/10.1139/as-2022-0011
+ => was no-pdf-link
+ => fixed fulltext URL extraction
+ => still needed to re-crawl terminal PDF link? hrm
+ https://doi.org/10.31703/grr.2022(vii-ii).02
+ => was no-pdf-link
+ => fixed! success
+ https://doi.org/10.1128/spectrum.00154-22
+ => was 404
+ => now repeatably 503, via SPN
+ https://doi.org/10.51601/ijersc.v3i3.393
+ => 503 server error
+ https://doi.org/10.25416/ntr.20137379.v1
+ => is figshare
+ => docx (not PDF)
+ https://doi.org/10.25394/pgs.20263698.v1
+ => figshare
+ => embargo'd
+ https://doi.org/10.24850/j-tyca-14-4-7
+ => was no-pdf-link
+ => docs.google.com/viewer (!)
+ => now handle this (success)
+ https://doi.org/10.26267/unipi_dione/1832
+ => was bad-redirect
+ => success
+ https://doi.org/10.25560/98019
+ => body-too-large
+ => also, PDF metadata fails to parse
+ => is actually like 388 MByte
+ https://doi.org/10.14738/abr.106.12511
+ => max-hops-exceeded
+ => bumped max-hops from 6 to 8
+ => then success (via google drive)
+ https://doi.org/10.24350/cirm.v.19933803
+ => video, not PDF
+ https://doi.org/10.2140/pjm.2022.317.67
+ => link-loop
+ => not actually OA
+ https://doi.org/10.26265/polynoe-2306
+ => was bad-redirect
+ => now success
+ https://doi.org/10.3389/fpls.2022.826875
+ => frontiers
+ => was terminal-bad-status (403)
+ => success on retry (not sure why)
+ => maybe this is also a date-of-publication thing?
+ => not sure all these should be retried though
+ https://doi.org/10.14198/medcom.22240
+ => was terminal-bad-status (404)
+ => force-recrawl resulted in an actual landing page, but still no-pdf-link
+ => but actual PDF is a real 404, it seems. oh well
+ https://doi.org/10.31729/jnma.7579
+ => no-capture
+ https://doi.org/10.25373/ctsnet.20146931.v2
+ => figshare
+ => video, not document or PDF
+ https://doi.org/10.1007/s42600-022-00224-0
+ => not yet crawled/attempted (!)
+ => springer
+ => not actually OA
+ https://doi.org/10.37391/ijeer.100207
+ => some upstream issue (server not found)
+ https://doi.org/10.1063/5.0093946
+ => aip.scitation.org, is actually OA (can download in browser)
+ => cookie trap?
+ => redirect-loop (seems like a true redirect loop)
+ => retrying the terminal PDF URL seems to have worked
+ https://doi.org/10.18502/jchr.v11i2.9998
+ => no actual fulltext on publisher site
+ https://doi.org/10.1128/spectrum.01144-22
+ => this is a 503 error, even after retrying. weird!
+
+DONE: check `publisher_type` in chocula for:
+- "MDPI AG"
+- "Informa UK (Taylor & Francis)"
+
+ cat recent_missing_oa.json | jq '[.publisher, .publisher_type]' -c | sort | uniq -c | sort -nr | head -n40
+ 4819 ["MDPI AG","longtail"]
+ 924 ["Informa UK (Taylor & Francis)",null]
+ 665 ["EAG-Publikationen",null]
+ 631 ["American Chemical Society","society"]
+ 449 ["IOP Publishing","society"]
+ 357 ["The Optical Society","society"]
+ 336 ["OpenEdition","oa"]
+ 309 ["CAIRN","repository"]
+ 308 ["Schloss Dagstuhl - Leibniz-Zentrum für Informatik",null]
+ 303 ["Apollo - University of Cambridge Repository",null]
+ 292 ["Springer (Biomed Central Ltd.)",null]
+ 275 ["Purdue University Graduate School",null]
+ 270 ["Suryasa and Sons","longtail"]
+ 257 ["La Trobe",null]
+ 216 ["Frontiers Media SA","longtail"]
+ 193 ["Proceedings of the National Academy of Sciences","society"]
+ 182 ["Informa UK (Taylor & Francis)","longtail"]
+ 176 ["American Physical Society","society"]
+ 168 ["Institution of Electrical Engineers","society"]
+ 166 ["Oxford University Press","unipress"]
+ 153 ["Loughborough University",null]
+
+ chocula mostly seems to set these correctly. is the issue that the chocula
+ computed values aren't coming through or getting updated? probably. both
+ the release (from container) metadata update; and chocula importer not
+ doing updates based on this field; and some old/incorrect values.
+
+ did some cleanups of specific containers, and next chocula update should
+ result in a bunch more `publisher_type` getting populated on older
+ containers
+
+
+TODO: verify URLs are actually URLs... somewhere? in the ingest pipeline
+
+TODO: fatcat: don't ingest figshare "work" DOIs, only the "versioned" ones (?)
+ doi_prefix:10.26181
+
+WIP: sandcrawler: regularly (weekly?) re-try 404 errors (the terminal URL, not the base url?) (or, some kind of delay?)
+ doi_prefix:10.3390 (MDPI)
+ doi_prefix:10.1103
+ doi_prefix:10.1155
+
+DONE: simply re-ingest all:
+ doi_prefix:10.4230
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf query 'doi_prefix:10.4230'
+ # Counter({'ingest_request': 2096, 'elasticsearch_release': 2096, 'estimate': 2096, 'kafka': 2096})
+ container_65lzi3vohrat5nnymk3dqpoycy
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 65lzi3vohrat5nnymk3dqpoycy
+ # Counter({'ingest_request': 187, 'elasticsearch_release': 187, 'estimate': 187, 'kafka': 187})
+ container_5vp2bio65jdc3blx6rfhp3chde
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 5vp2bio65jdc3blx6rfhp3chde
+ # Counter({'ingest_request': 83, 'elasticsearch_release': 83, 'estimate': 83, 'kafka': 83})
+
+DONE: verify and maybe re-ingest all:
+ is_oa:true publisher:"Canadian Science Publishing" in_ia:false
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --allow-non-oa --ingest-type pdf --force-recrawl query 'year:>2010 is_oa:true publisher:"Canadian Science Publishing" in_ia:false !journal:print'
+ # Counter({'ingest_request': 1041, 'elasticsearch_release': 1041, 'estimate': 1041, 'kafka': 1041})
+
+
+## Re-Ingest bad-redirect, max-hops-exceeded, and google drive
+
+Similar to `redirect-loop`:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'bad-redirect'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json';
+ # COPY 100011
+ # after first run: COPY 5611
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'max-hops-exceeded'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json';
+ # COPY 3546
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.hit is false
+ AND ingest_file_result.terminal_url like 'https://docs.google.com/viewer%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json';
+ # COPY 1082
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json
+
+ cat /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ cat /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ cat /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ # DONE
diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md
new file mode 100644
index 0000000..74aeb8d
--- /dev/null
+++ b/notes/ingest/2022-07-19_dblp.md
@@ -0,0 +1,50 @@
+
+Cross-posting from fatcat bulk metadata update/ingest.
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 631k 0:00:11 [54.0k/s]
+
+
+## Post-Crawl Stats
+
+This is after bulk ingest, crawl, and a bit of "live" re-ingest. Query run
+2022-09-06:
+
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'dblp'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+-----------------------+--------
+ pdf | success | 305142
+ pdf | no-pdf-link | 192683
+ pdf | no-capture | 42634
+ pdf | terminal-bad-status | 38041
+ pdf | skip-url-blocklist | 31055
+ pdf | link-loop | 9263
+ pdf | wrong-mimetype | 4545
+ pdf | redirect-loop | 3952
+ pdf | empty-blob | 2705
+ pdf | wayback-content-error | 834
+ pdf | wayback-error | 294
+ pdf | petabox-error | 202
+ pdf | blocked-cookie | 155
+ pdf | cdx-error | 115
+ pdf | body-too-large | 66
+ pdf | bad-redirect | 19
+ pdf | timeout | 7
+ pdf | bad-gzip-encoding | 4
+ (18 rows)
+
+That is quite a lot of `no-pdf-link`, might be worth doing a random sample
+and/or re-ingest. And a chunk of `no-capture` to retry.
diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md
new file mode 100644
index 0000000..7e55633
--- /dev/null
+++ b/notes/ingest/2022-07_doaj.md
@@ -0,0 +1,199 @@
+
+This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for
+heritrix bulk crawling, along with JALC and DBLP URLs.
+
+ export SNAPSHOT=2022-07-20
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
+ # 9.72M 0:36:28 [4.44k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # 9.72M 0:17:04 [9.49k/s]
+ # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})
+
+Stats after this load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3165539
+ pdf | | 2078874
+ html | | 1547698
+ html | wrong-scope | 1114332
+ pdf | no-pdf-link | 517261
+ html | success | 388376
+ html | unknown-scope | 242044
+ pdf | no-capture | 179030
+ pdf | terminal-bad-status | 174741
+ html | no-capture | 155323
+ pdf | null-body | 129267
+ pdf | redirect-loop | 127136
+ html | html-resource-no-capture | 117275
+ html | null-body | 100296
+ pdf | blocked-cookie | 71093
+ html | redirect-loop | 65519
+ html | terminal-bad-status | 64856
+ html | blocked-cookie | 64095
+ html | spn2-backoff | 55173
+ pdf | link-loop | 27440
+ html | wrong-mimetype | 26016
+ html | wayback-content-error | 20109
+ xml | | 13624
+ pdf | wrong-mimetype | 8411
+ xml | success | 6899
+ html | petabox-error | 6199
+ html | wayback-error | 5269
+ html | spn2-cdx-lookup-failure | 4635
+ html | spn2-recent-capture | 4527
+ xml | null-body | 2353
+ (30 rows)
+
+## Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
+ # COPY 3962331
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
+ # 3.96M 0:01:47 [36.7k/s]
+
+Top domains:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 789988 www.mdpi.com
+ 318142 www.frontiersin.org
+ 226316 link.springer.com
+ 204429 www.scielo.br
+ 201175 www.sciencedirect.com
+ 72852 ieeexplore.ieee.org
+ 68983 dx.doi.org
+ 33286 www.dovepress.com
+ 26020 elifesciences.org
+ 23838 www.cetjournal.it
+ 21102 mab-online.nl
+ 20242 www.revistas.usp.br
+ 16564 periodicos.uem.br
+ 15710 journals.openedition.org
+ 14514 dergipark.org.tr
+ 14072 apcz.umk.pl
+ 13924 ojs.minions.amsterdam
+ 13717 bmgn-lchr.nl
+ 13512 ojstest.minions.amsterdam
+ 10440 journals.asm.org
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # Done
+
+## Stats Again
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 4704006
+ html | wrong-scope | 1761227
+ html | success | 778165
+ pdf | no-pdf-link | 759805
+ html | no-capture | 382080
+ html | unknown-scope | 313391
+ html | html-resource-no-capture | 292953
+ pdf | no-capture | 290311
+ pdf | terminal-bad-status | 271776
+ pdf | null-body | 129267
+ pdf | blocked-cookie | 108491
+ html | terminal-bad-status | 103014
+ html | null-body | 100296
+ html | blocked-cookie | 88533
+ pdf | | 81517
+ pdf | skip-url-blocklist | 76443
+ html | spn2-backoff | 50615
+ pdf | link-loop | 45516
+ html | wrong-mimetype | 33525
+ html | wayback-content-error | 25535
+ pdf | empty-blob | 21431
+ pdf | redirect-loop | 19795
+ html | petabox-error | 18291
+ html | empty-blob | 14391
+ pdf | wrong-mimetype | 14084
+ html | redirect-loop | 12856
+ xml | success | 10381
+ xml | no-capture | 10008
+ html | skip-url-blocklist | 3294
+ html | cdx-error | 3275
+ (30 rows)
+
+Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k
+PDFs with no attempt at all? Maybe a filter, or bogus URLs.
+
+Over 1.5M new PDF success over this crawl iteration period, nice.
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md
new file mode 100644
index 0000000..415f23b
--- /dev/null
+++ b/notes/ingest/2022-07_targeted.md
@@ -0,0 +1,140 @@
+
+Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.
+
+ export PATCHDATE=2022-07-29
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
+ => COPY 3524573
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ => 3.11M 0:01:08 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 624948 doi.org
+ 382492 www.jstage.jst.go.jp
+ 275087 www.mdpi.com
+ 157134 www.persee.fr
+ 108979 www.sciencedirect.com
+ 94375 www.scielo.br
+ 50834 onlinelibrary.wiley.com
+ 49991 journals.lww.com
+ 30354 www.frontiersin.org
+ 27963 doaj.org
+ 27058 www.e-periodica.ch
+ 24147 dl.acm.org
+ 23389 aclanthology.org
+ 22086 www.research-collection.ethz.ch
+ 21589 medien.die-bonn.de
+ 18866 www.ingentaconnect.com
+ 18583 doi.nrct.go.th
+ 18271 repositories.lib.utexas.edu
+ 17634 hdl.handle.net
+ 16366 archives.datapages.com
+ 15146 cgscholar.com
+ 13987 dl.gi.de
+ 13188 www.degruyter.com
+ 12503 ethos.bl.uk
+ 12304 preprints.jmir.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+ => done
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+
+## Re-Ingest
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json
+ => 3.52M 0:01:37 [36.2k/s]
+
+Ingest:
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md
new file mode 100644
index 0000000..ac7c68f
--- /dev/null
+++ b/notes/ingest/2022-09_oaipmh.md
@@ -0,0 +1,397 @@
+
+Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921>
+
+I updated the transform script to block some additional domains.
+
+
+## Prep
+
+Fetch the snapshot:
+
+ cd /srv/sandcrawler/tasks/
+ wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst
+
+Transform to ingest requests:
+
+ cd /srv/sandcrawler/src/python
+ git log | head -n1
+ # commit dfd4605d84712eccb95a63e50b0bcb343642b433
+
+ pipenv shell
+ zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz
+ # 16.1M 1:01:02 [4.38k/s]
+
+Curious about types, though this would probably be handled at fatcat ingest
+time:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt
+
+ head oai_type_counts.txt -n30
+ 5623867 info:eu-repo/semantics/article
+ 5334928 info:eu-repo/semantics/publishedVersion
+ 3870359 text
+ 1240225 Text
+ 829169 Article
+ 769849 NonPeerReviewed
+ 665700 PeerReviewed
+ 648740 Peer-reviewed Article
+ 547857 article
+ 482906 info:eu-repo/semantics/bachelorThesis
+ 353814 Thesis
+ 329269 Student thesis
+ 262650 info:eu-repo/semantics/conferenceObject
+ 185354 Journal articles
+ 162021 info:eu-repo/semantics/doctoralThesis
+ 152079 Journal Article
+ 150226 Research Article
+ 130217 Conference papers
+ 127255 Artículo revisado por pares
+ 124243 Newspaper
+ 123908 ##rt.metadata.pkp.peerReviewed##
+ 123309 Photograph
+ 122981 info:eu-repo/semantics/masterThesis
+ 116719 Book
+ 108946 Image
+ 108216 Report
+ 107946 Other
+ 103562 masterThesis
+ 103038 info:eu-repo/semantics/other
+ 101404 StillImage
+ [...]
+
+And formats:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt
+
+ head -n 20 oai_format_counts.txt
+ 11151928 application/pdf
+ 677413 text
+ 561656 text/html
+ 498518 image/jpeg
+ 231219 Text
+ 193638 text/xml
+ 147214 Image
+ 117073 image/jpg
+ 110872 pdf
+ 91323 image/tiff
+ 76948 bib
+ 75393 application/xml
+ 70244 Digitized from 35 mm. microfilm.
+ 68206 mods
+ 59227 PDF
+ 57677 application/epub+zip
+ 57602 application/octet-stream
+ 52072 text/plain
+ 51620 application/msword
+ 47227 audio/mpeg
+
+Also, just overall size (number of records):
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l
+ # 20,840,301
+
+Next, load into sandcrawler DB:
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request -
+
+ Traceback (most recent call last):
+ File "./persist_tool.py", line 311, in <module>
+ main()
+ File "./persist_tool.py", line 307, in main
+ args.func(args)
+ File "./persist_tool.py", line 119, in run_ingest_request
+ pusher.run()
+ File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run
+ self.worker.push_batch(batch)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values
+ cur.execute(b''.join(parts))
+ psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx"
+ DETAIL: Index row references tuple (6893121,3) in relation "ingest_request".
+ HINT: Values larger than 1/3 of a buffer page cannot be indexed.
+ Consider a function index of an MD5 hash of the value, or use full text indexing.
+ 15.7M 0:41:48 [6.27k/s]
+
+Darn, this means we won't get reasonable stats about how many rows were
+inserted/updated.
+
+Patched the persist tool to skip very long URLs, and ran again (backwards, just
+URLs which didn't get inserted already):
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \
+ | tac \
+ | head -n1000000 \
+ | pv -l \
+ | ./persist_tool.py ingest-request -
+ # 1.00M 0:03:04 [5.41k/s]
+ # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0})
+
+Status of just the new lines:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+---------
+ | 6398455
+ success | 540219
+ no-pdf-link | 41316
+ link-loop | 23871
+ no-capture | 11350
+ redirect-loop | 8315
+ wrong-mimetype | 2394
+ terminal-bad-status | 1540
+ null-body | 1038
+ cdx-error | 272
+ empty-blob | 237
+ petabox-error | 213
+ wayback-error | 186
+ blocked-cookie | 107
+ timeout | 47
+ wayback-content-error | 26
+ spn2-cdx-lookup-failure | 21
+ skip-url-blocklist | 16
+ spn2-backoff | 15
+ body-too-large | 13
+ (20 rows)
+
+
+## Bulk Ingest
+
+Should already have filtered domains/prefixes in transform script, so not
+including filters here.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json';
+ # COPY 6398455
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json
+ # 6.40M 0:02:18 [46.2k/s]
+
+ cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # DONE
+
+Expect this ingest to take a week or so.
+
+Then, run stats again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3617175
+ success | 2775036
+ no-pdf-link | 449298
+ link-loop | 74260
+ terminal-bad-status | 47819
+ wrong-mimetype | 20195
+ redirect-loop | 18197
+ empty-blob | 12127
+ cdx-error | 3038
+ skip-url-blocklist | 2630
+ wayback-error | 2599
+ petabox-error | 2354
+ wayback-content-error | 1617
+ blocked-cookie | 1293
+ null-body | 1038
+ body-too-large | 670
+ | 143
+ bad-gzip-encoding | 64
+ timeout | 47
+ spn2-cdx-lookup-failure | 20
+ (20 rows)
+
+
+## Crawl Seedlist
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'terminal-bad-status'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'timeout'
+ OR ingest_file_result.status = 'wayback-content-error'
+ )
+ ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json';
+ => COPY 3692846
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json
+ => 3.69M 0:01:19 [46.6k/s]
+
+This will be used for re-ingest later. For now, extract URLs:
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | jq .base_url -r \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ => 3.66M 0:00:59 [61.8k/s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | rg '"terminal_url"' \
+ | jq -r .result.terminal_url \
+ | rg -v ^null$ \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ => 0.00 0:00:05 [0.00 /s] (empty: the COPY above dumps only ingest_request.*, so rows have no result/terminal_url)
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | awk '{print "F+ " $1}' \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+What domains are we crawling?
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | sort -u -S 4G \
+ | cut -d/ -f3 \
+ | sort \
+ | uniq -c \
+ | sort -nr \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+
+ head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+ 91899 raco.cat
+ 70116 islandora.wrlc.org
+ 68708 urn.kb.se
+ 63726 citeseerx.ist.psu.edu
+ 50370 publications.rwth-aachen.de
+ 44885 urn.nsk.hr
+ 38429 server15795.contentdm.oclc.org
+ 33041 periodicos.ufpb.br
+ 32519 nbn-resolving.org
+ 31990 www.ajol.info
+ 24745 hal.archives-ouvertes.fr
+ 22569 id.nii.ac.jp
+ 17239 tilburguniversity.on.worldcat.org
+ 15873 dspace.nbuv.gov.ua
+ 15436 digitalcommons.wustl.edu
+ 14885 www.iiste.org
+ 14623 www.manchester.ac.uk
+ 14033 nbn-resolving.de
+ 13999 opus4.kobv.de
+ 13689 www.redalyc.org
+
+Sizes:
+
+ wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+
+Copy seedlist to crawler:
+
+ # as regular user
+ scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp
+
+## Post-Crawl Bulk Ingest
+
+ # ran 2022-11-16, after crawl cleanup
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -----------------------+---------
+ success | 4721164 +1,946,128
+ no-pdf-link | 1116290
+ no-capture | 673939
+ terminal-bad-status | 232217
+ link-loop | 148544
+ wrong-mimetype | 68841
+ redirect-loop | 26262
+ empty-blob | 17759
+ cdx-error | 6570
+ blocked-cookie | 4026
+ blocked-wall | 3054
+ skip-url-blocklist | 2924
+ body-too-large | 2404
+ bad-redirect | 1565
+ wayback-error | 1320
+ petabox-error | 1083
+ null-body | 1038
+ wayback-content-error | 264
+ bad-gzip-encoding | 150
+ | 143
+ (20 rows)
+
diff --git a/notes/ingest/NEXT.md b/notes/ingest/NEXT.md
new file mode 100644
index 0000000..8cdd6df
--- /dev/null
+++ b/notes/ingest/NEXT.md
@@ -0,0 +1,52 @@
+
+biorxiv
+medrxiv
+ doi:10.1101\/20*
+
+persee.fr 147k
+ publisher:persee in_ia:false is_oa:true
+ https://www.persee.fr/doc/pumus_1164-5385_1992_num_2_1_1013
+
+cairn.info: 161k
+ doi_prefix:10.3917 in_ia:false is_oa:true
+ https://www.cairn.info/revue-afrique-contemporaine-2011-3-page-161.htm
+ https://www.cairn.info/revue-cahiers-de-psychologie-clinique-2014-1-page-209.htm
+
+IOP OA: 169k
+ doi_prefix:10.1088 is_oa:true in_ia:false
+
+indian journals platform? 124k
+ doi_prefix:10.4103 in_ia:false is_oa:true
+ http://www.urologyannals.com/article.asp?issn=0974-7796;year=2011;volume=3;issue=3;spage=138;epage=140;aulast=Ahmad
+ http://www.neurologyindia.com/article.asp?issn=0028-3886;year=2011;volume=59;issue=4;spage=612;epage=615;aulast=Utsuki
+
+openedition? 48k
+ doi_prefix:10.4000 is_oa:true in_ia:false
+
+german medical science (GMS) 28k
+ doi_prefix:10.3205 in_ia:false is_oa:true
+ https://www.egms.de/static/en/journals/zma/2015-32/zma000965.shtml
+
+serbian chemistry 28k
+ doi_prefix:10.2298 in_ia:false is_oa:true
+ http://www.doiserbia.nb.rs/Article.aspx?ID=0352-51391000105H
+
+jalc oa doi: 82k
+ doi_registrar:jalc in_ia:false is_oa:true
+
+sage OA papers
+ https://journals.sagepub.com/doi/10.1177/034003529802400510
+
+Scientific Reports: 25k
+ in_ia:false container_id:"tnqhc2x2aneavcd3gx5h7mswhm"
+
+U Toronto press: 23k
+ publisher:"Toronto Press" in_ia:false is_oa:true
+ has an annoying bounce page
+
+ASHA (speech-language-hearing association): 7k
+ publisher:Speech-Language-Hearing in_ia:false is_oa:true
+
+MIT press journals
+
+
diff --git a/notes/ingest_domains.txt b/notes/ingest_domains.txt
new file mode 100644
index 0000000..ae06272
--- /dev/null
+++ b/notes/ingest_domains.txt
@@ -0,0 +1,294 @@
+
+## Queries to find broken domains
+
+Top domains with failed ingests:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+Status overview for a particular domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT domain, terminal_status_code, COUNT((domain, terminal_status_code))
+ FROM (SELECT terminal_status_code, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ AND t1.terminal_status_code is not null
+ GROUP BY domain, terminal_status_code
+ ORDER BY COUNT DESC;
+
+Sample recent failures:
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%osapublishing.org%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+
+## Failing
+
+www.osapublishing.org
+
+ this publisher (The Optical Society) is systemically using a CAPTCHA to
+ gate access to PDFs. bummer! could ask them to white-list?
+
+ has citation_pdf_url, so that isn't an issue
+
+ status: "no-pdf-link"
+ hops:
+ "https://doi.org/10.1364/optica.6.000798",
+ "https://www.osapublishing.org/viewmedia.cfm?uri=optica-6-6-798&seq=0"
+ "https://www.osapublishing.org/captcha/?guid=830CEAB5-09BD-6140-EABD-751200C78B1C"
+
+ domain | status | count
+ -----------------------+---------------------+-------
+ www.osapublishing.org | no-capture | 16680
+ www.osapublishing.org | no-pdf-link | 373
+ www.osapublishing.org | redirect-loop | 19
+ www.osapublishing.org | terminal-bad-status | 5
+ www.osapublishing.org | cdx-error | 1
+ www.osapublishing.org | wrong-mimetype | 1
+ www.osapublishing.org | spn-error | 1
+ www.osapublishing.org | success | 1
+ www.osapublishing.org | wayback-error | 1
+ (9 rows)
+
+www.persee.fr
+
+ Seems to be mostly blocking or rate-limiting?
+
+ domain | status | count
+ ---------------+-------------------------------------+-------
+ www.persee.fr | no-capture | 37862
+ www.persee.fr | terminal-bad-status | 3134
+ www.persee.fr | gateway-timeout | 2828
+ www.persee.fr | no-pdf-link | 431
+ www.persee.fr | spn-error | 75
+ www.persee.fr | redirect-loop | 23
+ www.persee.fr | success | 8
+ www.persee.fr | spn2-error | 2
+ www.persee.fr | spn2-error:soft-time-limit-exceeded | 1
+ www.persee.fr | wrong-mimetype | 1
+ (10 rows)
+
+journals.openedition.org
+
+ PDF access is via "freemium" subscription. Get redirects to:
+
+ https://auth.openedition.org/authorized_ip?url=http%3A%2F%2Fjournals.openedition.org%2Fnuevomundo%2Fpdf%2F61053
+
+ Content is technically open access (HTML and license; for all content?),
+ but can't be crawled as PDF without subscription.
+
+ domain | status | count
+ --------------------------+-------------------------+-------
+ journals.openedition.org | redirect-loop | 29587
+ journals.openedition.org | success | 6821
+ journals.openedition.org | no-pdf-link | 1507
+ journals.openedition.org | no-capture | 412
+ journals.openedition.org | wayback-error | 32
+ journals.openedition.org | wrong-mimetype | 27
+ journals.openedition.org | terminal-bad-status | 13
+ journals.openedition.org | spn2-cdx-lookup-failure | 4
+ journals.openedition.org | spn-remote-error | 1
+ journals.openedition.org | null-body | 1
+ journals.openedition.org | cdx-error | 1
+ (11 rows)
+
+journals.lww.com
+
+ no-pdf-link
+
+ domain | status | count
+ ------------------+----------------+-------
+ journals.lww.com | no-pdf-link | 11668
+ journals.lww.com | wrong-mimetype | 131
+ (2 rows)
+
+ doi prefix: 10.1097
+
+ <meta name="wkhealth_pdf_url" content="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" />
+ data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
+
+ Some weird thing going on, maybe they are blocking-via-redirect based on
+ our User-Agent? Seems like wget works, so funny that they don't block that.
+
+musewide.aip.de
+
+ no-pdf-link
+
+koreascience.or.kr | no-pdf-link | 8867
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%osapublishing.org%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+www.cairn.info | link-loop | 8717
+
+easy.dans.knaw.nl | no-pdf-link | 8262
+scielo.conicyt.cl | no-pdf-link | 7925
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'scielo.conicyt.cl'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%scielo.conicyt.cl%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+
+ domain | status | count
+ -------------------+---------------------+-------
+ scielo.conicyt.cl | no-pdf-link | 7926
+ scielo.conicyt.cl | success | 4972
+ scielo.conicyt.cl | terminal-bad-status | 1474
+ scielo.conicyt.cl | wrong-mimetype | 6
+ scielo.conicyt.cl | no-capture | 4
+ scielo.conicyt.cl | null-body | 1
+
+
+ pdf | https://doi.org/10.4067/s0370-41061980000300002 | 2020-02-22 23:55:56.235822+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0370-41061980000300002&lng=en&nrm=iso&tlng=en | 20200212201727 | 200 |
+ pdf | https://doi.org/10.4067/s0718-221x2019005000201 | 2020-02-22 23:01:49.070104+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-221X2019005000201&lng=en&nrm=iso&tlng=en | 20200214105308 | 200 |
+ pdf | https://doi.org/10.4067/s0717-75262011000200002 | 2020-02-22 22:49:36.429717+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-75262011000200002&lng=en&nrm=iso&tlng=en | 20200211205804 | 200 |
+ pdf | https://doi.org/10.4067/s0717-95022006000400029 | 2020-02-22 22:33:07.761766+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-95022006000400029&lng=en&nrm=iso&tlng=en | 20200209044048 | 200 |
+
+ These seem, on retry, like success? Maybe previous was a matter of warc/revisit not getting handled correctly?
+
+ pdf | https://doi.org/10.4067/s0250-71611998007100009 | 2020-02-22 23:57:16.481703+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0250-71611998007100009&lng=en&nrm=iso&tlng=en | 20200212122939 | 200 |
+ pdf | https://doi.org/10.4067/s0716-27902005020300006 | 2020-02-22 23:56:01.247616+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0716-27902005020300006&lng=en&nrm=iso&tlng=en | 20200214192151 | 200 |
+ pdf | https://doi.org/10.4067/s0718-23762005000100015 | 2020-02-22 23:53:55.81526+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-23762005000100015&lng=en&nrm=iso&tlng=en | 20200214173237 | 200 |
+
+ Look like web/xml only.
+
+ TODO: XML ingest (and replay?) support. These are as "<article>", not sure if that is JATS or what.
+
+www.kci.go.kr | no-pdf-link | 6842
+www.m-hikari.com | no-pdf-link | 6763
+cshprotocols.cshlp.org | no-pdf-link | 6553
+www.bibliotekevirtual.org | no-pdf-link | 6309
+data.hpc.imperial.ac.uk | no-pdf-link | 6071
+projecteuclid.org | link-loop | 5970
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'projecteuclid.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%projecteuclid.org%'
+ AND status = 'link-loop'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ -------------------+-------------------------+-------
+ projecteuclid.org | link-loop | 5985
+ projecteuclid.org | success | 26
+ projecteuclid.org | wayback-error | 26
+ projecteuclid.org | wrong-mimetype | 17
+ projecteuclid.org | spn2-cdx-lookup-failure | 4
+ projecteuclid.org | other-mimetype | 4
+ projecteuclid.org | no-capture | 3
+ projecteuclid.org | terminal-bad-status | 2
+ projecteuclid.org | spn2-error:job-failed | 1
+ projecteuclid.org | spn-remote-error | 1
+ (10 rows)
+
+ Doing a cookie check and redirect.
+
+ TODO: brozzler behavior to "click the link" instead?
+
+www.scielo.br | no-pdf-link | 5823
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'www.scielo.br'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%www.scielo.br%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ ---------------+-------------------------+-------
+ www.scielo.br | success | 35150
+ www.scielo.br | no-pdf-link | 5839
+ www.scielo.br | terminal-bad-status | 429
+ www.scielo.br | no-capture | 189
+ www.scielo.br | wrong-mimetype | 7
+ www.scielo.br | spn2-cdx-lookup-failure | 2
+ (6 rows)
+
+ Seems to just be the subset with no PDFs.
+
+get.iedadata.org | no-pdf-link | 5822
+www.pdcnet.org | no-pdf-link | 5798
+publications.rwth-aachen.de | no-pdf-link | 5323
+www.sciencedomain.org | no-pdf-link | 5231
+medicalforum.ch | terminal-bad-status | 4574
+jrnl.nau.edu.ua | link-loop | 4145
+ojs.academypublisher.com | no-pdf-link | 4017
+
+## MAG bulk ingest
+
+- dialnet.unirioja.es | redirect-loop | 240967
+ dialnet.unirioja.es | terminal-bad-status | 20320
+ => may be worth re-crawling via heritrix?
+- agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639
+ => and other *.onlinelibrary.wiley.com
+- www.researchgate.net | redirect-loop | 42859
+- www.redalyc.org:9081 | no-pdf-link | 10515
+- www.repository.naturalis.nl | redirect-loop | 8213
+- bjp.rcpsych.org | link-loop | 8045
+- journals.tubitak.gov.tr | wrong-mimetype | 7159
+- www.erudit.org | redirect-loop | 6819
+- papers.ssrn.com | redirect-loop | 27328
+ => blocking is pretty aggressive, using cookies or referrer or something.
+ maybe a brozzler behavior would work, but doesn't currently
+
+## Out of Scope
+
+Datasets only?
+
+- plutof.ut.ee
+- www.gbif.org
+- doi.pangaea.de
+- www.plate-archive.org
+
+Historical non-paper content:
+
+- dhz.uni-passau.de (newspapers)
+- digital.ucd.ie (irish historical)
+
+Mostly datasets (some PDF content):
+
+- *.figshare.com
+- zenodo.org
+- data.mendeley.com
diff --git a/notes/possible_ingest_targets.txt b/notes/possible_ingest_targets.txt
new file mode 100644
index 0000000..fcdc3e4
--- /dev/null
+++ b/notes/possible_ingest_targets.txt
@@ -0,0 +1,15 @@
+
+- all releases from small journals, regardless of OA status, if small (eg, less than 200 papers published), and not big5
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes
diff --git a/notes/tasks/2020-01-27_cleanup_cdx.md b/notes/tasks/2020-01-27_cleanup_cdx.md
new file mode 100644
index 0000000..54db92e
--- /dev/null
+++ b/notes/tasks/2020-01-27_cleanup_cdx.md
@@ -0,0 +1,34 @@
+
+Accidentally seem to have backfilled many CDX lines with non-PDF content.
+Should clear these out!
+
+Something like:
+
+ mimetype = 'text/html'
+ not in file_meta
+
+Or maybe instead:
+
+ mimetype = 'text/html'
+ not in file_meta
+
+SQL:
+
+ SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5;
+ SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01';
+ => 24841846
+
+ SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5;
+ SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL;
+ => 24547552
+
+ DELETE FROM cdx
+ WHERE sha1hex IN
+ (SELECT cdx.sha1hex
+ FROM cdx
+ LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex
+ WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL);
+ => DELETE 24553428
+
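+Slightly more... probably should have had an "AND cdx.mimetype = 'text/html'" in
+the DELETE WHERE clause (the outer DELETE matches on sha1hex alone, so any cdx
+rows sharing a digest with a selected row were also removed).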
diff --git a/notes/tasks/2020-02-14_pdftrio.md b/notes/tasks/2020-02-14_pdftrio.md
new file mode 100644
index 0000000..e6f8d8e
--- /dev/null
+++ b/notes/tasks/2020-02-14_pdftrio.md
@@ -0,0 +1,162 @@
+
+First end-to-end `pdf_trio` results!
+
+## Source
+
+Will use AIT partner #1830 (U Alberta) CDX as input. These are unique by
+digest, about 100k.
+
+ ArchiveIt-Collection-1830.download.cdx
+
+## Testing/Prep
+
+Versions/setup:
+
+ sandcrawler: f613f69a40fcc9a445f21cadd35d7c36c8061db8
+ => patched to 'auto' mode
+
+ pdf_trio: 03bd3fdc15418462b2b1582e4f967f26ddcb43e2
+
+ pdftrio: 'auto' mode
+
+ uwsgi: 16x processes
+
+ sudo docker run --rm -p 8501:8501 -e TF_XLA_FLAGS=--tf_xla_cpu_global_jit -e KMP_AFFINITY=granularity=fine,compact,1,0 -e KMP_BLOCKTIME=0 -e OMP_NUM_THREADS=24 -e TENSORFLOW_INTER_OP_PARALLELISM=1 -e TENSORFLOW_INTRA_OP_PARALLELISM=24 -v /srv/pdftrio//models/bert_models:/models/bert_model -v /srv/pdftrio//models/pdf_image_classifier_model:/models/image_model -v /srv/pdftrio//config/tfserving_models_docker.config:/models/tfserving_models.config -v /srv/pdftrio/config/tfserving_batch.config:/models/tfserving_batch.config --name pdftrio-tfserving tensorflow/serving --model_config_file=/models/tfserving_models.config --enable_batching=true --batching_parameters_file=/models/tfserving_batch.config
+
+Basic testing:
+
+ head -n100 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j20 --pipe --linebuffer ./pdftrio_tool.py --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx - | jq .
+
+ head -n100 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j20 --pipe --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+ => Running in kafka output mode, publishing to sandcrawler-qa.pdftrio-output
+
+
+On the persist side:
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-qa.pdftrio-output | head | jq .
+ => looks fine
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org --env qa persist-pdftrio
+ => Consuming from kafka topic sandcrawler-qa.pdftrio-output, group persist-pdftrio
+
+Ah, don't forget, start persist before writing to topic! Or would need to reset
+offsets to start.
+
+Seems to be only a single pdftotext instance running? Very low CPU.
+
+ head -n500 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j40 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+That is much better! CPU still not pegged, so maybe could do 50x processes? Lots of I/O wait. Blech.
+
+Zero ("0") not getting persisted for any columns (fixed in sandcrawler/db.py)
+
+`models_date` not getting set. Added `PDFTRIO_MODELS_DATE="2020-01-01"` to env. (TODO: ansible)
+
+## Prod Run
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org --env prod persist-pdftrio
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j40 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+Worker CPU basically blocked on pdftotext, multiple 100% CPU. Presumably I/O
+wait? Though not totally sure.
+
+htop:
+
+ PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
+ 17951 pdftrio 20 0 51756 12868 5856 R 90.1 0.0 0:06.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 17870 pdftrio 20 0 52004 12964 5684 R 87.4 0.0 0:08.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 13735 root 20 0 10.4G 3815M 4144 S 79.6 7.6 48h02:37 tensorflow_model_server --port=8500 --rest_api_port=850
+ 14522 pdftrio 20 0 2817M 1331M 16896 R 43.1 2.6 0:57.75 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 18027 pdftrio 20 0 49192 10692 6116 R 39.8 0.0 0:00.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 14518 pdftrio 20 0 2818M 1336M 16836 S 33.3 2.7 0:47.46 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14504 pdftrio 20 0 2731M 1310M 13164 D 32.6 2.6 0:34.81 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14526 pdftrio 20 0 2816M 1333M 16832 R 28.7 2.7 0:57.22 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14500 pdftrio 20 0 2729M 1306M 13160 R 20.9 2.6 0:22.57 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14492 pdftrio 20 0 2729M 1307M 13156 S 17.6 2.6 0:17.91 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14508 pdftrio 20 0 2734M 1312M 14380 D 14.4 2.6 0:38.75 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14496 pdftrio 20 0 2728M 1300M 13160 S 13.7 2.6 0:18.00 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 17314 sandcrawl 20 0 56668 18228 4304 D 13.7 0.0 0:02.31 perl /usr/bin/parallel -j40 -N1 --pipe --round-robin --
+ 14472 pdftrio 20 0 2725M 1283M 13136 S 12.4 2.6 0:05.69 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14513 pdftrio 20 0 2730M 1309M 14300 S 11.1 2.6 0:40.32 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14480 pdftrio 20 0 2725M 1291M 13144 S 10.4 2.6 0:08.77 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14488 pdftrio 20 0 2725M 1294M 13152 S 9.8 2.6 0:08.18 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14468 pdftrio 20 0 2717M 1271M 13088 S 6.5 2.5 0:02.42 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 17411 sandcrawl 20 0 556M 53840 14936 S 6.5 0.1 0:01.57 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 14530 pdftrio 20 0 2524M 1252M 3492 S 4.6 2.5 0:12.72 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 7311 bnewbold 20 0 27716 5520 3128 R 3.9 0.0 0:41.59 htop
+ 17444 sandcrawl 20 0 552M 50456 14892 S 3.9 0.1 0:01.54 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 18042 pdftrio 20 0 46068 6588 5328 R 3.3 0.0 0:00.05 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 18043 pdftrio 20 0 4 4 0 R 2.6 0.0 0:00.04
+ 2203 grobid 20 0 6334M 126M 4188 S 0.7 0.3 3h27:32 /usr/lib/jvm/java-8-openjdk-amd64/bin/java -XX:MaxMetas
+ 17419 sandcrawl 20 0 619M 116M 15248 S 0.7 0.2 0:02.68 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17440 sandcrawl 20 0 578M 76948 15160 S 0.7 0.1 0:01.54 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 13848 root 20 0 0 0 0 D 0.7 0.0 0:00.69 kworker/u60:1
+ 17443 sandcrawl 20 0 578M 76500 14912 S 0.7 0.1 0:01.74 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17414 sandcrawl 20 0 580M 77720 15036 S 0.0 0.2 0:01.77 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17432 sandcrawl 20 0 563M 61460 14976 S 0.0 0.1 0:01.59 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17442 sandcrawl 20 0 561M 53096 15240 S 0.0 0.1 0:01.47 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17433 sandcrawl 20 0 559M 57160 15176 S 0.0 0.1 0:01.52 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17431 sandcrawl 20 0 554M 50960 14892 S 0.0 0.1 0:01.37 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17413 sandcrawl 20 0 554M 52376 14920 S 0.0 0.1 0:01.57 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+
+dstat:
+
+ ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
+ usr sys idl wai hiq siq| read writ| recv send| in out | int csw
+ 32 9 22 37 0 0| 0 37M| 20M 12M| 0 0 | 35k 64k
+ 20 6 24 50 0 0| 0 20M| 30M 5662k| 0 0 | 27k 48k
+ 27 7 24 43 0 0| 0 26M|8712k 6289k| 0 0 | 21k 114k
+ 30 8 23 38 0 0|4096B 61M| 17M 20M| 0 0 | 31k 54k
+ 33 6 17 44 0 0| 0 32M| 14M 6384k| 0 0 | 27k 46k
+ 25 6 24 44 0 0| 0 19M| 18M 13M| 0 0 | 27k 179k
+ 40 6 19 35 0 0|8192B 25M|7855k 6661k| 0 0 | 31k 85k
+ 59 8 12 20 0 0| 0 39M|4177k 33M| 0 0 | 34k 64k
+ 34 4 17 44 0 0| 0 16M|7527k 11M| 0 0 | 22k 45k
+ 44 7 17 32 0 0| 0 30M| 20M 291k| 0 0 | 36k 62k
+
+Create tmpfs:
+
+ sudo mkdir -p /pdftrio-ramdisk
+ #sudo mount -t tmpfs -o size=2g tmpfs /pdftrio-ramdisk
+ sudo mount -t tmpfs -o size=6g tmpfs /pdftrio-ramdisk
+
+add to pdftrio config env and restart:
+
+ TEMP=/run/pdf_trio
+
+Seems to have worked. Pretty much maxed CPU, may need to back-off parallelism. Doing more than 31/sec.
+
+Errors were not getting encoded correctly:
+
+ File "/fast/sandcrawler/python/sandcrawler/persist.py", line 331, in push_batch
+ r['pdf_trio']['key'] = r['key']
+ KeyError: 'pdf_trio'
+
+Fixed in sandcrawler worker, and patched persist to work around this.
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j30 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+Wow, 30x parallelism waaaay less?
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j30 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+What changed? Confused. Load average was like 40.
+
+Via kafka, as much as 69.71/sec! Errors?
+
+Hrm, this whole `auto` thing. I am very skeptical. Should also do a run as `all`, -j20.
+
+ Worker: Counter({'total': 1916, 'pushed': 1916})
+ CDX lines pushed: Counter({'total': 1934, 'pushed': 1916, 'skip-parse': 18})
+
+Hit some bugs, causing failure, but still seem to have processed a good chunk.
+
+Switched to `all`, running a different batch:
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1914.download.cdx | parallel -j20 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+After flag change, another batch in `all`:
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-2566.download.cdx | parallel -j20 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
diff --git a/notes/tasks/2020-07-22_processing_holes.md b/notes/tasks/2020-07-22_processing_holes.md
new file mode 100644
index 0000000..70e2b59
--- /dev/null
+++ b/notes/tasks/2020-07-22_processing_holes.md
@@ -0,0 +1,120 @@
+
+Want to clean up missing/partial processing (GROBID, `pdf_meta`, `file_meta`)
+in sandcrawler database.
+
+
+## `pdf_meta` for petabox rows
+
+Ran `dump_unextracted_pdf_petabox.sql` SQL, which resulted in a .json file.
+
+ wc -l dump_unextracted_pdf_petabox.2020-07-22.json
+ 1503086 dump_unextracted_pdf_petabox.2020-07-22.json
+
+Great, 1.5 million, not too many. Start small:
+
+ head -n1000 dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Full batch:
+
+ cat dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_unextracted_pdf_petabox.2020-08-19.json
+ 971194 dump_unextracted_pdf_petabox.2020-08-19.json
+
+## `pdf_meta` missing CDX rows
+
+First, the GROBID-ized rows, but only those which have a fatcat file as well.
+
+10,755,365! That is a lot still to process.
+
+ cat dump_unextracted_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_unextracted_pdf.fatcat.2020-08-19.json
+ 65517 dump_unextracted_pdf.fatcat.2020-08-19.json
+
+Enqueued!
+
+## `GROBID` missing petabox rows
+
+ wc -l /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json
+ 972221 /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json
+
+Start small:
+
+ head -n1000 dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Full batch:
+
+ cat dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_ungrobided_pdf_petabox.2020-08-19.json
+ 933 dump_ungrobided_pdf_petabox.2020-08-19.json
+
+Enqueued!
+
+## `GROBID` for missing CDX rows in fatcat
+
+ wc -l dump_ungrobided_pdf.fatcat.2020-07-22.json
+ 1808580 dump_ungrobided_pdf.fatcat.2020-07-22.json
+
+Full batch:
+
+ cat dump_ungrobided_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+## `GROBID` for bad status
+
+Eg, wayback errors.
+
+TODO
+
+## `pdf_trio` for OA journal crawls
+
+TODO
+
+## `pdf_trio` for "included by heuristic", not in fatcat
+
+TODO
+
+## Live-ingest missing arxiv papers
+
+ ./fatcat_ingest.py --allow-non-oa --limit 10000 query arxiv_id:* > /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json
+ => Expecting 1505184 release objects in search queries
+
+ cat /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 22
+
+Repeating this every few days should (?) result in all the backlog of arxiv
+papers getting indexed. Could focus on recent years to start (with query
+filter).
+
+## re-ingest spn2 errors (all time)
+
+Eg:
+
+ spn2-cdx-lookup-failure: 143963
+ spn-error: 101773
+ spn2-error: 16342
+
+TODO
+
+## re-try CDX errors
+
+Eg, for unpaywall only, bulk ingest all `cdx-error`.
+
+TODO
+
+## live ingest unpaywall `no-capture` URLs
+
+After re-trying the CDX errors for unpaywall URLs (see above), count all the
+no-capture URLs, and if reasonable recrawl them all in live mode ("reasonable"
+meaning fewer than 200k or so URLs).
+
+Could also force recrawl (not using CDX lookups) for some publisher platforms
+if that made sense.
+
+TODO
diff --git a/notes/tasks/2020-08-20_file_meta.md b/notes/tasks/2020-08-20_file_meta.md
new file mode 100644
index 0000000..39c84dd
--- /dev/null
+++ b/notes/tasks/2020-08-20_file_meta.md
@@ -0,0 +1,66 @@
+
+Want to update fatcat file entities with "full" file metadata for those which are missing it.
+
+How many `file_meta` rows *still* don't have metadata?
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+ => 62962
+
+First generate list of sha1hex from most recent bulk export which are missing
+at least some metadata (based on missing sha256):
+
+ zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv
+ => 18.7M 0:05:46 [53.8k/s]
+
+Then dump the entire sandcrawler `file_meta` table as TSV, with first column
+sha1hex and second column JSON with all the file metadata fields:
+
+ COPY (
+ SELECT sha1hex, row_to_json(file_meta)
+ FROM file_meta
+ WHERE sha256hex IS NOT NULL
+ ORDER BY sha1hex ASC
+ )
+ TO '/grande/snapshots/file_meta_dump.tsv'
+ WITH NULL '';
+
+Join/cut:
+
+ export LC_ALL=C
+ join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json
+ => 18.1M 0:03:37 [83.2k/s]
+
+Check counts:
+
+ cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l
+ => 18135313
+
+ zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr
+ 18103860 application/pdf
+ 29977 application/octet-stream
+ 876 text/html
+ 199 application/postscript
+ 171 application/gzip
+ 84 text/plain
+ 48 application/xml
+ 38 application/vnd.ms-powerpoint
+ 16 application/msword
+ 8 application/vnd.openxmlformats-officedocument.wordprocessingml.document
+ 6 image/jpeg
+ 4 message/rfc822
+ 4 application/zip
+ 4 application/vnd.openxmlformats-officedocument.presentationml.presentation
+ 3 text/x-tex
+ 3 application/x-dosexec
+ 2 application/x-tar
+ 2 application/vnd.ms-tnef
+ 1 video/mpeg
+ 1 image/tiff
+ 1 image/svg+xml
+ 1 image/png
+ 1 image/gif
+ 1 audio/x-ape
+ 1 application/vnd.ms-office
+ 1 application/CDFV2-unknown
+
+TODO: fatcat importer
diff --git a/notes/tasks/2020-10-21_pdfextract_holes.md b/notes/tasks/2020-10-21_pdfextract_holes.md
new file mode 100644
index 0000000..c0bb65e
--- /dev/null
+++ b/notes/tasks/2020-10-21_pdfextract_holes.md
@@ -0,0 +1,74 @@
+
+Realized I had not enabled persisting of PDF extraction results (thumbnail,
+text) in the ingest worker when it was added over the summer. So now need to
+run a catch-up. This applied to both "live" and "bulk" ingest.
+
+## `cdx` / `ingest` / `grobid` catch-up
+
+First, re-run extraction for cases where we did an ingest, and grobid ran
+successfully, and we have a CDX row, but no `pdf_meta`:
+
+ -- this is a slow query
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
+ WITH NULL '';
+ => 19,676,116
+
+Wow, that is a lot. Many from recent OAI-PMH and OA crawls, presumably.
+
+ cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And again, after a couple partitions got hung up:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json'
+ WITH NULL '';
+
+
+ cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+ => 562k 0:00:16 [34.6k/s]
+
+## `petabox` / `grobid` catch-up
+
+These didn't all seem to extract correctly before: after 1.5m rows, there were
+still 900k unprocessed. Trying again.
+
+ COPY (
+ SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
+ FROM grobid
+ LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE petabox.sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json'
+ WITH NULL '';
+
+ cat /grande/snapshots/dump_unextracted_pdf_petabox.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+## `cdx` / `grobid` catch-up
+
+Next will be to process PDFs with GROBID and CDX but no ingest.
+
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
new file mode 100644
index 0000000..cd8176e
--- /dev/null
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -0,0 +1,70 @@
+
+Want to dump a URL list to share with partners, filtered to content we think is
+likely to be scholarly.
+
+Columns to include:
+
+- original URL
+- capture timestamp
+- SHA1
+
+## Stats Overview
+
+file_meta table, mimetype=application/pdf: 173,816,433
+
+cdx table, mimetype=application/pdf: 131,346,703
+
+ingest_file_result table, pdf, success: 66,487,928
+
+## Ingested PDF URLs
+
+"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also?
+
+ COPY (
+ SELECT
+ base_url as start_url,
+ terminal_url as pdf_url,
+ terminal_dt as pdf_url_timestamp,
+ terminal_sha1hex as pdf_sha1hex
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv'
+ WITH NULL '';
+ => 77,892,849
+
+## CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+ COPY (
+ SELECT
+ cdx.url as pdf_url,
+ cdx.datetime as pdf_url_timestamp,
+ cdx.sha1hex as pdf_sha1hex
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv'
+ WITH NULL '';
+ => 147,837,935
+
+## Processed web PDFs
+
+"Parsed web PDFs": `file_meta`, left join CDX
+
+(didn't do this one)
+
+---
+
+Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09>
diff --git a/notes/tasks/2021-10-29_crossref_refs_backfill.md b/notes/tasks/2021-10-29_crossref_refs_backfill.md
new file mode 100644
index 0000000..94eefec
--- /dev/null
+++ b/notes/tasks/2021-10-29_crossref_refs_backfill.md
@@ -0,0 +1,235 @@
+
+The current sandcrawler-db crossref table was backfilled from a 2021-01
+snapshot, and has not been updated since.
+
+Would like to use the existing fatcat Kafka feed to keep the crossref table up
+to date, and also backfill in GROBID reference parsing of all `unstructured`
+references.
+
+Current plan is:
+
+1. use kafkacat CLI to dump crossref Kafka topic, from the beginning of 2021 up
+ to some recent date
+2. use `persist_tool.py`, with a large batch size (200?) to backfill this dump
+ into sandcrawler-db. this will update some rows multiple times (if there
+ have been updates)
+3. dump the full crossref table, as a point-in-time snapshot
+4. filter to crossref records that have `unstructured` references in them (at
+ all)
+5. use `grobid_tool.py` with `parallel` to batch process references
+6. backfill these refs using a simple SQL COPY statement
+7. deploy crossref persist worker, with ref updates on, and roll the consumer
+ group back to date of dump
+8. wait for everything to catch up
+
+
+## Commands
+
+Get a timestamp in milliseconds:
+
+ 2021-01-01 is:
+ 1609488000 in unix time (seconds)
+ 1609488000000 in milliseconds
+
+Hrm, oldest messages seem to actually be from 2021-04-28T19:21:10Z though. Due
+to topic compaction? Yup, we have a 180 day compaction policy on that topic,
+probably from when kafka space was tight. Oh well!
+
+Updated retention for this topic to `46656000000` (~540 days, ~18 months) using
+`kafka-manager` web app.
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t fatcat-prod.api-crossref -o s@1609488000000 \
+ | pv -l \
+ | gzip \
+ > crossref_feed_start20210428_end20211029.json.gz
+
+This resulted in ~36 million rows, 46GB.
+
+`scp` that around, then run persist on `sandcrawler-db`:
+
+ # in pipenv, as sandcrawler user
+ # manually edited to set batch size to 200
+ zcat /srv/sandcrawler/tasks/crossref_feed_start20210428_end20211029.json.gz \
+ | pv -l \
+ | ./persist_tool.py crossref -
+ => 36.8M 11:02:43 [ 925 /s]
+
+With a single thread, the persist process runs at about 1,000 rows/sec, which
+works out to about 10 hours for 36 million rows.
+
+At the start of this process, total PostgreSQL database size is 832.21G. At the
+end, 902.51G. Have not run a `VACUUM FULL` or anything like that.
+
+Query to dump crossref rows which have any refs and compress output with pigz:
+
+ # dump_crossref.sql
+ COPY (
+ SELECT record
+ FROM crossref
+ WHERE record::jsonb @? '$.reference[*].unstructured'
+ -- LIMIT 5
+ )
+ TO STDOUT
+ WITH NULL '';
+
+ # 'sed' required because of double quote escaping in postgresql output::
+ # https://stackoverflow.com/questions/29869983/postgres-row-to-json-produces-invalid-json-with-double-escaped-quotes/29871069
+ # 'rg' filter is just being conservative
+
+ # XXX: next time add to the pipeline: rg -v "\\\\"
+ # or, find some way to filter/transform this kind of SQL export better?
+ psql sandcrawler < dump_crossref.sql \
+ | sed 's/\\"/\"/g' \
+ | rg '^\{' \
+ | pv -l \
+ | pigz \
+ > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz
+ => 26.1M 3:22:51 [2.15k/s]
+
+ # NOTE: -j40 is for production run with ~dedicated GROBID server with many cores
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz \
+ | rg -v "\\\\" \
+ | parallel -j35 --linebuffer --round-robin --pipe ./grobid_tool.py --grobid-host http://wbgrp-svc096.us.archive.org:8070 parse-crossref-refs - \
+ | pv -l \
+ | pigz \
+ > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz
+
+ # from earlier testing with -j40: able to do about 300-500 records/second
+ # 23.9k 0:01:14 [ 320 /s]
+ # 134518 total refs parsed
+ # ~1817 refs/second parsed
+
+ # with errors, got through about: 2.08M 1:38:20 [ 352 /s]
+ # was still seeing bad JSON?
+ # JSON lines pushed: Counter({'total': 105898, 'pushed': 105886, 'error-json-decode': 12})
+
+ # finally, without errors:
+ # 18.6M 8:35:02 [ 603 /s]
+
+In the next step, going to need a small direct persist worker to copy lines
+verbatim into just the `grobid_refs` table.
+
+## Errors
+
+Got errors when running for real:
+
+ xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 114, column 33
+
+ requests.exceptions.HTTPError: 500 Server Error: Internal Server Error for url: http://wbgrp-svc096.us.archive.org:8070/api/processCitationList
+
+ urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='wbgrp-svc096.us.archive.org', port=8070): Max retries exceeded with url: /api/processCitationList (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f54b0a3bd00>: Failed to establish a new connection: [Errno 99] Cannot assign requested address'))
+
+
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ERROR [2021-11-03 06:57:32,569] org.grobid.service.process.GrobidRestProcessString: An unexpected exception occurs.
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! java.lang.NullPointerException: null
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.data.BiblioItem.cleanTitles(BiblioItem.java:1784)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingLayoutTokenMultiple(CitationParser.java:175)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingStringMultiple(CitationParser.java:92)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.Engine.processRawReferences(Engine.java:168)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.process.GrobidRestProcessString.processCitationList(GrobidRestProcessString.java:316)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.GrobidRestService.processCitationListReturnXml_post(GrobidRestService.java:581)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.GeneratedMethodAccessor19.invoke(Unknown Source)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at java.lang.reflect.Method.invoke(Method.java:498)
+ [...]
+
+Bogus example reference causing 500 error (among other non-error citations) (doi:10.5817/cz.muni.m210-9541-2019):
+
+ 'Müller, R., Šidák, P. (2012). Slovník novější literární teorie. Praha: Academia.'
+ '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0'
+ 'Šotkovská, J. (2008). Rané divadelní hry Milana Uhdeho; diplomová práce. Brno: Masarykova univerzita.',
+
+s.strip() in python would remove these non-breaking spaces (update: implemented this later)
+
+ Maheswari, S., Vijayalakshmi, C.: Optimization Model for Electricity Distribution System Control using Communication System by La-grangian Relaxation Technique. CiiT International Journal of Wireless Communication 3(3), 183–187 (2011) (Print: ISSN 0974 – 9756 & Online: ISSN 0974 – 9640)
+
+Also:
+
+ truncating very large reference list for doi:10.1017/chol9780521264303.033 len:2281
+ truncating very large reference list for doi:10.1017/chol9780521263351.011 len:3129
+ truncating very large reference list for doi:10.1017/chol9780521263351.022 len:2968
+ truncating very large reference list for doi:10.1017/chol9780521264303.036 len:2221
+ truncating very large reference list for doi:10.1017/chol9780521264303.007 len:2238
+ truncating very large reference list for doi:10.1017/chol9780521086912.001 len:2177
+ truncating very large reference list for doi:10.1017/chol9780521228046.002 len:2133
+ truncating very large reference list for doi:10.1017/chol9780521264303.035 len:2221
+ truncating very large reference list for doi:10.1017/chol9780521264303.002 len:2279
+
+Seems like bumping to 2500 as the maximum reference list size might be
+reasonable (it is 2000 currently).
+
+After some refactoring, still getting:
+
+ requests.exceptions.ConnectionError
+
+This is because I am doing POST without a session.
+
+Then, still got requests.exceptions.ReadTimeout
+
+Finally, got through the whole batch, (`18.6M 8:35:02 [ 603 /s]` output), with
+only a few dozen rows like:
+
+ GROBID returned bad XML for Crossref DOI: 10.1007/978-3-030-03008-7_21-1
+ GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1496-8_3
+ GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1493-7_3
+ GROBID returned bad XML for Crossref DOI: 10.1007/978-3-319-96184-2_2
+ GROBID returned bad XML for Crossref DOI: 10.1063/1.5031970
+ truncating very large reference list for doi:10.1007/978-1-4757-1499-9_15 len:11401
+ GROBID returned bad XML for Crossref DOI: 10.1016/j.oraloncology.2019.104562
+ GROBID returned bad XML for Crossref DOI: 10.1016/j.pec.2020.04.010
+
+So things seem to be working!
+
+Summary lines looked like:
+
+ JSON lines pushed: Counter({'total': 531487, 'pushed': 531487})
+ Worker: Counter({'total': 536541, 'failed': 3})
+
+Failures per batch were on the order of 0 to 3.
+
+## Postgres Backfill
+
+Start with a sample:
+
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \
+ | head -n1000 \
+ | ./persist_tool.py grobid-refs -
+ # Worker: Counter({'total': 1000, 'insert-grobid_refs': 1000, 'update-grobid_refs': 0})
+
+ # same command again:
+ # Worker: Counter({'total': 1000, 'update-grobid_refs': 1000, 'insert-grobid_refs': 0})
+
+Example DOIs:
+
+ # no refs
+ 10.1007/978-1-349-04135-0_3
+ http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-04135-0_3"
+
+ # with refs
+ 10.1007/978-1-349-03594-6_2
+ http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-03594-6_2"
+
+Seems to be working, so will do the full backfill. Can check table sizes on a
+per-table basis when complete.
+
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \
+ | pv -l \
+ | ./persist_tool.py grobid-refs -
+ # Worker: Counter({'total': 18646668, 'insert-grobid_refs': 18639195, 'update-grobid_refs': 7473})
+
+
+## Kafka Setup
+
+Added ansible config and deployed persist-crossref worker.
+
+First roll-back just a couple days as a test:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-11-07T00:00:00.000
+
+ # eg: Import counts: Counter({'total': 372350, 'insert-grobid_refs': 326987, 'update-crossref': 265581, 'insert-crossref': 106769, 'update-grobid_refs': 45362, 'skip': 1})
+
+Then roll-back to before the snapshot and backfill, to catch up:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-10-26T00:00:00.000
+
+Ran this last command on 2021-11-10, and total lag was around 2,566,741.
diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md
new file mode 100644
index 0000000..5fb69d1
--- /dev/null
+++ b/notes/tasks/2021-12-06_regrobid.md
@@ -0,0 +1,380 @@
+
+Want to test recent updates of GROBID (to fix regex issue), and also re-process
+a number of PDFs which failed to process with GROBID initially.
+
+
+## HTTP 503
+
+These are attempts which failed because GROBID was too busy or not running.
+
+ # IMPROVED BELOW
+ COPY (
+ SELECT row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.status_code = 503
+ AND cdx.sha1hex IS NOT NULL
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+ WITH NULL '';
+ # COPY 4749
+
+Not actually that many, which seems good. Confirm that these are uniq by sha1hex:
+
+ cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l
+ # 302
+
+Nope! Need to add "distinct on":
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.status_code = 503
+ AND cdx.sha1hex IS NOT NULL
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+ WITH NULL '';
+ # COPY 4297
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+## Never Processed CDX
+
+PDFs in fatcat which have never been processed with GROBID.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM fatcat_file
+ LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+ LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json'
+ WITH NULL '';
+ # COPY 15488
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+
+PDFs in fatcat which have never been processed with pdfextract.
+
+ # TODO
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM fatcat_file
+ LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+ LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+ WHERE
+ pdf_meta.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.mimetype = 'application/pdf'
+ AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json'
+ WITH NULL '';
+ # COPY 45535
+
+ cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+ # 45.5k 0:00:01 [30.2k/s]
+
+## Timeout or Failure
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+ WHERE
+ (grobid.status_code = 500 OR grobid.status_code = -4)
+ AND cdx.sha1hex IS NOT NULL
+ AND file_meta.mimetype = 'application/pdf'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json'
+ WITH NULL '';
+ # COPY 8,084,296
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+This seems to not be working very well, mostly errors, empty docs, etc. Will
+roll-forward the kafka consumer group after attempting a couple hundred
+thousand of these.
+
+Let's try limiting to files actually in fatcat:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ (grobid.status_code = 500 OR grobid.status_code = -4)
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+ AND file_meta.mimetype = 'application/pdf'
+ -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json'
+ WITH NULL '';
+ # COPY 529265
+
+That is a much more manageable batch to retry.
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 529k 0:00:17 [31.0k/s]
+
+
+## Missing Fatcat Files
+
+There were around a half million fatcat file entities which didn't have `cdx`
+rows in sandcrawler. Did some specific pdfextract processing; now we should do
+GROBID ingest as well.
+
+Enqueue the `CDX` objects for GROBID and pdfextract processing:
+
+ zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 354k 0:00:11 [30.6k/s]
+
+ zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And some earlier files of interest on `aitio`:
+
+ cat files_missing_sha256.ingest_results.json \
+ | rg '"application/pdf"' \
+ | rg -v "\\\\" \
+ | jq .cdx -c \
+ | sort -u -S 4G \
+ | pv -l \
+ > files_missing_sha256.cdx.uniq.json
+ # 100k 0:00:47 [2.09k/s]
+
+ cat files_missing_sha256.cdx.uniq.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ cat files_missing_sha256.cdx.uniq.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+
+## Ancient Fatcat Files
+
+Files from an era where we didn't record GROBID version or status, even for
+success.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status_code = 200
+ AND grobid.status IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+ -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json'
+ WITH NULL '';
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 107k 0:00:03 [29.9k/s]
+
+
+## Start Re-Processing Old GROBID Versions
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status = 'success'
+ AND grobid.grobid_version NOT LIKE '0.7.%'
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+ -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json'
+ WITH NULL '';
+
+This one is huge, and want to process in batches/chunks of ~8 million at a time.
+
+ cd /srv/sandcrawler/tasks/
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \
+ | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json
+
+Submit individual batches like:
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Overall progress:
+
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small)
+
+This finally finished on 2022-04-26. Hooray!
+
+## General Counts
+
+How many fatcat files of what mimetype (reported in sandcrawler-db)?
+
+ SELECT file_meta.mimetype, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY file_meta.mimetype
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+----------
+ application/pdf | 45227033
+ | 433068
+ application/octet-stream | 30634
+ application/jats+xml | 6874
+ text/html | 876
+ application/postscript | 199
+ application/gzip | 173
+ text/plain | 84
+ application/xml | 48
+ application/vnd.ms-powerpoint | 38
+ application/msword | 16
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 8
+ image/jpeg | 6
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 4
+ message/rfc822 | 4
+ application/zip | 4
+ text/x-tex | 3
+ application/x-dosexec | 3
+ application/x-tar | 2
+ application/vnd.ms-tnef | 2
+ image/svg+xml | 1
+ image/tiff | 1
+ image/png | 1
+ image/gif | 1
+ application/vnd.ms-office | 1
+ (25 rows)
+
+
+PDF extract status?
+
+ SELECT pdf_meta.status, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY pdf_meta.status
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ status | count
+ ----------------+----------
+ success | 43415920
+ | 2018522
+ text-too-large | 122730
+ parse-error | 94876
+ not-pdf | 32156
+ error-wayback | 14504
+ bad-unicode | 279
+ bad-pdf | 98
+ empty-blob | 2
+ (9 rows)
+
+
+What are the GROBID status codes for fatcat files? Narrowed down:
+
+ SELECT grobid.status, grobid.status_code, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY grobid.status, grobid.status_code
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 44409069
+ error | 500 | 580402
+ | | 468836
+ | 200 | 240660
+ error-timeout | -4 | 79
+ bad-grobid-xml | 200 | 38
+ error | 200 | 3
+ (7 rows)
+
+Ran the same query again on 2021-12-15:
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 45092915
+ error | 500 | 302373
+ | | 250335
+ | 200 | 53352
+ bad-grobid-xml | 200 | 39
+ error-timeout | -4 | 37
+ error | 200 | 34
+ error | 503 | 2
+ (8 rows)
diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
new file mode 100644
index 0000000..b5422c2
--- /dev/null
+++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
@@ -0,0 +1,23 @@
+
+Martin crawled more than 10 million new PDFs from various platform domains. We
+should get these processed and included in sandcrawler-db.
+
+## Select CDX Rows
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM cdx
+ LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json'
+ WITH NULL '';
+ => COPY 8801527
+
+ cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ # for pdfextract, would be: sandcrawler-prod.unextracted
diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
new file mode 100644
index 0000000..c727a57
--- /dev/null
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -0,0 +1,225 @@
+
+Want to do priority crawling of Ukrainian web content, plus Russia and Belarus.
+
+
+## What is Missing?
+
+ (country_code:ua OR lang:uk)
+ => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA
+ later in day, already some 22k missing found! wow
+ => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing
+
+## Metadata Prep
+
+- container metadata update (no code changes)
+ x wikidata SPARQL update
+ x chocula run
+ x journal metadata update (fatcat)
+ x update journal stats (fatcat extra)
+- DOAJ article metadata import
+ x prep and upload single JSON file
+
+
+## Journal Homepage URL Crawl
+
+x dump ukraine-related journal homepages from chocula DB
+x create crawl config
+x start crawl
+x repeat for belarus and russia
+
+
+ python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv
+
+sqlite3:
+
+ select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publisher like '%ukrain%';
+ 1952
+
+ SELECT COUNT(*) FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+ => 1970
+
+ .mode csv
+ .once homepage_urls_ukraine.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+
+ .mode csv
+ .once homepage_urls_russia.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ru'
+ OR journal.lang = 'ru'
+ OR journal.name like '%russ%'
+ OR journal.publisher like '%russ%';
+
+ .mode csv
+ .once homepage_urls_belarus.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'by'
+ OR journal.lang = 'be'
+ OR journal.name like '%belarus%'
+ OR journal.publisher like '%belarus%';
+
+ cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+ 1971 homepage_urls_ukraine.tsv
+ 3482 homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv
+ 3728 homepage_urls_russia.tsv
+ 2420 homepage_urls.2022-03-08.ru_tld.tsv
+ 6030 homepage_urls_russia_combined.2022-03-08.tsv
+
+
+ cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv
+ 138 homepage_urls_belarus.tsv
+ 85 homepage_urls.2022-03-08.by_tld.tsv
+ 222 homepage_urls_belarus_combined.2022-03-08.tsv
+
+
+## Landing Page Crawl
+
+x create crawl config
+x fatcat ingest query for related URLs
+ => special request code/label?
+x finish .by and .ru article URL dump, start crawling
+x URL list filtered from new OAI-PMH feed
+ => do we need to do full bulk load/dump, or not?
+- URL list from partner (google)
+- do we need to do alternative thing of iterating over containers, ingesting each?
+
+ ./fatcat_ingest.py --env prod \
+ --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk"
+
+ # around Tue 08 Mar 2022 01:07:37 PM PST
+ # Expecting 185659 release objects in search queries
+ # didn't complete successfully? hrm
+
+ # ok, retry "manually" (with kafkacat)
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz
+ # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318})
+ # 103k 0:25:04 [68.7 /s]
+
+ zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz
+ # 103k 0:00:02 [38.1k/s]
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:by OR lang:be" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz
+ # Expecting 2266 release objects in search queries
+ # 1.29k 0:00:34 [37.5 /s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ru OR lang:ru" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz
+ # Expecting 1515246 release objects in search queries
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz
+
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt
+ # 309k 0:00:03 [81.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt
+ # 71.2k 0:00:03 [19.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
+ # 276k 0:00:03 [72.9k/s]
+
+
+### Landing Page Bulk Ingest
+
+Running these 2022-03-24, after targeted crawl completed:
+
+ zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 103k 0:00:02 [36.1k/s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 1.29k 0:00:00 [15.8k/s]
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 546k 0:00:13 [40.6k/s]
+
+It will probably take a week or more for these to complete.
+
+
+## Outreach
+
+- openalex
+- sucho.org
+- ceeol.com
diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md
new file mode 100644
index 0000000..273ff32
--- /dev/null
+++ b/notes/tasks/2022-04-27_pdf_url_lists.md
@@ -0,0 +1,72 @@
+
+Another dump of PDF URLs for partners. This time want to provide TSV with full
+wayback download URLs, as well as "access" URLs.
+
+ export TASKDATE=2022-04-27
+
+## "Ingested", AKA, "Targeted" PDF URLs
+
+These are URLs where we did a successful ingest run.
+
+ COPY (
+ SELECT
+ terminal_sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
+ ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ AND hit = true
+ ORDER BY terminal_sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 85712674
+
+May contain duplicates by sha1hex, by URL, or by both.
+
+Note that this could be filtered by timestamp, to make it monthly/annual.
+
+
+## All CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require a file_meta match
+
+ COPY (
+ SELECT
+ cdx.sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url,
+ ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ ORDER BY cdx.sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 161504070
+
+Should be unique by wayback URL; may contain near-duplicates or duplicates by
+sha1hex.
+
+## Upload to archive.org
+
+TODO: next time compress these files first (gzip/pigz)
+
+ia upload ia_scholarly_urls_$TASKDATE \
+ -m collection:ia_biblio_metadata \
+ -m title:"IA Scholarly URLs ($TASKDATE)" \
+ -m date:$TASKDATE \
+ -m creator:"Internet Archive Web Group" \
+ -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \
+ /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
+
diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
new file mode 100644
index 0000000..74d3857
--- /dev/null
+++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
@@ -0,0 +1,132 @@
+
+Had a huge number of SPN requests for the andrzejklimczuk.com domain,
+presumably from the author.
+
+Many were duplicates (same file, multiple releases, often things like zenodo
+duplication). Many were also GROBID 500s, due to truncated common crawl
+captures.
+
+Needed to clean up! Basically sorted through a few editgroups manually, then
+rejected all the rest and manually re-submitted with the below queries and
+commands:
+
+ SELECT COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%';
+ => 589
+
+ SELECT ingest_file_result.status, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY ingest_file_result.status;
+
+ status | count
+ ----------------+-------
+ cdx-error | 1
+ success | 587
+ wrong-mimetype | 1
+ (3 rows)
+
+
+ SELECT grobid.status_code, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY grobid.status_code;
+
+ status_code | count
+ -------------+-------
+ 200 | 385
+ 500 | 202
+ | 2
+ (3 rows)
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 500
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
+ => COPY 202
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 200
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
+ => COPY 385
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+ | jq '. + {force_recrawl: true}' -c \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
+ | shuf \
+ | head -n60000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+ | shuf \
+ | head -n100 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+ | shuf \
+ | head -n10000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
+ | shuf \
+ | head -n60000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1