From 5dd8785d710cf7d067afdc691069bfa74406e06a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 May 2020 14:47:17 -0700 Subject: ingests: normalize file names; commit updates --- notes/ingest/2019-10-23_testing.md | 8 + notes/ingest/20191023_testing.md | 8 - notes/ingest/2020-01-14_bulk.md | 26 ++ notes/ingest/2020-02-14_unpaywall_ingest.md | 624 ---------------------------- notes/ingest/2020-02_unpaywall.md | 624 ++++++++++++++++++++++++++++ notes/ingest/2020-03-04_mag.md | 576 ------------------------- notes/ingest/2020-03-oa_but_not_marked.md | 25 ++ notes/ingest/2020-03_mag.md | 576 +++++++++++++++++++++++++ notes/ingest/2020-03_s2.md | 35 ++ notes/ingest/2020-03_s2_ingest.md | 35 -- notes/ingest/2020-04-07_datacite.md | 121 ------ notes/ingest/2020-04-07_unpaywall.md | 63 --- notes/ingest/2020-04_datacite.md | 121 ++++++ notes/ingest/2020-04_unpaywall.md | 129 ++++++ notes/ingest/2020-05_oai_pmh.md | 125 ++++++ notes/ingest/20200114_bulk_ingests.md | 26 -- 16 files changed, 1669 insertions(+), 1453 deletions(-) create mode 100644 notes/ingest/2019-10-23_testing.md delete mode 100644 notes/ingest/20191023_testing.md create mode 100644 notes/ingest/2020-01-14_bulk.md delete mode 100644 notes/ingest/2020-02-14_unpaywall_ingest.md create mode 100644 notes/ingest/2020-02_unpaywall.md delete mode 100644 notes/ingest/2020-03-04_mag.md create mode 100644 notes/ingest/2020-03-oa_but_not_marked.md create mode 100644 notes/ingest/2020-03_mag.md create mode 100644 notes/ingest/2020-03_s2.md delete mode 100644 notes/ingest/2020-03_s2_ingest.md delete mode 100644 notes/ingest/2020-04-07_datacite.md delete mode 100644 notes/ingest/2020-04-07_unpaywall.md create mode 100644 notes/ingest/2020-04_datacite.md create mode 100644 notes/ingest/2020-04_unpaywall.md create mode 100644 notes/ingest/2020-05_oai_pmh.md delete mode 100644 notes/ingest/20200114_bulk_ingests.md (limited to 'notes') diff --git a/notes/ingest/2019-10-23_testing.md b/notes/ingest/2019-10-23_testing.md new file mode 100644 index 0000000..481c4e2 --- /dev/null +++ b/notes/ingest/2019-10-23_testing.md @@ -0,0 +1,8 @@ + +exported not-archived DOIs for elife, as well as general list. + + wc -l recent\ missing\ oa\ releases.csv + 161828 recent missing oa releases.csv + + wc -l missing\ elife\ DOIs.csv + 1779 missing elife DOIs.csv diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/20191023_testing.md deleted file mode 100644 index 481c4e2..0000000 --- a/notes/ingest/20191023_testing.md +++ /dev/null @@ -1,8 +0,0 @@ - -exported not-archived DOIs for elife, as well as general list. 
- - wc -l recent\ missing\ oa\ releases.csv - 161828 recent missing oa releases.csv - - wc -l missing\ elife\ DOIs.csv - 1779 missing elife DOIs.csv diff --git a/notes/ingest/2020-01-14_bulk.md b/notes/ingest/2020-01-14_bulk.md new file mode 100644 index 0000000..9d05cda --- /dev/null +++ b/notes/ingest/2020-01-14_bulk.md @@ -0,0 +1,26 @@ + +Generate ingest requests from arabesque: + + zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json + + zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json + + +Quick tests locally: + + time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json + time head -n100 /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json + +These are all wayback success; looking good! Single threaded, from home laptop +(over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even +with 30x parallelism. Should re-test on actual server. GROBID pre-check should +help? + +With new bulk topic: + + head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Ok, let them rip: + + cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02-14_unpaywall_ingest.md deleted file mode 100644 index e18a2ff..0000000 --- a/notes/ingest/2020-02-14_unpaywall_ingest.md +++ /dev/null @@ -1,624 +0,0 @@ - -## Stats and Things - - zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt - -## Transform - - zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null - => 22M 1:31:25 [ 4k/s] - -Shard it into batches of roughly 1 million (all are 1098096 +/- 1): - - zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json - -Test ingest: - - head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Add a single batch like: - - cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Progress/Status - -There are 21,961,928 lines total, in batches of 1,098,097. 
- - unpaywall_snapshot_2019-11-22.ingest_request.split_00.json - => 2020-02-24 21:05 local: 1,097,523 ~22 results/sec (combined) - => 2020-02-25 10:35 local: 0 - unpaywall_snapshot_2019-11-22.ingest_request.split_01.json - unpaywall_snapshot_2019-11-22.ingest_request.split_02.json - unpaywall_snapshot_2019-11-22.ingest_request.split_03.json - unpaywall_snapshot_2019-11-22.ingest_request.split_04.json - => 2020-02-25 11:26 local: 4,388,997 - => 2020-02-25 10:14 local: 1,115,821 - => 2020-02-26 16:00 local: 265,116 - unpaywall_snapshot_2019-11-22.ingest_request.split_05.json - unpaywall_snapshot_2019-11-22.ingest_request.split_06.json - unpaywall_snapshot_2019-11-22.ingest_request.split_07.json - unpaywall_snapshot_2019-11-22.ingest_request.split_08.json - unpaywall_snapshot_2019-11-22.ingest_request.split_09.json - => 2020-02-26 16:01 local: 6,843,708 - => 2020-02-26 16:31 local: 4,839,618 - => 2020-02-28 10:30 local: 2,619,319 - unpaywall_snapshot_2019-11-22.ingest_request.split_10.json - unpaywall_snapshot_2019-11-22.ingest_request.split_11.json - unpaywall_snapshot_2019-11-22.ingest_request.split_12.json - unpaywall_snapshot_2019-11-22.ingest_request.split_13.json - unpaywall_snapshot_2019-11-22.ingest_request.split_14.json - unpaywall_snapshot_2019-11-22.ingest_request.split_15.json - unpaywall_snapshot_2019-11-22.ingest_request.split_16.json - unpaywall_snapshot_2019-11-22.ingest_request.split_17.json - unpaywall_snapshot_2019-11-22.ingest_request.split_18.json - unpaywall_snapshot_2019-11-22.ingest_request.split_19.json - => 2020-02-28 10:50 local: 13,551,887 - => 2020-03-01 23:38 local: 4,521,076 - => 2020-03-02 10:45 local: 2,827,071 - => 2020-03-02 21:06 local: 1,257,176 - added about 500k bulk re-ingest to try and work around cdx errors - => 2020-03-02 21:30 local: 1,733,654 - -## Investigate Failures - -Guessing than some domains are ultimately going to need direct "recrawl" via -SPNv2. 
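Before slicing failures by domain, individual failing rows can be spot-checked through the regular (non-bulk) ingest path, which should fall back to live SPNv2 fetching. A minimal sketch, assuming the same `./ingest_file.py requests -` JSON-lines interface used for the arabesque batches, with a placeholder URL standing in for a real failing row, and assuming the tool emits one JSON result object per request (hence the `jq .status`):

    echo '{"ingest_type": "pdf", "link_source": "unpaywall", "base_url": "https://journals.sagepub.com/doi/pdf/10.1177/EXAMPLE"}' \
        | ./ingest_file.py requests - | jq .status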
- - -- top domain failures for unpaywall GWB history ingest - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - AND t1.status != 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - -----------------------------------+---------------------+-------- - watermark.silverchair.com | terminal-bad-status | 258432 - www.tandfonline.com | no-pdf-link | 203873 - journals.sagepub.com | no-pdf-link | 126317 - iopscience.iop.org | terminal-bad-status | 112526 - files-journal-api.frontiersin.org | terminal-bad-status | 112499 - pubs.acs.org | no-pdf-link | 94772 - www.degruyter.com | redirect-loop | 89801 - www.ahajournals.org | no-pdf-link | 84025 - society.kisti.re.kr | no-pdf-link | 72849 - www.nature.com | redirect-loop | 53575 - babel.hathitrust.org | terminal-bad-status | 41063 - www.ncbi.nlm.nih.gov | redirect-loop | 40363 - scialert.net | no-pdf-link | 38340 - www.degruyter.com | terminal-bad-status | 34913 - www.journal.csj.jp | no-pdf-link | 30881 - espace.library.uq.edu.au | redirect-loop | 24570 - www.jci.org | redirect-loop | 24409 - aip.scitation.org | wrong-mimetype | 22144 - www.vr-elibrary.de | no-pdf-link | 17436 - www.biorxiv.org | wrong-mimetype | 15524 - ajph.aphapublications.org | no-pdf-link | 15083 - zookeys.pensoft.net | redirect-loop | 14867 - dialnet.unirioja.es | redirect-loop | 14486 - asa.scitation.org | wrong-mimetype | 14261 - www.nrcresearchpress.com | no-pdf-link | 14254 - dl.acm.org | redirect-loop | 14223 - osf.io | redirect-loop | 14103 - www.oecd-ilibrary.org | redirect-loop | 12835 - journals.sagepub.com | redirect-loop | 12229 - iopscience.iop.org | redirect-loop | 11825 - (30 rows) - - -- top no-capture terminal domains - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status = 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - => very few from any domain, interesting. 
Guess many of these are URLs that have truely never been crawled - - -- top no-capture base domains - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status = 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - ------------------------------+------------+-------- - academic.oup.com | no-capture | 429888 - www.nature.com | no-capture | 273825 - dergipark.org.tr | no-capture | 119847 - www.biodiversitylibrary.org | no-capture | 110220 - escholarship.org | no-capture | 106307 - onlinelibrary.wiley.com | no-capture | 89771 - journals.sagepub.com | no-capture | 79297 - www.cell.com | no-capture | 64242 - deepblue.lib.umich.edu | no-capture | 58080 - babel.hathitrust.org | no-capture | 52286 - hal.archives-ouvertes.fr | no-capture | 48549 - iopscience.iop.org | no-capture | 42591 - dash.harvard.edu | no-capture | 40767 - www.tandfonline.com | no-capture | 40638 - discovery.ucl.ac.uk | no-capture | 40633 - www.jstage.jst.go.jp | no-capture | 39780 - www.doiserbia.nb.rs | no-capture | 39261 - dspace.mit.edu | no-capture | 37703 - zookeys.pensoft.net | no-capture | 34562 - repositorio.unesp.br | no-capture | 34437 - ashpublications.org | no-capture | 34112 - www.cambridge.org | no-capture | 33959 - kclpure.kcl.ac.uk | no-capture | 31455 - society.kisti.re.kr | no-capture | 30427 - pure.mpg.de | no-capture | 27650 - download.atlantis-press.com | no-capture | 27253 - dialnet.unirioja.es | no-capture | 26886 - link.springer.com | no-capture | 26257 - www.valueinhealthjournal.com | no-capture | 24798 - dspace.library.uu.nl | no-capture | 23234 - (30 rows) - - -- top no-capture base domains - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status = 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - ------------------------------+------------+-------- - academic.oup.com | no-capture | 429888 - www.nature.com | no-capture | 273825 - dergipark.org.tr | no-capture | 119847 - www.biodiversitylibrary.org | no-capture | 110220 - escholarship.org | no-capture | 106307 - onlinelibrary.wiley.com | no-capture | 89771 - journals.sagepub.com | no-capture | 79297 - www.cell.com | no-capture | 64242 - deepblue.lib.umich.edu | no-capture | 58080 - babel.hathitrust.org | no-capture | 52286 - hal.archives-ouvertes.fr | no-capture | 48549 - iopscience.iop.org | no-capture | 42591 - dash.harvard.edu | no-capture | 40767 - www.tandfonline.com | no-capture | 40638 - discovery.ucl.ac.uk | no-capture | 40633 - www.jstage.jst.go.jp | no-capture | 39780 - www.doiserbia.nb.rs | no-capture | 39261 - 
dspace.mit.edu | no-capture | 37703 - zookeys.pensoft.net | no-capture | 34562 - repositorio.unesp.br | no-capture | 34437 - ashpublications.org | no-capture | 34112 - www.cambridge.org | no-capture | 33959 - kclpure.kcl.ac.uk | no-capture | 31455 - society.kisti.re.kr | no-capture | 30427 - pure.mpg.de | no-capture | 27650 - download.atlantis-press.com | no-capture | 27253 - dialnet.unirioja.es | no-capture | 26886 - link.springer.com | no-capture | 26257 - www.valueinhealthjournal.com | no-capture | 24798 - dspace.library.uu.nl | no-capture | 23234 - (30 rows) - - -- how many ingest requests not crawled at all? - SELECT count(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status IS NULL; - => 0 - - -- "cookie absent" terminal pages, by domain - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - AND t1.status != 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - --------------------------------+----------------+-------- - journals.sagepub.com | no-pdf-link | 126295 - www.tandfonline.com | no-pdf-link | 116690 - pubs.acs.org | no-pdf-link | 94619 - www.ahajournals.org | no-pdf-link | 84016 - www.journal.csj.jp | no-pdf-link | 30881 - aip.scitation.org | wrong-mimetype | 22143 - www.vr-elibrary.de | no-pdf-link | 17436 - ajph.aphapublications.org | no-pdf-link | 15080 - asa.scitation.org | wrong-mimetype | 14261 - www.nrcresearchpress.com | no-pdf-link | 14253 - journals.ametsoc.org | no-pdf-link | 10500 - www.journals.uchicago.edu | no-pdf-link | 6917 - www.icevirtuallibrary.com | no-pdf-link | 6484 - www.journals.uchicago.edu | wrong-mimetype | 6191 - www.healthaffairs.org | no-pdf-link | 5732 - pubsonline.informs.org | no-pdf-link | 5672 - pinnacle-secure.allenpress.com | no-pdf-link | 5013 - www.worldscientific.com | no-pdf-link | 4560 - www.ajronline.org | wrong-mimetype | 4523 - ehp.niehs.nih.gov | no-pdf-link | 4514 - www.future-science.com | no-pdf-link | 4091 - pubs.acs.org | wrong-mimetype | 4015 - aip.scitation.org | no-pdf-link | 3916 - www.futuremedicine.com | no-pdf-link | 3821 - asa.scitation.org | no-pdf-link | 3644 - www.liebertpub.com | no-pdf-link | 3345 - physicstoday.scitation.org | no-pdf-link | 3005 - pubs.cif-ifc.org | no-pdf-link | 2761 - epubs.siam.org | wrong-mimetype | 2583 - www.ajronline.org | no-pdf-link | 2563 - (30 rows) - - -- "cookie absent" terminal pages, by domain - SELECT count(*) - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND 
ingest_file_result.terminal_url LIKE '%/cookieAbsent'; - - => 654885 - - -- NOT "cookie absent" terminal page failures, total count - SELECT count(*) - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'; - - => 1403837 - -Looks like these domains are almost all "cookieAbsent" blocking: -- journals.sagepub.com -- pubs.acs.org -- ahajournals.org -- www.journal.csj.jp -- aip.scitation.org - -Grab some individual URLs to test: - - SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' - ORDER BY updated DESC - LIMIT 25; - -NOT cookieAbsent testing with regular ingest tool: -- iopscience.iop.org, terminal-bad-status, SPNv2 fetch, success -- academic.oup.com => silverchair, terminal-bad-status, SPNv2 fetch, succes -- osf.io success - - SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' - ORDER BY updated DESC - LIMIT 25; - -cookieAbsent testing with regular ingest tool: -- www.tandfonline.com failure (no-pdf-link via wayback), but force-recrawl works - -The main distinguisher is status. terminal-bad-status can be ingested (live) -successfully, while no-pdf-link, redirect-loop, etc need to be re-crawled. - -## Heritrix Plan - -Generate following ingest request batches: - -- no-capture status from unpaywall -- all other failures except /cookieAbsent -- /cookieAbsent failures - -Plan will be to crawl no-capture first (to completion), then try the other -non-/cookieAbsent failures. /cookieAbsent means we'll need to use SPNv2. - -Because there are so few "no-capture on second hop" cases, will not enqueue -both terminal urls and base urls, only base urls. 
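For the Heritrix side, a rough sketch of pulling a seed list out of one of the request dumps (assuming the JSON exports generated in the next section, which carry a `base_url` field; the seeds filename here is just illustrative):

    cat unpaywall_nocapture_20200304.json | jq -r .base_url | sort -u > unpaywall_nocapture_20200304.seeds.txt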
- -Should definitely skip/filter: - -- www.ncbi.nlm.nih.gov - -## Ingest Request Export - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status = 'no-capture' - ) TO '/grande/snapshots/unpaywall_nocapture_20200304.rows.json'; - => 4,855,142 - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' - ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json'; - => 1,403,837 - - ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200304.rows.json > unpaywall_nocapture_20200304.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json > unpaywall_fail_nocookie_20200304.json - -Note: will probably end up re-running the below after crawling+ingesting the above: - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.status = 'terminal-bad-status' - AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' - ) TO '/grande/snapshots/unpaywall_fail_cookie_badstatus_20200304.rows.json'; - => 0 - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.status != 'terminal-bad-status' - AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' - ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json'; - => 654,885 - -## Batch Ingest - -Test small batch: - - head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Full batch: - - cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - - # there was a broken line in there, so... - # parse error: Expected separator between values at line 1367873, column 175 - # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null - tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Note that the crawl is not entirely complete and not all CDX seem to have been -loaded, so may need to iterate. About 10% are still "no capture". 
May want or -need to additionally crawl the terminal URLs, not the base URLs. - -## Post-ingest stats - -Overall status: - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - status | count - -------------------------+---------- - success | 17354494 - no-pdf-link | 1471076 - no-capture | 1135992 - redirect-loop | 837842 - terminal-bad-status | 803081 - cdx-error | 219746 - wrong-mimetype | 100723 - link-loop | 16013 - wayback-error | 12448 - null-body | 9444 - redirects-exceeded | 600 - petabox-error | 411 - bad-redirect | 17 - bad-gzip-encoding | 4 - spn2-cdx-lookup-failure | 3 - gateway-timeout | 1 - spn2-error:job-failed | 1 - spn2-error | 1 - (18 rows) - -Failures by domain: - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - AND t1.status != 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - -----------------------------------+---------------------+-------- - academic.oup.com | no-pdf-link | 330211 - watermark.silverchair.com | terminal-bad-status | 324599 - www.tandfonline.com | no-pdf-link | 242724 - journals.sagepub.com | no-pdf-link | 202050 - iopscience.iop.org | terminal-bad-status | 144063 - files-journal-api.frontiersin.org | terminal-bad-status | 121719 - pubs.acs.org | no-pdf-link | 104535 - www.ahajournals.org | no-pdf-link | 102653 - society.kisti.re.kr | no-pdf-link | 101787 - www.degruyter.com | redirect-loop | 95130 - www.nature.com | redirect-loop | 87534 - onlinelibrary.wiley.com | no-pdf-link | 84432 - www.cell.com | redirect-loop | 61496 - www.degruyter.com | terminal-bad-status | 42919 - babel.hathitrust.org | terminal-bad-status | 41813 - www.ncbi.nlm.nih.gov | redirect-loop | 40488 - scialert.net | no-pdf-link | 38341 - ashpublications.org | no-pdf-link | 34889 - dialnet.unirioja.es | terminal-bad-status | 32076 - www.journal.csj.jp | no-pdf-link | 30881 - pure.mpg.de | redirect-loop | 26163 - www.jci.org | redirect-loop | 24701 - espace.library.uq.edu.au | redirect-loop | 24591 - www.valueinhealthjournal.com | redirect-loop | 23740 - www.vr-elibrary.de | no-pdf-link | 23332 - aip.scitation.org | wrong-mimetype | 22144 - osf.io | redirect-loop | 18513 - www.journals.elsevier.com | no-pdf-link | 16710 - www.spandidos-publications.com | redirect-loop | 15711 - www.biorxiv.org | wrong-mimetype | 15513 - (30 rows) - -Dump lists for another iteration of bulk ingest: - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status = 
'no-capture' - ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json'; - => 278,876 - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' - ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json'; - => - - - ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json - - cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - diff --git a/notes/ingest/2020-02_unpaywall.md b/notes/ingest/2020-02_unpaywall.md new file mode 100644 index 0000000..e18a2ff --- /dev/null +++ b/notes/ingest/2020-02_unpaywall.md @@ -0,0 +1,624 @@ + +## Stats and Things + + zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt + +## Transform + + zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null + => 22M 1:31:25 [ 4k/s] + +Shard it into batches of roughly 1 million (all are 1098096 +/- 1): + + zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json + +Test ingest: + + head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Add a single batch like: + + cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Progress/Status + +There are 21,961,928 lines total, in batches of 1,098,097. 
+ + unpaywall_snapshot_2019-11-22.ingest_request.split_00.json + => 2020-02-24 21:05 local: 1,097,523 ~22 results/sec (combined) + => 2020-02-25 10:35 local: 0 + unpaywall_snapshot_2019-11-22.ingest_request.split_01.json + unpaywall_snapshot_2019-11-22.ingest_request.split_02.json + unpaywall_snapshot_2019-11-22.ingest_request.split_03.json + unpaywall_snapshot_2019-11-22.ingest_request.split_04.json + => 2020-02-25 11:26 local: 4,388,997 + => 2020-02-25 10:14 local: 1,115,821 + => 2020-02-26 16:00 local: 265,116 + unpaywall_snapshot_2019-11-22.ingest_request.split_05.json + unpaywall_snapshot_2019-11-22.ingest_request.split_06.json + unpaywall_snapshot_2019-11-22.ingest_request.split_07.json + unpaywall_snapshot_2019-11-22.ingest_request.split_08.json + unpaywall_snapshot_2019-11-22.ingest_request.split_09.json + => 2020-02-26 16:01 local: 6,843,708 + => 2020-02-26 16:31 local: 4,839,618 + => 2020-02-28 10:30 local: 2,619,319 + unpaywall_snapshot_2019-11-22.ingest_request.split_10.json + unpaywall_snapshot_2019-11-22.ingest_request.split_11.json + unpaywall_snapshot_2019-11-22.ingest_request.split_12.json + unpaywall_snapshot_2019-11-22.ingest_request.split_13.json + unpaywall_snapshot_2019-11-22.ingest_request.split_14.json + unpaywall_snapshot_2019-11-22.ingest_request.split_15.json + unpaywall_snapshot_2019-11-22.ingest_request.split_16.json + unpaywall_snapshot_2019-11-22.ingest_request.split_17.json + unpaywall_snapshot_2019-11-22.ingest_request.split_18.json + unpaywall_snapshot_2019-11-22.ingest_request.split_19.json + => 2020-02-28 10:50 local: 13,551,887 + => 2020-03-01 23:38 local: 4,521,076 + => 2020-03-02 10:45 local: 2,827,071 + => 2020-03-02 21:06 local: 1,257,176 + added about 500k bulk re-ingest to try and work around cdx errors + => 2020-03-02 21:30 local: 1,733,654 + +## Investigate Failures + +Guessing than some domains are ultimately going to need direct "recrawl" via +SPNv2. 
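Before slicing failures by domain, individual failing rows can be spot-checked through the regular (non-bulk) ingest path, which should fall back to live SPNv2 fetching. A minimal sketch, assuming the same `./ingest_file.py requests -` JSON-lines interface used for the arabesque batches, with a placeholder URL standing in for a real failing row, and assuming the tool emits one JSON result object per request (hence the `jq .status`):

    echo '{"ingest_type": "pdf", "link_source": "unpaywall", "base_url": "https://journals.sagepub.com/doi/pdf/10.1177/EXAMPLE"}' \
        | ./ingest_file.py requests - | jq .status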
+ + -- top domain failures for unpaywall GWB history ingest + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + -----------------------------------+---------------------+-------- + watermark.silverchair.com | terminal-bad-status | 258432 + www.tandfonline.com | no-pdf-link | 203873 + journals.sagepub.com | no-pdf-link | 126317 + iopscience.iop.org | terminal-bad-status | 112526 + files-journal-api.frontiersin.org | terminal-bad-status | 112499 + pubs.acs.org | no-pdf-link | 94772 + www.degruyter.com | redirect-loop | 89801 + www.ahajournals.org | no-pdf-link | 84025 + society.kisti.re.kr | no-pdf-link | 72849 + www.nature.com | redirect-loop | 53575 + babel.hathitrust.org | terminal-bad-status | 41063 + www.ncbi.nlm.nih.gov | redirect-loop | 40363 + scialert.net | no-pdf-link | 38340 + www.degruyter.com | terminal-bad-status | 34913 + www.journal.csj.jp | no-pdf-link | 30881 + espace.library.uq.edu.au | redirect-loop | 24570 + www.jci.org | redirect-loop | 24409 + aip.scitation.org | wrong-mimetype | 22144 + www.vr-elibrary.de | no-pdf-link | 17436 + www.biorxiv.org | wrong-mimetype | 15524 + ajph.aphapublications.org | no-pdf-link | 15083 + zookeys.pensoft.net | redirect-loop | 14867 + dialnet.unirioja.es | redirect-loop | 14486 + asa.scitation.org | wrong-mimetype | 14261 + www.nrcresearchpress.com | no-pdf-link | 14254 + dl.acm.org | redirect-loop | 14223 + osf.io | redirect-loop | 14103 + www.oecd-ilibrary.org | redirect-loop | 12835 + journals.sagepub.com | redirect-loop | 12229 + iopscience.iop.org | redirect-loop | 11825 + (30 rows) + + -- top no-capture terminal domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + => very few from any domain, interesting. 
Guess many of these are URLs that have truely never been crawled + + -- top no-capture base domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+------------+-------- + academic.oup.com | no-capture | 429888 + www.nature.com | no-capture | 273825 + dergipark.org.tr | no-capture | 119847 + www.biodiversitylibrary.org | no-capture | 110220 + escholarship.org | no-capture | 106307 + onlinelibrary.wiley.com | no-capture | 89771 + journals.sagepub.com | no-capture | 79297 + www.cell.com | no-capture | 64242 + deepblue.lib.umich.edu | no-capture | 58080 + babel.hathitrust.org | no-capture | 52286 + hal.archives-ouvertes.fr | no-capture | 48549 + iopscience.iop.org | no-capture | 42591 + dash.harvard.edu | no-capture | 40767 + www.tandfonline.com | no-capture | 40638 + discovery.ucl.ac.uk | no-capture | 40633 + www.jstage.jst.go.jp | no-capture | 39780 + www.doiserbia.nb.rs | no-capture | 39261 + dspace.mit.edu | no-capture | 37703 + zookeys.pensoft.net | no-capture | 34562 + repositorio.unesp.br | no-capture | 34437 + ashpublications.org | no-capture | 34112 + www.cambridge.org | no-capture | 33959 + kclpure.kcl.ac.uk | no-capture | 31455 + society.kisti.re.kr | no-capture | 30427 + pure.mpg.de | no-capture | 27650 + download.atlantis-press.com | no-capture | 27253 + dialnet.unirioja.es | no-capture | 26886 + link.springer.com | no-capture | 26257 + www.valueinhealthjournal.com | no-capture | 24798 + dspace.library.uu.nl | no-capture | 23234 + (30 rows) + + -- top no-capture base domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+------------+-------- + academic.oup.com | no-capture | 429888 + www.nature.com | no-capture | 273825 + dergipark.org.tr | no-capture | 119847 + www.biodiversitylibrary.org | no-capture | 110220 + escholarship.org | no-capture | 106307 + onlinelibrary.wiley.com | no-capture | 89771 + journals.sagepub.com | no-capture | 79297 + www.cell.com | no-capture | 64242 + deepblue.lib.umich.edu | no-capture | 58080 + babel.hathitrust.org | no-capture | 52286 + hal.archives-ouvertes.fr | no-capture | 48549 + iopscience.iop.org | no-capture | 42591 + dash.harvard.edu | no-capture | 40767 + www.tandfonline.com | no-capture | 40638 + discovery.ucl.ac.uk | no-capture | 40633 + www.jstage.jst.go.jp | no-capture | 39780 + www.doiserbia.nb.rs | no-capture | 39261 + 
dspace.mit.edu | no-capture | 37703 + zookeys.pensoft.net | no-capture | 34562 + repositorio.unesp.br | no-capture | 34437 + ashpublications.org | no-capture | 34112 + www.cambridge.org | no-capture | 33959 + kclpure.kcl.ac.uk | no-capture | 31455 + society.kisti.re.kr | no-capture | 30427 + pure.mpg.de | no-capture | 27650 + download.atlantis-press.com | no-capture | 27253 + dialnet.unirioja.es | no-capture | 26886 + link.springer.com | no-capture | 26257 + www.valueinhealthjournal.com | no-capture | 24798 + dspace.library.uu.nl | no-capture | 23234 + (30 rows) + + -- how many ingest requests not crawled at all? + SELECT count(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status IS NULL; + => 0 + + -- "cookie absent" terminal pages, by domain + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + --------------------------------+----------------+-------- + journals.sagepub.com | no-pdf-link | 126295 + www.tandfonline.com | no-pdf-link | 116690 + pubs.acs.org | no-pdf-link | 94619 + www.ahajournals.org | no-pdf-link | 84016 + www.journal.csj.jp | no-pdf-link | 30881 + aip.scitation.org | wrong-mimetype | 22143 + www.vr-elibrary.de | no-pdf-link | 17436 + ajph.aphapublications.org | no-pdf-link | 15080 + asa.scitation.org | wrong-mimetype | 14261 + www.nrcresearchpress.com | no-pdf-link | 14253 + journals.ametsoc.org | no-pdf-link | 10500 + www.journals.uchicago.edu | no-pdf-link | 6917 + www.icevirtuallibrary.com | no-pdf-link | 6484 + www.journals.uchicago.edu | wrong-mimetype | 6191 + www.healthaffairs.org | no-pdf-link | 5732 + pubsonline.informs.org | no-pdf-link | 5672 + pinnacle-secure.allenpress.com | no-pdf-link | 5013 + www.worldscientific.com | no-pdf-link | 4560 + www.ajronline.org | wrong-mimetype | 4523 + ehp.niehs.nih.gov | no-pdf-link | 4514 + www.future-science.com | no-pdf-link | 4091 + pubs.acs.org | wrong-mimetype | 4015 + aip.scitation.org | no-pdf-link | 3916 + www.futuremedicine.com | no-pdf-link | 3821 + asa.scitation.org | no-pdf-link | 3644 + www.liebertpub.com | no-pdf-link | 3345 + physicstoday.scitation.org | no-pdf-link | 3005 + pubs.cif-ifc.org | no-pdf-link | 2761 + epubs.siam.org | wrong-mimetype | 2583 + www.ajronline.org | no-pdf-link | 2563 + (30 rows) + + -- "cookie absent" terminal pages, by domain + SELECT count(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND 
ingest_file_result.terminal_url LIKE '%/cookieAbsent'; + + => 654885 + + -- NOT "cookie absent" terminal page failures, total count + SELECT count(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'; + + => 1403837 + +Looks like these domains are almost all "cookieAbsent" blocking: +- journals.sagepub.com +- pubs.acs.org +- ahajournals.org +- www.journal.csj.jp +- aip.scitation.org + +Grab some individual URLs to test: + + SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' + ORDER BY updated DESC + LIMIT 25; + +NOT cookieAbsent testing with regular ingest tool: +- iopscience.iop.org, terminal-bad-status, SPNv2 fetch, success +- academic.oup.com => silverchair, terminal-bad-status, SPNv2 fetch, succes +- osf.io success + + SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ORDER BY updated DESC + LIMIT 25; + +cookieAbsent testing with regular ingest tool: +- www.tandfonline.com failure (no-pdf-link via wayback), but force-recrawl works + +The main distinguisher is status. terminal-bad-status can be ingested (live) +successfully, while no-pdf-link, redirect-loop, etc need to be re-crawled. + +## Heritrix Plan + +Generate following ingest request batches: + +- no-capture status from unpaywall +- all other failures except /cookieAbsent +- /cookieAbsent failures + +Plan will be to crawl no-capture first (to completion), then try the other +non-/cookieAbsent failures. /cookieAbsent means we'll need to use SPNv2. + +Because there are so few "no-capture on second hop" cases, will not enqueue +both terminal urls and base urls, only base urls. 
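For the Heritrix side, a rough sketch of pulling a seed list out of one of the request dumps (assuming the JSON exports generated in the next section, which carry a `base_url` field; the seeds filename here is just illustrative):

    cat unpaywall_nocapture_20200304.json | jq -r .base_url | sort -u > unpaywall_nocapture_20200304.seeds.txt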
+ +Should definitely skip/filter: + +- www.ncbi.nlm.nih.gov + +## Ingest Request Export + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status = 'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_20200304.rows.json'; + => 4,855,142 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json'; + => 1,403,837 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200304.rows.json > unpaywall_nocapture_20200304.json + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json > unpaywall_fail_nocookie_20200304.json + +Note: will probably end up re-running the below after crawling+ingesting the above: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.status = 'terminal-bad-status' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_cookie_badstatus_20200304.rows.json'; + => 0 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.status != 'terminal-bad-status' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json'; + => 654,885 + +## Batch Ingest + +Test small batch: + + head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full batch: + + cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + # there was a broken line in there, so... + # parse error: Expected separator between values at line 1367873, column 175 + # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null + tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Note that the crawl is not entirely complete and not all CDX seem to have been +loaded, so may need to iterate. About 10% are still "no capture". 
May want or +need to additionally crawl the terminal URLs, not the base URLs. + +## Post-ingest stats + +Overall status: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 17354494 + no-pdf-link | 1471076 + no-capture | 1135992 + redirect-loop | 837842 + terminal-bad-status | 803081 + cdx-error | 219746 + wrong-mimetype | 100723 + link-loop | 16013 + wayback-error | 12448 + null-body | 9444 + redirects-exceeded | 600 + petabox-error | 411 + bad-redirect | 17 + bad-gzip-encoding | 4 + spn2-cdx-lookup-failure | 3 + gateway-timeout | 1 + spn2-error:job-failed | 1 + spn2-error | 1 + (18 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + -----------------------------------+---------------------+-------- + academic.oup.com | no-pdf-link | 330211 + watermark.silverchair.com | terminal-bad-status | 324599 + www.tandfonline.com | no-pdf-link | 242724 + journals.sagepub.com | no-pdf-link | 202050 + iopscience.iop.org | terminal-bad-status | 144063 + files-journal-api.frontiersin.org | terminal-bad-status | 121719 + pubs.acs.org | no-pdf-link | 104535 + www.ahajournals.org | no-pdf-link | 102653 + society.kisti.re.kr | no-pdf-link | 101787 + www.degruyter.com | redirect-loop | 95130 + www.nature.com | redirect-loop | 87534 + onlinelibrary.wiley.com | no-pdf-link | 84432 + www.cell.com | redirect-loop | 61496 + www.degruyter.com | terminal-bad-status | 42919 + babel.hathitrust.org | terminal-bad-status | 41813 + www.ncbi.nlm.nih.gov | redirect-loop | 40488 + scialert.net | no-pdf-link | 38341 + ashpublications.org | no-pdf-link | 34889 + dialnet.unirioja.es | terminal-bad-status | 32076 + www.journal.csj.jp | no-pdf-link | 30881 + pure.mpg.de | redirect-loop | 26163 + www.jci.org | redirect-loop | 24701 + espace.library.uq.edu.au | redirect-loop | 24591 + www.valueinhealthjournal.com | redirect-loop | 23740 + www.vr-elibrary.de | no-pdf-link | 23332 + aip.scitation.org | wrong-mimetype | 22144 + osf.io | redirect-loop | 18513 + www.journals.elsevier.com | no-pdf-link | 16710 + www.spandidos-publications.com | redirect-loop | 15711 + www.biorxiv.org | wrong-mimetype | 15513 + (30 rows) + +Dump lists for another iteration of bulk ingest: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status = 
'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json'; + => 278,876 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json'; + => + + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json + + cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md deleted file mode 100644 index 428ce05..0000000 --- a/notes/ingest/2020-03-04_mag.md +++ /dev/null @@ -1,576 +0,0 @@ - -Rough plan: - -- run bulk and/or regular ingest requests for just those of AIT partners (200k?) -- persist ingest requests (22 million or so) -- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall) -- crawl those which are no-capture - - -## Generate Requests - -Newer version of `mag_ingest_request.sh` script requires venv with urlcanon -installed. - -Starting with the 2020-01-23 MAG dump, will generate a full ingest request set -(including DOI `ext_id` when available), with any dominant domains removed (eg, -arxiv.org): - - export LC_ALL=C - cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json - => previously 25.6M - => 25.6M 2:29:43 [2.85k/s] - - export LC_ALL=C - zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json - => 4.3M 0:25:45 [2.78k/s] - - export LC_ALL=C - cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id - - zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l - => 6,504,907 - - zcat PaperUrls_mag_url_pmid.txt.gz | wc -l - => 4,369,832 - - cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l - => previously 15,707,405 - => 15,702,581 - - cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l - => 0 - URL encoding seems to be working - -## Persist Ingest Requests - -First pmid ingest requests, then the all/doi file. The reason to do this order -is that the all/doi file will have some rows with no DOI (and thus no -`ext_id`), while the PMID file will not. 
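As a quick sanity check on that, a sketch of counting how many rows in the all/doi file have no DOI at all (and so will dedupe purely on `base_url`), re-using the same jq/rg pattern as above:

    cat ingest_requests_mag-2020-01-23.doi.json | jq .ext_ids.doi -r | rg -a -c '^null$'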
- - # small sample - head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - - Worker: Counter({'total': 10, 'skip-result-fields': 10}) - JSON lines pushed: Counter({'total': 10, 'pushed': 10}) - - cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - - => 4.3M 0:16:46 [4.27k/s] - Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0}) - JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026}) - => hit a bug on first attempt, which is why total/insert results don't match - - cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request - - => 25.6M 2:21:54 [3.01k/s] - Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0}) - JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559}) - - -## Crawl/Dupe Status - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - -After just PMID links: - - status | count - ---------------------+--------- - | 3000115 - success | 1126881 - no-capture | 69459 - terminal-bad-status | 30259 - redirect-loop | 11656 - no-pdf-link | 2836 - wrong-mimetype | 1456 - link-loop | 1259 - wayback-error | 1232 - cdx-error | 932 - null-body | 85 - petabox-error | 50 - bad-redirect | 1 - (13 rows) - -After all links: - - SELECT COUNT(*) - FROM ingest_request - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag'; - => 25596563 - - - status | count - ---------------------+---------- - | 21130841 - success | 3915682 - no-capture | 391813 - terminal-bad-status | 76488 - redirect-loop | 44202 - wrong-mimetype | 16418 - no-pdf-link | 10995 - wayback-error | 3679 - cdx-error | 3414 - link-loop | 2098 - null-body | 709 - petabox-error | 221 - bad-gzip-encoding | 2 - bad-redirect | 1 - (14 rows) - -Somewhat more un-ingested than expected. - -Dump requests: - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - AND ingest_file_result.status IS NULL - ) TO '/grande/snapshots/mag_noingest_20200305.rows.json'; - => COPY 21,130,841 - -Transform and shuf: - - ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz - => 21.1M 0:18:57 [18.6k/s] - -## Bulk Ingest Partner Output - -These are subsets of the full list from potential AIT-S partners; want to run -these through the pipeline before the full batch. Duplication against the full -batch should be minimal. 
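One way to sanity-check that claim (a sketch; `cornell` stands in for any of the partner subsets listed below) is to count how many of a partner file's base URLs also show up in the big no-ingest dump:

    export LC_ALL=C
    comm -12 \
        <(jq -r .base_url ingest_requests_mag-2020-01-23.cornell.json | sort -u) \
        <(zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | jq -r .base_url | sort -u) \
        | wc -l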
- -Size: - - bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l - 29007 - bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json - 34265 ingest_requests_mag-2020-01-23.cornell.json - -Test ingest: - - head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Full ingests: - - cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Bulk Ingest - -Shard it into batches of roughly 1 million: - - cd /grande/snapshots/ - zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json - -Add a single batch like: - - cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - - partner ingests (see above) - => 2020-03-05 12:49: 118,396 - 1056543 mag_noingest_20200305.ingest_request.split_00.json - => 2020-03-05 14:34: 1,055,224 - => check on stats/ratios; filter by ingest update time? 
- 1056542 mag_noingest_20200305.ingest_request.split_01.json - 1056542 mag_noingest_20200305.ingest_request.split_02.json - 1056542 mag_noingest_20200305.ingest_request.split_03.json - 1056542 mag_noingest_20200305.ingest_request.split_04.json - 1056542 mag_noingest_20200305.ingest_request.split_05.json - 1056542 mag_noingest_20200305.ingest_request.split_06.json - 1056542 mag_noingest_20200305.ingest_request.split_07.json - 1056542 mag_noingest_20200305.ingest_request.split_08.json - 1056542 mag_noingest_20200305.ingest_request.split_09.json - => 2020-03-05 18:04: 10,009,297 - => 2020-03-06 16:53: 6,553,946 - 1056542 mag_noingest_20200305.ingest_request.split_10.json - 1056542 mag_noingest_20200305.ingest_request.split_11.json - 1056542 mag_noingest_20200305.ingest_request.split_12.json - 1056542 mag_noingest_20200305.ingest_request.split_13.json - 1056542 mag_noingest_20200305.ingest_request.split_14.json - 1056542 mag_noingest_20200305.ingest_request.split_15.json - 1056542 mag_noingest_20200305.ingest_request.split_16.json - 1056542 mag_noingest_20200305.ingest_request.split_17.json - 1056542 mag_noingest_20200305.ingest_request.split_18.json - 1056542 mag_noingest_20200305.ingest_request.split_19.json - => 2020-03-06 16:59: 17,001,032 - -Stats from bulk ingest: - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - status | count - ---------------------+---------- - no-capture | 12237193 - success | 11991293 - no-pdf-link | 521691 - redirect-loop | 437192 - terminal-bad-status | 231181 - link-loop | 92633 - cdx-error | 33631 - wrong-mimetype | 28638 - wayback-error | 19651 - null-body | 2682 - petabox-error | 727 - | 47 - bad-redirect | 44 - bad-gzip-encoding | 7 - (14 rows) - -Failures by domain: - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - AND t1.status != 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - --------------------------------------+---------------------+-------- - dialnet.unirioja.es | redirect-loop | 240967 - onlinelibrary.wiley.com | no-pdf-link | 147696 - agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 - iopscience.iop.org | terminal-bad-status | 69591 - febs.onlinelibrary.wiley.com | no-pdf-link | 49874 - www.researchgate.net | redirect-loop | 42859 - journals.sagepub.com | no-pdf-link | 27448 - papers.ssrn.com | redirect-loop | 27328 - dialnet.unirioja.es | terminal-bad-status | 20320 - physoc.onlinelibrary.wiley.com | no-pdf-link | 20232 - science.sciencemag.org | link-loop | 17811 - espace.library.uq.edu.au | redirect-loop | 17185 - bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785 - obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301 - anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746 - 
www.tandfonline.com | no-pdf-link | 13303 - aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070 - link.springer.com | redirect-loop | 10594 - www.redalyc.org:9081 | no-pdf-link | 10515 - watermark.silverchair.com | terminal-bad-status | 9739 - www.bmj.com | link-loop | 9389 - www.repository.naturalis.nl | redirect-loop | 8213 - bjp.rcpsych.org | link-loop | 8045 - aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814 - nph.onlinelibrary.wiley.com | no-pdf-link | 7801 - iopscience.iop.org | redirect-loop | 7697 - journals.tubitak.gov.tr | wrong-mimetype | 7159 - www.biorxiv.org | wrong-mimetype | 7067 - www.erudit.org | redirect-loop | 6819 - besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254 - (30 rows) - -Domains to follow-up (eg, sandcrawler ingest tests/tweaks): -- dialnet.unirioja.es | redirect-loop | 240967 -- www.researchgate.net | redirect-loop | 42859 -- www.redalyc.org:9081 | no-pdf-link | 10515 -- www.repository.naturalis.nl | redirect-loop | 8213 -- bjp.rcpsych.org | link-loop | 8045 -- journals.tubitak.gov.tr | wrong-mimetype | 7159 -- www.erudit.org | redirect-loop | 6819 - -The dialnet.unirioja.es ones may be worth re-crawling via heritrix? - -Top uncrawled domains: - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - ) t1 - WHERE t1.domain != '' - AND t1.status = 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - ---------------------------------+------------+-------- - ieeexplore.ieee.org | no-capture | 957835 - link.springer.com | no-capture | 394121 - www.researchgate.net | no-capture | 376974 - cyberleninka.ru | no-capture | 376012 - iopscience.iop.org | no-capture | 348791 - papers.ssrn.com | no-capture | 286860 - dergipark.org.tr | no-capture | 217556 - dialnet.unirioja.es | no-capture | 214398 - academic.oup.com | no-capture | 212364 - www.tandfonline.com | no-capture | 148940 - journals.sagepub.com | no-capture | 144695 - www.papersearch.net | no-capture | 138986 - absimage.aps.org | no-capture | 111976 - apps.dtic.mil | no-capture | 106984 - www.cambridge.org | no-capture | 97533 - www.bmj.com | no-capture | 92437 - bioone.org | no-capture | 87573 - science.sciencemag.org | no-capture | 75723 - shodhganga.inflibnet.ac.in:8080 | no-capture | 75395 - www.jstor.org | no-capture | 73230 - works.bepress.com | no-capture | 68747 - www.scielo.org.co | no-capture | 59650 - hrcak.srce.hr | no-capture | 59332 - muse.jhu.edu | no-capture | 57828 - onlinelibrary.wiley.com | no-capture | 55621 - www.jbc.org | no-capture | 54608 - www.jstage.jst.go.jp | no-capture | 53631 - www.redalyc.org | no-capture | 50406 - lup.lub.lu.se | no-capture | 47469 - www.dtic.mil | no-capture | 41820 - (30 rows) - -## Heritrix Seedlist Generation - -Dump ingest requests (filtered for some domains that don't expect to crawl via -heritrix): - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND 
ingest_request.link_source = 'mag' - AND ingest_file_result.status = 'no-capture' - AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' - AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' - AND ingest_request.base_url NOT LIKE '%ahajournals.org%' - AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' - AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' - AND ingest_request.base_url NOT LIKE '%academic.oup.com%' - AND ingest_request.base_url NOT LIKE '%tandfonline.com%' - ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json'; - => COPY 11714199 - - # in sandcrawler pipenv - ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json - -## Bulk Ingest of Heritrix Content - -Small sample: - - head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Full run: - - cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - - 2020-04-07 12:19 (pacific): 11,703,871 - -## Post-bulk-ingest - -Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need -to re-try things like cdx-error. - -Current status: - - status | count - -------------------------------+---------- - success | 18491799 - redirect-loop | 1968530 - no-capture | 1373657 - no-pdf-link | 1311842 - link-loop | 1296439 - terminal-bad-status | 627577 - cdx-error | 418278 - wrong-mimetype | 50141 - wayback-error | 37159 - petabox-error | 11249 - null-body | 6295 - gateway-timeout | 3051 - spn2-cdx-lookup-failure | 328 - spn2-error:invalid-url-syntax | 93 - bad-redirect | 75 - | 47 - invalid-host-resolution | 28 - spn2-error | 10 - bad-gzip-encoding | 7 - redirects-exceeded | 2 - (20 rows) - -Lots of cdx-error to retry. - -The no-capture links are probably a mix of domain-blocklist and things that -failed in bulk mode. Will dump and re-attempt them: - - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - AND ingest_file_result.status = 'no-capture' - AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' - AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' - AND ingest_request.base_url NOT LIKE '%ahajournals.org%' - AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' - AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' - AND ingest_request.base_url NOT LIKE '%academic.oup.com%' - AND ingest_request.base_url NOT LIKE '%tandfonline.com%' - ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json'; - => 859849 - -What domains are these? - - cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30 - -Let's filter down more: - - cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json - - wc -l mag_nocapture_20200420.rows.filtered.json - 423085 mag_nocapture_20200420.rows.filtered.json - -Ok, enqueue! - - cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 - -## Final Stats - -... for this round of ingest: - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - - status | count - -------------------------------------+---------- - success | 18712849 - redirect-loop | 2008110 - no-pdf-link | 1337012 - link-loop | 1326761 - no-capture | 1030693 - terminal-bad-status | 637143 - gateway-timeout | 193194 - cdx-error | 125907 - spn2-cdx-lookup-failure | 77842 - wrong-mimetype | 50882 - wayback-error | 40278 - invalid-host-resolution | 35201 - petabox-error | 11254 - null-body | 6485 - spn2-error | 1643 - spn2-error:job-failed | 747 - spn2-error:invalid-url-syntax | 325 - spn2-error:soft-time-limit-exceeded | 190 - bad-redirect | 77 - | 47 - (20 rows) - -Failures by domain: - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - - domain | status | count - ---------------------------------+---------------------+-------- - ieeexplore.ieee.org | redirect-loop | 677712 - cyberleninka.ru | link-loop | 308390 - papers.ssrn.com | link-loop | 281804 - ieeexplore.ieee.org | link-loop | 273559 - dialnet.unirioja.es | redirect-loop | 240504 - dialnet.unirioja.es | terminal-bad-status | 232481 - onlinelibrary.wiley.com | no-pdf-link | 220932 - iopscience.iop.org | terminal-bad-status | 172480 - validate.perfdrive.com | no-pdf-link | 172312 - link.springer.com | redirect-loop | 130398 - agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382 - iopscience.iop.org | redirect-loop | 105234 - www.bmj.com | link-loop | 100354 - www.researchgate.net | redirect-loop | 84366 - www.cambridge.org | link-loop | 83171 - jamanetwork.com | no-pdf-link | 75053 - febs.onlinelibrary.wiley.com | no-pdf-link | 74872 - www.jstor.org | redirect-loop | 72059 - journals.sagepub.com | no-pdf-link | 63028 - science.sciencemag.org | redirect-loop | 62927 - profile.thieme.de | no-pdf-link | 62406 - cyberleninka.ru | redirect-loop | 56733 - link.springer.com | link-loop | 47608 - physoc.onlinelibrary.wiley.com | no-pdf-link | 30180 - science.sciencemag.org | link-loop | 29908 - papers.ssrn.com | redirect-loop | 27255 - obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789 - www.computer.org | no-pdf-link | 26444 - watermark.silverchair.com | terminal-bad-status | 25934 - www.nature.com | redirect-loop | 25306 - (30 rows) diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md new file mode 100644 index 0000000..73396bd --- /dev/null +++ b/notes/ingest/2020-03-oa_but_not_marked.md @@ -0,0 +1,25 @@ + +These are large journals with a high fraction of "in IA", but not marked as OA +so not crawling regularly. 
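One way to surface more containers like these might be an aggregation over the
release search index: filter to releases preserved in IA but not flagged OA,
then group by container. Rough sketch only; the endpoint, index name, and the
`in_ia` / `is_oa` / `container_id` field names are assumptions, not verified
against the current schema:

    curl -s 'https://search.fatcat.wiki/fatcat_release/_search' \
        -H 'Content-Type: application/json' \
        -d '{
              "size": 0,
              "query": {"bool": {"filter": [
                {"term": {"in_ia": true}},
                {"term": {"is_oa": false}}
              ]}},
              "aggs": {"top_containers": {"terms": {"field": "container_id", "size": 50}}}
            }'

Bucket counts would still need to be compared against total releases per
container to get an actual "in IA" fraction.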
+ +TODO: add things like list of unpaywall ISSN / OA status to try and find more +"practical" / bronze OA + +## First Run + +https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him +https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4 +https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4 +https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e +https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm +https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe + +## TODO + +https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible) +https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?) + +https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link? +https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA? +https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken? +https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop diff --git a/notes/ingest/2020-03_mag.md b/notes/ingest/2020-03_mag.md new file mode 100644 index 0000000..428ce05 --- /dev/null +++ b/notes/ingest/2020-03_mag.md @@ -0,0 +1,576 @@ + +Rough plan: + +- run bulk and/or regular ingest requests for just those of AIT partners (200k?) +- persist ingest requests (22 million or so) +- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall) +- crawl those which are no-capture + + +## Generate Requests + +Newer version of `mag_ingest_request.sh` script requires venv with urlcanon +installed. + +Starting with the 2020-01-23 MAG dump, will generate a full ingest request set +(including DOI `ext_id` when available), with any dominant domains removed (eg, +arxiv.org): + + export LC_ALL=C + cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json + => previously 25.6M + => 25.6M 2:29:43 [2.85k/s] + + export LC_ALL=C + zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json + => 4.3M 0:25:45 [2.78k/s] + + export LC_ALL=C + cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id + + zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l + => 6,504,907 + + zcat PaperUrls_mag_url_pmid.txt.gz | wc -l + => 4,369,832 + + cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l + => previously 15,707,405 + => 15,702,581 + + cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l + => 0 + URL encoding seems to be working + +## Persist Ingest Requests + +First pmid ingest requests, then the all/doi file. The reason to do this order +is that the all/doi file will have some rows with no DOI (and thus no +`ext_id`), while the PMID file will not. 
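Quick sanity check on how many all/doi rows have no DOI at all (a sketch,
mirroring the non-null count above; if those counts refer to the same dump,
expect roughly 25.6M minus 15.7M, on the order of 9.9M rows):

    cat ingest_requests_mag-2020-01-23.doi.json | jq .ext_ids.doi -r | rg -a -c '^null$'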
+ + # small sample + head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - + Worker: Counter({'total': 10, 'skip-result-fields': 10}) + JSON lines pushed: Counter({'total': 10, 'pushed': 10}) + + cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - + => 4.3M 0:16:46 [4.27k/s] + Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0}) + JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026}) + => hit a bug on first attempt, which is why total/insert results don't match + + cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request - + => 25.6M 2:21:54 [3.01k/s] + Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0}) + JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559}) + + +## Crawl/Dupe Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + +After just PMID links: + + status | count + ---------------------+--------- + | 3000115 + success | 1126881 + no-capture | 69459 + terminal-bad-status | 30259 + redirect-loop | 11656 + no-pdf-link | 2836 + wrong-mimetype | 1456 + link-loop | 1259 + wayback-error | 1232 + cdx-error | 932 + null-body | 85 + petabox-error | 50 + bad-redirect | 1 + (13 rows) + +After all links: + + SELECT COUNT(*) + FROM ingest_request + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag'; + => 25596563 + + + status | count + ---------------------+---------- + | 21130841 + success | 3915682 + no-capture | 391813 + terminal-bad-status | 76488 + redirect-loop | 44202 + wrong-mimetype | 16418 + no-pdf-link | 10995 + wayback-error | 3679 + cdx-error | 3414 + link-loop | 2098 + null-body | 709 + petabox-error | 221 + bad-gzip-encoding | 2 + bad-redirect | 1 + (14 rows) + +Somewhat more un-ingested than expected. + +Dump requests: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/mag_noingest_20200305.rows.json'; + => COPY 21,130,841 + +Transform and shuf: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz + => 21.1M 0:18:57 [18.6k/s] + +## Bulk Ingest Partner Output + +These are subsets of the full list from potential AIT-S partners; want to run +these through the pipeline before the full batch. Duplication against the full +batch should be minimal. 
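To quantify the overlap between one of the partner files (listed below) and
the full no-ingest dump generated above, something like this sketch could be
used:

    jq .base_url -r ingest_requests_mag-2020-01-23.cornell.json | sort -u > cornell.urls
    zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | jq .base_url -r | sort -u -S 4G > noingest.urls
    comm -12 cornell.urls noingest.urls | wc -l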
+ +Size: + + bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l + 29007 + bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json + 34265 ingest_requests_mag-2020-01-23.cornell.json + +Test ingest: + + head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full ingests: + + cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Bulk Ingest + +Shard it into batches of roughly 1 million: + + cd /grande/snapshots/ + zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json + +Add a single batch like: + + cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + partner ingests (see above) + => 2020-03-05 12:49: 118,396 + 1056543 mag_noingest_20200305.ingest_request.split_00.json + => 2020-03-05 14:34: 1,055,224 + => check on stats/ratios; filter by ingest update time? 
+ 1056542 mag_noingest_20200305.ingest_request.split_01.json + 1056542 mag_noingest_20200305.ingest_request.split_02.json + 1056542 mag_noingest_20200305.ingest_request.split_03.json + 1056542 mag_noingest_20200305.ingest_request.split_04.json + 1056542 mag_noingest_20200305.ingest_request.split_05.json + 1056542 mag_noingest_20200305.ingest_request.split_06.json + 1056542 mag_noingest_20200305.ingest_request.split_07.json + 1056542 mag_noingest_20200305.ingest_request.split_08.json + 1056542 mag_noingest_20200305.ingest_request.split_09.json + => 2020-03-05 18:04: 10,009,297 + => 2020-03-06 16:53: 6,553,946 + 1056542 mag_noingest_20200305.ingest_request.split_10.json + 1056542 mag_noingest_20200305.ingest_request.split_11.json + 1056542 mag_noingest_20200305.ingest_request.split_12.json + 1056542 mag_noingest_20200305.ingest_request.split_13.json + 1056542 mag_noingest_20200305.ingest_request.split_14.json + 1056542 mag_noingest_20200305.ingest_request.split_15.json + 1056542 mag_noingest_20200305.ingest_request.split_16.json + 1056542 mag_noingest_20200305.ingest_request.split_17.json + 1056542 mag_noingest_20200305.ingest_request.split_18.json + 1056542 mag_noingest_20200305.ingest_request.split_19.json + => 2020-03-06 16:59: 17,001,032 + +Stats from bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + ---------------------+---------- + no-capture | 12237193 + success | 11991293 + no-pdf-link | 521691 + redirect-loop | 437192 + terminal-bad-status | 231181 + link-loop | 92633 + cdx-error | 33631 + wrong-mimetype | 28638 + wayback-error | 19651 + null-body | 2682 + petabox-error | 727 + | 47 + bad-redirect | 44 + bad-gzip-encoding | 7 + (14 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + --------------------------------------+---------------------+-------- + dialnet.unirioja.es | redirect-loop | 240967 + onlinelibrary.wiley.com | no-pdf-link | 147696 + agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 + iopscience.iop.org | terminal-bad-status | 69591 + febs.onlinelibrary.wiley.com | no-pdf-link | 49874 + www.researchgate.net | redirect-loop | 42859 + journals.sagepub.com | no-pdf-link | 27448 + papers.ssrn.com | redirect-loop | 27328 + dialnet.unirioja.es | terminal-bad-status | 20320 + physoc.onlinelibrary.wiley.com | no-pdf-link | 20232 + science.sciencemag.org | link-loop | 17811 + espace.library.uq.edu.au | redirect-loop | 17185 + bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785 + obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301 + anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746 + 
www.tandfonline.com | no-pdf-link | 13303 + aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070 + link.springer.com | redirect-loop | 10594 + www.redalyc.org:9081 | no-pdf-link | 10515 + watermark.silverchair.com | terminal-bad-status | 9739 + www.bmj.com | link-loop | 9389 + www.repository.naturalis.nl | redirect-loop | 8213 + bjp.rcpsych.org | link-loop | 8045 + aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814 + nph.onlinelibrary.wiley.com | no-pdf-link | 7801 + iopscience.iop.org | redirect-loop | 7697 + journals.tubitak.gov.tr | wrong-mimetype | 7159 + www.biorxiv.org | wrong-mimetype | 7067 + www.erudit.org | redirect-loop | 6819 + besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254 + (30 rows) + +Domains to follow-up (eg, sandcrawler ingest tests/tweaks): +- dialnet.unirioja.es | redirect-loop | 240967 +- www.researchgate.net | redirect-loop | 42859 +- www.redalyc.org:9081 | no-pdf-link | 10515 +- www.repository.naturalis.nl | redirect-loop | 8213 +- bjp.rcpsych.org | link-loop | 8045 +- journals.tubitak.gov.tr | wrong-mimetype | 7159 +- www.erudit.org | redirect-loop | 6819 + +The dialnet.unirioja.es ones may be worth re-crawling via heritrix? + +Top uncrawled domains: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ---------------------------------+------------+-------- + ieeexplore.ieee.org | no-capture | 957835 + link.springer.com | no-capture | 394121 + www.researchgate.net | no-capture | 376974 + cyberleninka.ru | no-capture | 376012 + iopscience.iop.org | no-capture | 348791 + papers.ssrn.com | no-capture | 286860 + dergipark.org.tr | no-capture | 217556 + dialnet.unirioja.es | no-capture | 214398 + academic.oup.com | no-capture | 212364 + www.tandfonline.com | no-capture | 148940 + journals.sagepub.com | no-capture | 144695 + www.papersearch.net | no-capture | 138986 + absimage.aps.org | no-capture | 111976 + apps.dtic.mil | no-capture | 106984 + www.cambridge.org | no-capture | 97533 + www.bmj.com | no-capture | 92437 + bioone.org | no-capture | 87573 + science.sciencemag.org | no-capture | 75723 + shodhganga.inflibnet.ac.in:8080 | no-capture | 75395 + www.jstor.org | no-capture | 73230 + works.bepress.com | no-capture | 68747 + www.scielo.org.co | no-capture | 59650 + hrcak.srce.hr | no-capture | 59332 + muse.jhu.edu | no-capture | 57828 + onlinelibrary.wiley.com | no-capture | 55621 + www.jbc.org | no-capture | 54608 + www.jstage.jst.go.jp | no-capture | 53631 + www.redalyc.org | no-capture | 50406 + lup.lub.lu.se | no-capture | 47469 + www.dtic.mil | no-capture | 41820 + (30 rows) + +## Heritrix Seedlist Generation + +Dump ingest requests (filtered for some domains that don't expect to crawl via +heritrix): + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND 
ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json'; + => COPY 11714199 + + # in sandcrawler pipenv + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json + +## Bulk Ingest of Heritrix Content + +Small sample: + + head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full run: + + cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + 2020-04-07 12:19 (pacific): 11,703,871 + +## Post-bulk-ingest + +Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need +to re-try things like cdx-error. + +Current status: + + status | count + -------------------------------+---------- + success | 18491799 + redirect-loop | 1968530 + no-capture | 1373657 + no-pdf-link | 1311842 + link-loop | 1296439 + terminal-bad-status | 627577 + cdx-error | 418278 + wrong-mimetype | 50141 + wayback-error | 37159 + petabox-error | 11249 + null-body | 6295 + gateway-timeout | 3051 + spn2-cdx-lookup-failure | 328 + spn2-error:invalid-url-syntax | 93 + bad-redirect | 75 + | 47 + invalid-host-resolution | 28 + spn2-error | 10 + bad-gzip-encoding | 7 + redirects-exceeded | 2 + (20 rows) + +Lots of cdx-error to retry. + +The no-capture links are probably a mix of domain-blocklist and things that +failed in bulk mode. Will dump and re-attempt them: + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json'; + => 859849 + +What domains are these? + + cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30 + +Let's filter down more: + + cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json + + wc -l mag_nocapture_20200420.rows.filtered.json + 423085 mag_nocapture_20200420.rows.filtered.json + +Ok, enqueue! + + cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## Final Stats + +... for this round of ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+---------- + success | 18712849 + redirect-loop | 2008110 + no-pdf-link | 1337012 + link-loop | 1326761 + no-capture | 1030693 + terminal-bad-status | 637143 + gateway-timeout | 193194 + cdx-error | 125907 + spn2-cdx-lookup-failure | 77842 + wrong-mimetype | 50882 + wayback-error | 40278 + invalid-host-resolution | 35201 + petabox-error | 11254 + null-body | 6485 + spn2-error | 1643 + spn2-error:job-failed | 747 + spn2-error:invalid-url-syntax | 325 + spn2-error:soft-time-limit-exceeded | 190 + bad-redirect | 77 + | 47 + (20 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + domain | status | count + ---------------------------------+---------------------+-------- + ieeexplore.ieee.org | redirect-loop | 677712 + cyberleninka.ru | link-loop | 308390 + papers.ssrn.com | link-loop | 281804 + ieeexplore.ieee.org | link-loop | 273559 + dialnet.unirioja.es | redirect-loop | 240504 + dialnet.unirioja.es | terminal-bad-status | 232481 + onlinelibrary.wiley.com | no-pdf-link | 220932 + iopscience.iop.org | terminal-bad-status | 172480 + validate.perfdrive.com | no-pdf-link | 172312 + link.springer.com | redirect-loop | 130398 + agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382 + iopscience.iop.org | redirect-loop | 105234 + www.bmj.com | link-loop | 100354 + www.researchgate.net | redirect-loop | 84366 + www.cambridge.org | link-loop | 83171 + jamanetwork.com | no-pdf-link | 75053 + febs.onlinelibrary.wiley.com | no-pdf-link | 74872 + www.jstor.org | redirect-loop | 72059 + journals.sagepub.com | no-pdf-link | 63028 + science.sciencemag.org | redirect-loop | 62927 + profile.thieme.de | no-pdf-link | 62406 + cyberleninka.ru | redirect-loop | 56733 + link.springer.com | link-loop | 47608 + physoc.onlinelibrary.wiley.com | no-pdf-link | 30180 + science.sciencemag.org | link-loop | 29908 + papers.ssrn.com | redirect-loop | 27255 + obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789 + www.computer.org | no-pdf-link | 26444 + watermark.silverchair.com | terminal-bad-status | 25934 + www.nature.com | redirect-loop | 25306 + (30 rows) diff --git a/notes/ingest/2020-03_s2.md b/notes/ingest/2020-03_s2.md new file mode 100644 index 0000000..fedaba0 --- /dev/null +++ b/notes/ingest/2020-03_s2.md @@ -0,0 +1,35 @@ + +Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these +ingested, as well as any previous existing content. 
+ +Also, there are a bunch of PDF outlinks to the web; should do S2-specific +matching and ingest of those. + +There are a few categories of paper from pdfs.s.o: + +1. we had previous GWB crawl, didn't re-crawl +2. we had PDF from elsewhere on the web, didn't re-crawl +3. crawled successfully +4. crawl failed + +In this ingest, want to get all of categories 1 and 3. Could try to do this by +dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl), +and join that against the ingest request list. + +For other random web URLs, can do the usual persist/backfill/recrawl pipeline. + +## Create Seedlist + + zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz + zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz + + zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list + zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list + + zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz + zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz + + zcat s2_external_ingestrequest.json.gz | wc -l + 41201427 + zcat s2_hosted_ingestrequest.json.gz | wc -l + 23345761 diff --git a/notes/ingest/2020-03_s2_ingest.md b/notes/ingest/2020-03_s2_ingest.md deleted file mode 100644 index fedaba0..0000000 --- a/notes/ingest/2020-03_s2_ingest.md +++ /dev/null @@ -1,35 +0,0 @@ - -Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these -ingested, as well as any previous existing content. - -Also, there are a bunch of PDF outlinks to the web; should do S2-specific -matching and ingest of those. - -There are a few categories of paper from pdfs.s.o: - -1. we had previous GWB crawl, didn't re-crawl -2. we had PDF from elsewhere on the web, didn't re-crawl -3. crawled successfully -4. crawl failed - -In this ingest, want to get all of categories 1 and 3. Could try to do this by -dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl), -and join that against the ingest request list. - -For other random web URLs, can do the usual persist/backfill/recrawl pipeline. 
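A sketch of that "dump CDX and join" idea, assuming the sandcrawler postgres
has a `cdx` table with a `url` column (database, table, and column names here
are assumptions), joined against the `s2_hosted_ingestrequest.json.gz` file
from the seedlist step:

    psql sandcrawler -c "COPY (
        SELECT url FROM cdx
        WHERE url LIKE 'http%://pdfs.semanticscholar.org/%'
    ) TO STDOUT" | sort -u -S 4G > pdfs_s2_cdx_urls.txt

    zcat s2_hosted_ingestrequest.json.gz | jq .base_url -r | sort -u -S 4G > s2_hosted.urls
    comm -12 pdfs_s2_cdx_urls.txt s2_hosted.urls | wc -l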
- -## Create Seedlist - - zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz - zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz - - zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list - zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list - - zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz - zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz - - zcat s2_external_ingestrequest.json.gz | wc -l - 41201427 - zcat s2_hosted_ingestrequest.json.gz | wc -l - 23345761 diff --git a/notes/ingest/2020-04-07_datacite.md b/notes/ingest/2020-04-07_datacite.md deleted file mode 100644 index 0fc7e67..0000000 --- a/notes/ingest/2020-04-07_datacite.md +++ /dev/null @@ -1,121 +0,0 @@ - -After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many -of the DOIs are for, eg, datasets, and don't want to waste time on those. - -Instead of using full ingest request file from the crawl, will generate a new -ingest request file using `fatcat_ingest.py` and set that up for bulk crawling. - -## Generate Requests - - ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json - => Expecting 8905453 release objects in search queries - => 8.91M 11:49:50 [ 209 /s] - => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453}) - -## Bulk Ingest - - cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Ingest Stats - -Note that this will have a small fraction of non-datacite results mixed in (eg, -from COVID-19 targeted crawls): - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'doi' - AND ingest_request.ingest_type = 'pdf' - AND ingest_request.ingest_request_source = 'fatcat-ingest' - AND created >= '2020-04-07' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - status | count - -------------------------------------+--------- - no-pdf-link | 4646767 - redirect-loop | 1447229 - no-capture | 860235 - success | 849501 - terminal-bad-status | 174869 - cdx-error | 159805 - wayback-error | 18076 - wrong-mimetype | 11169 - link-loop | 8410 - gateway-timeout | 4034 - spn2-cdx-lookup-failure | 510 - petabox-error | 339 - null-body | 251 - spn2-error | 19 - spn2-error:job-failed | 14 - bad-gzip-encoding | 13 - timeout | 5 - spn2-error:soft-time-limit-exceeded | 4 - invalid-host-resolution | 2 - spn2-error:pending | 1 - (20 rows) - -Top domains/statuses (including success): - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'doi' - AND ingest_request.ingest_type = 'pdf' - AND ingest_request.ingest_request_source = 'fatcat-ingest' - AND created >= '2020-04-07' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - ---------------------------------------+---------------------+-------- - ssl.fao.org | no-pdf-link | 862277 - www.e-periodica.ch | no-pdf-link | 746781 - www.researchgate.net | redirect-loop | 664524 - dlc.library.columbia.edu | no-pdf-link | 493111 - www.die-bonn.de | redirect-loop | 352903 - figshare.com | no-pdf-link | 319709 - statisticaldatasets.data-planet.com | no-pdf-link | 309584 - catalog.paradisec.org.au | redirect-loop | 225396 - zenodo.org | no-capture | 193201 - digi.ub.uni-heidelberg.de | no-pdf-link | 184974 - open.library.ubc.ca | no-pdf-link | 167841 - zenodo.org | no-pdf-link | 130617 - www.google.com | no-pdf-link | 111312 - www.e-manuscripta.ch | no-pdf-link | 79192 - ds.iris.edu | no-pdf-link | 77649 - data.inra.fr | no-pdf-link | 69440 - www.tib.eu | no-pdf-link | 63872 - www.egms.de | redirect-loop | 53877 - archaeologydataservice.ac.uk | redirect-loop | 52838 - d.lib.msu.edu | no-pdf-link | 45297 - www.e-rara.ch | no-pdf-link | 45163 - springernature.figshare.com | no-pdf-link | 42527 - boris.unibe.ch | no-pdf-link | 40816 - www.research-collection.ethz.ch | no-capture | 40350 - spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059 - repository.dri.ie | terminal-bad-status | 32760 - othes.univie.ac.at | no-pdf-link | 32558 - repositories.lib.utexas.edu | no-capture | 31526 - posterng.netkey.at | no-pdf-link | 30315 - zenodo.org | terminal-bad-status | 29614 - (30 rows) - diff --git 
a/notes/ingest/2020-04-07_unpaywall.md b/notes/ingest/2020-04-07_unpaywall.md deleted file mode 100644 index e30d482..0000000 --- a/notes/ingest/2020-04-07_unpaywall.md +++ /dev/null @@ -1,63 +0,0 @@ - -A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but -not released for more than a month). - -Primary goal is: - -- generate ingest requests for only *new* URLs -- bulk ingest these new URLs -- crawl any no-capture URLs from that batch -- re-bulk-ingest the no-capture batch -- analytics on failed ingests. eg, any particular domains that are failing to crawl - -This ingest pipeline was started on 2020-04-07 by bnewbold. - -## Transform and Load - - # in sandcrawler pipenv on aitio - zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json - => 24.7M 5:17:03 [ 1.3k/s] - - cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request - - => 24.7M - => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0}) - -## Dump new URLs and Bulk Ingest - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND date(ingest_request.created) > '2020-04-01' - AND ingest_file_result.status IS NULL - ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json'; - => 3696189 - - cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Dump no-capture - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND date(ingest_request.created) > '2020-04-01' - AND ingest_file_result.status = 'no-capture' - AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' - AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' - AND ingest_request.base_url NOT LIKE '%ahajournals.org%' - AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' - AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' - AND ingest_request.base_url NOT LIKE '%academic.oup.com%' - AND ingest_request.base_url NOT LIKE '%tandfonline.com%' - ) TO '/grande/snapshots/unpaywall_nocapture_2020-04-XX.rows.json'; diff --git a/notes/ingest/2020-04_datacite.md b/notes/ingest/2020-04_datacite.md new file mode 100644 index 0000000..0fc7e67 --- /dev/null +++ b/notes/ingest/2020-04_datacite.md @@ -0,0 +1,121 @@ + +After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many +of the DOIs are for, eg, datasets, and don't want to waste time on those. + +Instead of using full ingest request file from the crawl, will generate a new +ingest request file using `fatcat_ingest.py` and set that up for bulk crawling. 
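To get a feel for how much of the datacite DOI space is paper-like versus
datasets before generating requests, a release-type aggregation could help.
Sketch only (endpoint and the `release_type` field name are assumptions; the
`doi_registrar:datacite` query string is the same one used below):

    curl -s 'https://search.fatcat.wiki/fatcat_release/_search' \
        -H 'Content-Type: application/json' \
        -d '{
              "size": 0,
              "query": {"query_string": {"query": "doi_registrar:datacite"}},
              "aggs": {"types": {"terms": {"field": "release_type", "size": 30}}}
            }'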
+ +## Generate Requests + + ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json + => Expecting 8905453 release objects in search queries + => 8.91M 11:49:50 [ 209 /s] + => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453}) + +## Bulk Ingest + + cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Ingest Stats + +Note that this will have a small fraction of non-datacite results mixed in (eg, +from COVID-19 targeted crawls): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + no-pdf-link | 4646767 + redirect-loop | 1447229 + no-capture | 860235 + success | 849501 + terminal-bad-status | 174869 + cdx-error | 159805 + wayback-error | 18076 + wrong-mimetype | 11169 + link-loop | 8410 + gateway-timeout | 4034 + spn2-cdx-lookup-failure | 510 + petabox-error | 339 + null-body | 251 + spn2-error | 19 + spn2-error:job-failed | 14 + bad-gzip-encoding | 13 + timeout | 5 + spn2-error:soft-time-limit-exceeded | 4 + invalid-host-resolution | 2 + spn2-error:pending | 1 + (20 rows) + +Top domains/statuses (including success): + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ---------------------------------------+---------------------+-------- + ssl.fao.org | no-pdf-link | 862277 + www.e-periodica.ch | no-pdf-link | 746781 + www.researchgate.net | redirect-loop | 664524 + dlc.library.columbia.edu | no-pdf-link | 493111 + www.die-bonn.de | redirect-loop | 352903 + figshare.com | no-pdf-link | 319709 + statisticaldatasets.data-planet.com | no-pdf-link | 309584 + catalog.paradisec.org.au | redirect-loop | 225396 + zenodo.org | no-capture | 193201 + digi.ub.uni-heidelberg.de | no-pdf-link | 184974 + open.library.ubc.ca | no-pdf-link | 167841 + zenodo.org | no-pdf-link | 130617 + www.google.com | no-pdf-link | 111312 + www.e-manuscripta.ch | no-pdf-link | 79192 + ds.iris.edu | no-pdf-link | 77649 + data.inra.fr | no-pdf-link | 69440 + www.tib.eu | no-pdf-link | 63872 + www.egms.de | redirect-loop | 53877 + archaeologydataservice.ac.uk | redirect-loop | 52838 + 
d.lib.msu.edu | no-pdf-link | 45297 + www.e-rara.ch | no-pdf-link | 45163 + springernature.figshare.com | no-pdf-link | 42527 + boris.unibe.ch | no-pdf-link | 40816 + www.research-collection.ethz.ch | no-capture | 40350 + spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059 + repository.dri.ie | terminal-bad-status | 32760 + othes.univie.ac.at | no-pdf-link | 32558 + repositories.lib.utexas.edu | no-capture | 31526 + posterng.netkey.at | no-pdf-link | 30315 + zenodo.org | terminal-bad-status | 29614 + (30 rows) + diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md new file mode 100644 index 0000000..bce757b --- /dev/null +++ b/notes/ingest/2020-04_unpaywall.md @@ -0,0 +1,129 @@ + +A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but +not released for more than a month). + +Primary goal is: + +- generate ingest requests for only *new* URLs +- bulk ingest these new URLs +- crawl any no-capture URLs from that batch +- re-bulk-ingest the no-capture batch +- analytics on failed ingests. eg, any particular domains that are failing to crawl + +This ingest pipeline was started on 2020-04-07 by bnewbold. + +Ran through the first two steps again on 2020-05-03 after unpaywall had +released another dump (dated 2020-04-27). + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json + => 24.7M 5:17:03 [ 1.3k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => 24.7M + => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0}) + +Second time: + + # in sandcrawler pipenv on aitio + zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json + => 25.2M 3:16:28 [2.14k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390}) + + +## Dump new URLs and Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json'; + => 3696189 + + cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Second time: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json'; + => 1799760 + + cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Dump no-capture, Run Crawl + +Make two ingest request dumps: one with "all" URLs, which we will have heritrix +attempt to crawl, and then one with certain domains filtered out, which we may +or may not bother trying to ingest (due to expectation of failure). + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json'; + => 2734145 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json'; + => 2602408 + +Not actually a very significant size difference after all. + +See `journal-crawls` repo for details on seedlist generation and crawling. + +## Re-Ingest Post-Crawl + +Test small batch: + + zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Run the whole batch: + + zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md new file mode 100644 index 0000000..4cfd8d5 --- /dev/null +++ b/notes/ingest/2020-05_oai_pmh.md @@ -0,0 +1,125 @@ + +Primary Goal: start large crawl of OAI landing pages that we haven't seen + +Fields of interest for ingest: +- oai identifer +- doi +- formats +- urls (maybe also "relations") +- types (type+stage) + +## Other Tasks + +About 150 million total lines. 
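To eyeball the record shape and re-check that total (a sketch):

    zstdcat oai.ndjson.zst | head -n1 | jq .
    zstdcat oai.ndjson.zst | pv -l | wc -l
    => about 150M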
+ +Types coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt + +Dump all ISSNs, with counts, quick check how many are in chocula/fatcat + + zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt + +Language coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt + +Format coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt + => 150M 0:56:14 [44.7k/s] + +Have a DOI? + + zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l + => 16,013,503 + + zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt + => 11,940,950 + +## Transform, Load, Bulk Ingest + + zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz + => 80M 6:36:55 [3.36k/s] + + time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request - + => 80M 4:00:21 [5.55k/s] + => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963}) + + => real 240m21.207s + => user 85m12.576s + => sys 3m29.580s + + select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai'; + => 51,185,088 + +Why so many (30 million) skipped? Not unique? + + zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l + => 51,185,088 + + zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt + wc -l request_url.txt + => 50,002,674 request_url.txt + + zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt + wc -l requires_oai.txt + => 34,622,083 requires_oai.txt + +Yup, tons of duplication. And remember this is exact URL, not SURT or similar. + +How many of these are URLs we have seen and ingested already? 
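+
+The status query below answers that directly. As a side note, a tighter
+duplication estimate than exact-URL matching could come from normalizing to
+SURT form first; a rough sketch, assuming the `surt` python package is
+available in the pipenv (malformed URLs may need filtering first):
+
+    zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | python3 -c 'import sys; from surt import surt; [print(surt(l.strip())) for l in sys.stdin]' | sort -u -S 4G | wc -l
+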
+ + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + | 49491452 + success | 1469113 + no-capture | 134611 + redirect-loop | 59666 + no-pdf-link | 8947 + cdx-error | 7561 + terminal-bad-status | 6704 + null-body | 5042 + wrong-mimetype | 879 + wayback-error | 722 + petabox-error | 198 + gateway-timeout | 86 + link-loop | 51 + invalid-host-resolution | 24 + spn2-cdx-lookup-failure | 22 + spn2-error | 4 + bad-gzip-encoding | 4 + spn2-error:job-failed | 2 + (18 rows) + +Dump ingest requests: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/oai_noingest_20200506.requests.json'; + => COPY 49491452 + + cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/20200114_bulk_ingests.md b/notes/ingest/20200114_bulk_ingests.md deleted file mode 100644 index 9d05cda..0000000 --- a/notes/ingest/20200114_bulk_ingests.md +++ /dev/null @@ -1,26 +0,0 @@ - -Generate ingest requests from arabesque: - - zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json - - zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json - - -Quick tests locally: - - time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json - time head -n100 /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json - -These are all wayback success; looking good! Single threaded, from home laptop -(over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even -with 30x parallelism. Should re-test on actual server. GROBID pre-check should -help? - -With new bulk topic: - - head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Ok, let them rip: - - cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 -- cgit v1.2.3