12 files changed, 620 insertions, 81 deletions
diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/2019-10-23_testing.md
index 481c4e2..481c4e2 100644
--- a/notes/ingest/20191023_testing.md
+++ b/notes/ingest/2019-10-23_testing.md
diff --git a/notes/ingest/20200114_bulk_ingests.md b/notes/ingest/2020-01-14_bulk.md
index 9d05cda..9d05cda 100644
--- a/notes/ingest/20200114_bulk_ingests.md
+++ b/notes/ingest/2020-01-14_bulk.md
diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02_unpaywall.md
index e18a2ff..e18a2ff 100644
--- a/notes/ingest/2020-02-14_unpaywall_ingest.md
+++ b/notes/ingest/2020-02_unpaywall.md
diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md
new file mode 100644
index 0000000..73396bd
--- /dev/null
+++ b/notes/ingest/2020-03-oa_but_not_marked.md
@@ -0,0 +1,25 @@
+
+These are large journals with a high fraction "in IA", but which are not marked
+as OA, so we are not crawling them regularly.
+
+TODO: add things like list of unpaywall ISSN / OA status to try and find more
+"practical" / bronze OA
+
+## First Run
+
+https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him
+https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4
+https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4
+https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e
+https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm
+https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe
+
+## TODO
+
+https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible)
+https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?)
+
+https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link?
+https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA?
+https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken?
+https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03_mag.md
index 97594c8..428ce05 100644
--- a/notes/ingest/2020-03-04_mag.md
+++ b/notes/ingest/2020-03_mag.md
@@ -406,3 +406,171 @@ Full run:
 
     2020-04-07 12:19 (pacific): 11,703,871
 
+## Post-bulk-ingest
+
+Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need
+to re-try things like cdx-error.
+
+Current status:
+
+                status             |  count
+    -------------------------------+----------
+     success                       | 18491799
+     redirect-loop                 |  1968530
+     no-capture                    |  1373657
+     no-pdf-link                   |  1311842
+     link-loop                     |  1296439
+     terminal-bad-status           |   627577
+     cdx-error                     |   418278
+     wrong-mimetype                |    50141
+     wayback-error                 |    37159
+     petabox-error                 |    11249
+     null-body                     |     6295
+     gateway-timeout               |     3051
+     spn2-cdx-lookup-failure       |      328
+     spn2-error:invalid-url-syntax |       93
+     bad-redirect                  |       75
+                                   |       47
+     invalid-host-resolution       |       28
+     spn2-error                    |       10
+     bad-gzip-encoding             |        7
+     redirects-exceeded            |        2
+    (20 rows)
+
+Lots of cdx-error to retry.
+
+The no-capture links are probably a mix of domain-blocklist and things that
+failed in bulk mode. Will dump and re-attempt them:
+
+
+    COPY (  
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'mag'
+            AND ingest_file_result.status = 'no-capture'
+            AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+            AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+            AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+            AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+            AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+            AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+            AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+    ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json';
+    => 859849
+
+What domains are these?
+
+    cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30
+
+Let's filter down more:
+
+    cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json
+
+    wc -l mag_nocapture_20200420.rows.filtered.json
+    423085 mag_nocapture_20200420.rows.filtered.json
+
+Ok, enqueue!
+
+    cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## Final Stats
+
+... for this round of ingest:
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'mag'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+
+                   status                |  count
+    -------------------------------------+----------
+     success                             | 18712849
+     redirect-loop                       |  2008110
+     no-pdf-link                         |  1337012
+     link-loop                           |  1326761
+     no-capture                          |  1030693
+     terminal-bad-status                 |   637143
+     gateway-timeout                     |   193194
+     cdx-error                           |   125907
+     spn2-cdx-lookup-failure             |    77842
+     wrong-mimetype                      |    50882
+     wayback-error                       |    40278
+     invalid-host-resolution             |    35201
+     petabox-error                       |    11254
+     null-body                           |     6485
+     spn2-error                          |     1643
+     spn2-error:job-failed               |      747
+     spn2-error:invalid-url-syntax       |      325
+     spn2-error:soft-time-limit-exceeded |      190
+     bad-redirect                        |       77
+                                         |       47
+    (20 rows)
+
+Failures by domain:
+
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.ingest_type,
+            ingest_file_result.status,
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE 
+            ingest_file_result.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'mag'
+    ) t1
+    WHERE t1.domain != ''
+        AND t1.status != 'success'
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+
+                 domain              |       status        | count
+    ---------------------------------+---------------------+--------
+     ieeexplore.ieee.org             | redirect-loop       | 677712
+     cyberleninka.ru                 | link-loop           | 308390
+     papers.ssrn.com                 | link-loop           | 281804
+     ieeexplore.ieee.org             | link-loop           | 273559
+     dialnet.unirioja.es             | redirect-loop       | 240504
+     dialnet.unirioja.es             | terminal-bad-status | 232481
+     onlinelibrary.wiley.com         | no-pdf-link         | 220932
+     iopscience.iop.org              | terminal-bad-status | 172480
+     validate.perfdrive.com          | no-pdf-link         | 172312
+     link.springer.com               | redirect-loop       | 130398
+     agupubs.onlinelibrary.wiley.com | no-pdf-link         | 113382
+     iopscience.iop.org              | redirect-loop       | 105234
+     www.bmj.com                     | link-loop           | 100354
+     www.researchgate.net            | redirect-loop       |  84366
+     www.cambridge.org               | link-loop           |  83171
+     jamanetwork.com                 | no-pdf-link         |  75053
+     febs.onlinelibrary.wiley.com    | no-pdf-link         |  74872
+     www.jstor.org                   | redirect-loop       |  72059
+     journals.sagepub.com            | no-pdf-link         |  63028
+     science.sciencemag.org          | redirect-loop       |  62927
+     profile.thieme.de               | no-pdf-link         |  62406
+     cyberleninka.ru                 | redirect-loop       |  56733
+     link.springer.com               | link-loop           |  47608
+     physoc.onlinelibrary.wiley.com  | no-pdf-link         |  30180
+     science.sciencemag.org          | link-loop           |  29908
+     papers.ssrn.com                 | redirect-loop       |  27255
+     obgyn.onlinelibrary.wiley.com   | no-pdf-link         |  26789
+     www.computer.org                | no-pdf-link         |  26444
+     watermark.silverchair.com       | terminal-bad-status |  25934
+     www.nature.com                  | redirect-loop       |  25306
+    (30 rows)
diff --git a/notes/ingest/2020-03_s2_ingest.md b/notes/ingest/2020-03_s2.md
index fedaba0..fedaba0 100644
--- a/notes/ingest/2020-03_s2_ingest.md
+++ b/notes/ingest/2020-03_s2.md
diff --git a/notes/ingest/2020-04-07_datacite.md b/notes/ingest/2020-04-07_datacite.md
deleted file mode 100644
index b0217f0..0000000
--- a/notes/ingest/2020-04-07_datacite.md
+++ /dev/null
@@ -1,18 +0,0 @@
-
-After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many
-of the DOIs are for, eg, datasets, and don't want to waste time on those.
-
-Instead of using full ingest request file from the crawl, will generate a new
-ingest request file using `fatcat_ingest.py` and set that up for bulk crawling.
-
-## Generate Requests
-
-    ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json
-    => Expecting 8905453 release objects in search queries
-    => 8.91M 11:49:50 [ 209 /s]
-    => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453})
-
-## Bulk Ingest
-
-    cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
-
diff --git a/notes/ingest/2020-04-07_unpaywall.md b/notes/ingest/2020-04-07_unpaywall.md
deleted file mode 100644
index e30d482..0000000
--- a/notes/ingest/2020-04-07_unpaywall.md
+++ /dev/null
@@ -1,63 +0,0 @@
-
-A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but
-not released for more than a month).
-
-Primary goal is:
-
-- generate ingest requests for only *new* URLs
-- bulk ingest these new URLs
-- crawl any no-capture URLs from that batch
-- re-bulk-ingest the no-capture batch
-- analytics on failed ingests. eg, any particular domains that are failing to crawl
-
-This ingest pipeline was started on 2020-04-07 by bnewbold.
-
-## Transform and Load
-
-    # in sandcrawler pipenv on aitio
-    zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
-    => 24.7M 5:17:03 [ 1.3k/s]
-
-    cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
-    => 24.7M
-    => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
-
-## Dump new URLs and Bulk Ingest
-
-    COPY (
-        SELECT row_to_json(ingest_request.*)
-        FROM ingest_request
-        LEFT JOIN ingest_file_result
-            ON ingest_file_result.ingest_type = ingest_request.ingest_type
-            AND ingest_file_result.base_url = ingest_request.base_url
-        WHERE
-            ingest_request.ingest_type = 'pdf'
-            AND ingest_request.link_source = 'unpaywall'
-            AND date(ingest_request.created) > '2020-04-01'
-            AND ingest_file_result.status IS NULL
-    ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
-    => 3696189
-
-    cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
-
-## Dump no-capture
-
-    COPY (
-        SELECT row_to_json(ingest_request.*)
-        FROM ingest_request
-        LEFT JOIN ingest_file_result
-            ON ingest_file_result.ingest_type = ingest_request.ingest_type
-            AND ingest_file_result.base_url = ingest_request.base_url
-        WHERE
-            ingest_request.ingest_type = 'pdf'
-            AND ingest_request.link_source = 'unpaywall'
-            AND date(ingest_request.created) > '2020-04-01'
-            AND ingest_file_result.status = 'no-capture'
-            AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
-            AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
-            AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
-            AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
-            AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
-            AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
-            AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
-    ) TO '/grande/snapshots/unpaywall_nocapture_2020-04-XX.rows.json';
diff --git a/notes/ingest/2020-04_datacite.md b/notes/ingest/2020-04_datacite.md
new file mode 100644
index 0000000..0fc7e67
--- /dev/null
+++ b/notes/ingest/2020-04_datacite.md
@@ -0,0 +1,121 @@
+
+After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many
+of the DOIs are for, eg, datasets, and don't want to waste time on those.
+
+Instead of using full ingest request file from the crawl, will generate a new
+ingest request file using `fatcat_ingest.py` and set that up for bulk crawling.
+
+## Generate Requests
+
+    ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json
+    => Expecting 8905453 release objects in search queries
+    => 8.91M 11:49:50 [ 209 /s]
+    => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453})
+
+## Bulk Ingest
+
+    cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Ingest Stats
+
+Note that this will have a small fraction of non-datacite results mixed in (eg,
+from COVID-19 targeted crawls):
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'doi'
+        AND ingest_request.ingest_type = 'pdf'
+        AND ingest_request.ingest_request_source = 'fatcat-ingest'
+        AND created >= '2020-04-07'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+                   status                |  count
+    -------------------------------------+---------
+     no-pdf-link                         | 4646767
+     redirect-loop                       | 1447229
+     no-capture                          |  860235
+     success                             |  849501
+     terminal-bad-status                 |  174869
+     cdx-error                           |  159805
+     wayback-error                       |   18076
+     wrong-mimetype                      |   11169
+     link-loop                           |    8410
+     gateway-timeout                     |    4034
+     spn2-cdx-lookup-failure             |     510
+     petabox-error                       |     339
+     null-body                           |     251
+     spn2-error                          |      19
+     spn2-error:job-failed               |      14
+     bad-gzip-encoding                   |      13
+     timeout                             |       5
+     spn2-error:soft-time-limit-exceeded |       4
+     invalid-host-resolution             |       2
+     spn2-error:pending                  |       1
+    (20 rows)
+
+Top domains/statuses (excluding success):
+
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.ingest_type,
+            ingest_file_result.status,
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE 
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'doi'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.ingest_request_source = 'fatcat-ingest'
+            AND created >= '2020-04-07'
+    ) t1
+    WHERE t1.domain != ''
+        AND t1.status != 'success'
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+                    domain                 |       status        | count
+    ---------------------------------------+---------------------+--------
+     ssl.fao.org                           | no-pdf-link         | 862277
+     www.e-periodica.ch                    | no-pdf-link         | 746781
+     www.researchgate.net                  | redirect-loop       | 664524
+     dlc.library.columbia.edu              | no-pdf-link         | 493111
+     www.die-bonn.de                       | redirect-loop       | 352903
+     figshare.com                          | no-pdf-link         | 319709
+     statisticaldatasets.data-planet.com   | no-pdf-link         | 309584
+     catalog.paradisec.org.au              | redirect-loop       | 225396
+     zenodo.org                            | no-capture          | 193201
+     digi.ub.uni-heidelberg.de             | no-pdf-link         | 184974
+     open.library.ubc.ca                   | no-pdf-link         | 167841
+     zenodo.org                            | no-pdf-link         | 130617
+     www.google.com                        | no-pdf-link         | 111312
+     www.e-manuscripta.ch                  | no-pdf-link         |  79192
+     ds.iris.edu                           | no-pdf-link         |  77649
+     data.inra.fr                          | no-pdf-link         |  69440
+     www.tib.eu                            | no-pdf-link         |  63872
+     www.egms.de                           | redirect-loop       |  53877
+     archaeologydataservice.ac.uk          | redirect-loop       |  52838
+     d.lib.msu.edu                         | no-pdf-link         |  45297
+     www.e-rara.ch                         | no-pdf-link         |  45163
+     springernature.figshare.com           | no-pdf-link         |  42527
+     boris.unibe.ch                        | no-pdf-link         |  40816
+     www.research-collection.ethz.ch       | no-capture          |  40350
+     spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link         |  33059
+     repository.dri.ie                     | terminal-bad-status |  32760
+     othes.univie.ac.at                    | no-pdf-link         |  32558
+     repositories.lib.utexas.edu           | no-capture          |  31526
+     posterng.netkey.at                    | no-pdf-link         |  30315
+     zenodo.org                            | terminal-bad-status |  29614
+    (30 rows)
+
diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md
new file mode 100644
index 0000000..bce757b
--- /dev/null
+++ b/notes/ingest/2020-04_unpaywall.md
@@ -0,0 +1,129 @@
+
+A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but
+not released for more than a month).
+
+Primary goal is:
+
+- generate ingest requests for only *new* URLs
+- bulk ingest these new URLs
+- crawl any no-capture URLs from that batch
+- re-bulk-ingest the no-capture batch
+- analytics on failed ingests. eg, any particular domains that are failing to crawl
+
+This ingest pipeline was started on 2020-04-07 by bnewbold.
+
+Ran through the first two steps again on 2020-05-03 after unpaywall had
+released another dump (dated 2020-04-27).
+
+## Transform and Load
+
+    # in sandcrawler pipenv on aitio
+    zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
+    => 24.7M 5:17:03 [ 1.3k/s]
+
+    cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+    => 24.7M
+    => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
+
+Second time:
+
+    # in sandcrawler pipenv on aitio
+    zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json
+    => 25.2M 3:16:28 [2.14k/s]
+
+    cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+    => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0})
+    => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390})
+
+
+## Dump new URLs and Bulk Ingest
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND date(ingest_request.created) > '2020-04-01'
+            AND ingest_file_result.status IS NULL
+    ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
+    => 3696189
+
+    cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Second time:
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND date(ingest_request.created) > '2020-05-01'
+            AND ingest_file_result.status IS NULL
+    ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json';
+    => 1799760
+
+    cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Dump no-capture, Run Crawl
+
+Make two ingest request dumps: one with "all" URLs, which we will have heritrix
+attempt to crawl, and then one with certain domains filtered out, which we may
+or may not bother trying to ingest (due to expectation of failure).
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND date(ingest_request.created) > '2020-04-01'
+            AND ingest_file_result.status = 'no-capture'
+    ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json';
+    => 2734145
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND date(ingest_request.created) > '2020-04-01'
+            AND ingest_file_result.status = 'no-capture'
+            AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+            AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+            AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+            AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+            AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+            AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+            AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+    ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+    => 2602408
+
+Not actually a very significant size difference after all.
+
+See `journal-crawls` repo for details on seedlist generation and crawling.
+
+## Re-Ingest Post-Crawl
+
+Test small batch:
+
+    zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Run the whole batch:
+
+    zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..4cfd8d5
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,125 @@
+
+Primary Goal: start large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest:
+- oai identifier
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, quick check how many are in chocula/fatcat
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+    => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+    zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+    => 16,013,503
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+    => 11,940,950
+
+## Transform, Load, Bulk Ingest
+
+    zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+    => 80M 6:36:55 [3.36k/s]
+
+    time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+    => 80M 4:00:21 [5.55k/s]
+    => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+    => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+    => real    240m21.207s
+    => user    85m12.576s
+    => sys     3m29.580s
+
+    select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+    => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+    zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+    => 51,185,088
+
+    zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+    wc -l request_url.txt
+    => 50,002,674 request_url.txt
+
+    zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+    wc -l requires_oai.txt
+    => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
+How many of these are URLs we have seen and ingested already?
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'oai'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+             status          |  count
+    -------------------------+----------
+                             | 49491452
+     success                 |  1469113
+     no-capture              |   134611
+     redirect-loop           |    59666
+     no-pdf-link             |     8947
+     cdx-error               |     7561
+     terminal-bad-status     |     6704
+     null-body               |     5042
+     wrong-mimetype          |      879
+     wayback-error           |      722
+     petabox-error           |      198
+     gateway-timeout         |       86
+     link-loop               |       51
+     invalid-host-resolution |       24
+     spn2-cdx-lookup-failure |       22
+     spn2-error              |        4
+     bad-gzip-encoding       |        4
+     spn2-error:job-failed   |        2
+    (18 rows)
+
+Dump ingest requests:
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'oai'
+            AND date(ingest_request.created) > '2020-05-01'
+            AND ingest_file_result.status IS NULL
+    ) TO '/grande/snapshots/oai_noingest_20200506.requests.json';
+    => COPY 49491452
+
+    cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/NEXT.md b/notes/ingest/NEXT.md
new file mode 100644
index 0000000..8cdd6df
--- /dev/null
+++ b/notes/ingest/NEXT.md
@@ -0,0 +1,52 @@
+
+biorxiv
+medrxiv
+    doi:10.1101\/20*
+
+persee.fr 147k
+    publisher:persee in_ia:false is_oa:true
+    https://www.persee.fr/doc/pumus_1164-5385_1992_num_2_1_1013
+
+cairn.info: 161k
+    doi_prefix:10.3917 in_ia:false is_oa:true
+    https://www.cairn.info/revue-afrique-contemporaine-2011-3-page-161.htm
+    https://www.cairn.info/revue-cahiers-de-psychologie-clinique-2014-1-page-209.htm
+
+IOP OA: 169k
+    doi_prefix:10.1088 is_oa:true in_ia:false
+
+indian journals platform? 124k
+    doi_prefix:10.4103 in_ia:false is_oa:true
+    http://www.urologyannals.com/article.asp?issn=0974-7796;year=2011;volume=3;issue=3;spage=138;epage=140;aulast=Ahmad
+    http://www.neurologyindia.com/article.asp?issn=0028-3886;year=2011;volume=59;issue=4;spage=612;epage=615;aulast=Utsuki
+
+openedition? 48k
+    doi_prefix:10.4000 is_oa:true in_ia:false
+
+german medical science (GMS) 28k
+    doi_prefix:10.3205 in_ia:false is_oa:true
+    https://www.egms.de/static/en/journals/zma/2015-32/zma000965.shtml
+
+serbian chemistry 28k
+    doi_prefix:10.2298 in_ia:false is_oa:true
+    http://www.doiserbia.nb.rs/Article.aspx?ID=0352-51391000105H
+
+jalc oa doi: 82k
+    doi_registrar:jalc in_ia:false is_oa:true
+
+sage OA papers
+    https://journals.sagepub.com/doi/10.1177/034003529802400510
+
+Scientific Reports: 25k
+    in_ia:false container_id:"tnqhc2x2aneavcd3gx5h7mswhm"
+
+U Toronto press: 23k
+    publisher:"Toronto Press" in_ia:false is_oa:true
+    has an annoying bounce page
+
+ASHA (speech-language-hearing association): 7k
+    publisher:Speech-Language-Hearing in_ia:false is_oa:true
+
+MIT press journals
+
+