From 5dd8785d710cf7d067afdc691069bfa74406e06a Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 26 May 2020 14:47:17 -0700
Subject: ingests: normalize file names; commit updates

---
 notes/ingest/2020-02_unpaywall.md | 624 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 624 insertions(+)
 create mode 100644 notes/ingest/2020-02_unpaywall.md

(limited to 'notes/ingest/2020-02_unpaywall.md')

diff --git a/notes/ingest/2020-02_unpaywall.md b/notes/ingest/2020-02_unpaywall.md
new file mode 100644
index 0000000..e18a2ff
--- /dev/null
+++ b/notes/ingest/2020-02_unpaywall.md
@@ -0,0 +1,624 @@
+
+## Stats and Things
+
+    zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt
+
+## Transform
+
+    zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null
+    => 22M 1:31:25 [   4k/s]
+
+Shard it into batches of roughly 1 million (all are 1098096 +/- 1):
+
+    zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json
+
+Test ingest:
+
+    head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Add a single batch like:
+
+    cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Progress/Status
+
+There are 21,961,928 lines total, in batches of 1,098,097.
+
+    unpaywall_snapshot_2019-11-22.ingest_request.split_00.json
+        => 2020-02-24 21:05 local: 1,097,523    ~22 results/sec (combined)
+        => 2020-02-25 10:35 local: 0
+    unpaywall_snapshot_2019-11-22.ingest_request.split_01.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_02.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_03.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_04.json
+        => 2020-02-25 11:26 local: 4,388,997
+        => 2020-02-25 10:14 local: 1,115,821
+        => 2020-02-26 16:00 local:   265,116
+    unpaywall_snapshot_2019-11-22.ingest_request.split_05.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_06.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_07.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_08.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_09.json
+        => 2020-02-26 16:01 local: 6,843,708
+        => 2020-02-26 16:31 local: 4,839,618
+        => 2020-02-28 10:30 local: 2,619,319
+    unpaywall_snapshot_2019-11-22.ingest_request.split_10.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_11.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_12.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_13.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_14.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_15.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_16.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_17.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_18.json
+    unpaywall_snapshot_2019-11-22.ingest_request.split_19.json
+        => 2020-02-28 10:50 local: 13,551,887
+        => 2020-03-01 23:38 local:  4,521,076
+        => 2020-03-02 10:45 local:  2,827,071
+        => 2020-03-02 21:06 local:  1,257,176
+    added about 500k bulk re-ingest to try and work around cdx errors
+        => 2020-03-02 21:30 local:  1,733,654
+
+## Investigate Failures
+
+Guessing than some domains are ultimately going to need direct "recrawl" via
+SPNv2.
+
+    -- top domain failures for unpaywall GWB history ingest
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.ingest_type,
+            ingest_file_result.status,
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE 
+            ingest_file_result.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+    ) t1
+    WHERE t1.domain != ''
+        AND t1.status != 'success'
+        AND t1.status != 'no-capture'
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+                  domain               |       status        | count  
+    -----------------------------------+---------------------+--------
+     watermark.silverchair.com         | terminal-bad-status | 258432
+     www.tandfonline.com               | no-pdf-link         | 203873
+     journals.sagepub.com              | no-pdf-link         | 126317
+     iopscience.iop.org                | terminal-bad-status | 112526
+     files-journal-api.frontiersin.org | terminal-bad-status | 112499
+     pubs.acs.org                      | no-pdf-link         |  94772
+     www.degruyter.com                 | redirect-loop       |  89801
+     www.ahajournals.org               | no-pdf-link         |  84025
+     society.kisti.re.kr               | no-pdf-link         |  72849
+     www.nature.com                    | redirect-loop       |  53575
+     babel.hathitrust.org              | terminal-bad-status |  41063
+     www.ncbi.nlm.nih.gov              | redirect-loop       |  40363
+     scialert.net                      | no-pdf-link         |  38340
+     www.degruyter.com                 | terminal-bad-status |  34913
+     www.journal.csj.jp                | no-pdf-link         |  30881
+     espace.library.uq.edu.au          | redirect-loop       |  24570
+     www.jci.org                       | redirect-loop       |  24409
+     aip.scitation.org                 | wrong-mimetype      |  22144
+     www.vr-elibrary.de                | no-pdf-link         |  17436
+     www.biorxiv.org                   | wrong-mimetype      |  15524
+     ajph.aphapublications.org         | no-pdf-link         |  15083
+     zookeys.pensoft.net               | redirect-loop       |  14867
+     dialnet.unirioja.es               | redirect-loop       |  14486
+     asa.scitation.org                 | wrong-mimetype      |  14261
+     www.nrcresearchpress.com          | no-pdf-link         |  14254
+     dl.acm.org                        | redirect-loop       |  14223
+     osf.io                            | redirect-loop       |  14103
+     www.oecd-ilibrary.org             | redirect-loop       |  12835
+     journals.sagepub.com              | redirect-loop       |  12229
+     iopscience.iop.org                | redirect-loop       |  11825
+    (30 rows)
+
+    -- top no-capture terminal domains
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.ingest_type,
+            ingest_file_result.status,
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE 
+            ingest_file_result.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+    ) t1
+    WHERE t1.domain != ''
+        AND t1.status = 'no-capture'
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+    => very few from any domain, interesting. Guess many of these are URLs that have truely never been crawled
+
+    -- top no-capture base domains
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.ingest_type,
+            ingest_file_result.status,
+            substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE 
+            ingest_file_result.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+    ) t1
+    WHERE t1.domain != ''
+        AND t1.status = 'no-capture'
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+                domain            |   status   | count  
+    ------------------------------+------------+--------
+     academic.oup.com             | no-capture | 429888
+     www.nature.com               | no-capture | 273825
+     dergipark.org.tr             | no-capture | 119847
+     www.biodiversitylibrary.org  | no-capture | 110220
+     escholarship.org             | no-capture | 106307
+     onlinelibrary.wiley.com      | no-capture |  89771
+     journals.sagepub.com         | no-capture |  79297
+     www.cell.com                 | no-capture |  64242
+     deepblue.lib.umich.edu       | no-capture |  58080
+     babel.hathitrust.org         | no-capture |  52286
+     hal.archives-ouvertes.fr     | no-capture |  48549
+     iopscience.iop.org           | no-capture |  42591
+     dash.harvard.edu             | no-capture |  40767
+     www.tandfonline.com          | no-capture |  40638
+     discovery.ucl.ac.uk          | no-capture |  40633
+     www.jstage.jst.go.jp         | no-capture |  39780
+     www.doiserbia.nb.rs          | no-capture |  39261
+     dspace.mit.edu               | no-capture |  37703
+     zookeys.pensoft.net          | no-capture |  34562
+     repositorio.unesp.br         | no-capture |  34437
+     ashpublications.org          | no-capture |  34112
+     www.cambridge.org            | no-capture |  33959
+     kclpure.kcl.ac.uk            | no-capture |  31455
+     society.kisti.re.kr          | no-capture |  30427
+     pure.mpg.de                  | no-capture |  27650
+     download.atlantis-press.com  | no-capture |  27253
+     dialnet.unirioja.es          | no-capture |  26886
+     link.springer.com            | no-capture |  26257
+     www.valueinhealthjournal.com | no-capture |  24798
+     dspace.library.uu.nl         | no-capture |  23234
+    (30 rows)
+
+    -- top no-capture base domains
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.ingest_type,
+            ingest_file_result.status,
+            substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE 
+            ingest_file_result.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+    ) t1
+    WHERE t1.domain != ''
+        AND t1.status = 'no-capture'
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+                domain            |   status   | count
+    ------------------------------+------------+--------
+     academic.oup.com             | no-capture | 429888
+     www.nature.com               | no-capture | 273825
+     dergipark.org.tr             | no-capture | 119847
+     www.biodiversitylibrary.org  | no-capture | 110220
+     escholarship.org             | no-capture | 106307
+     onlinelibrary.wiley.com      | no-capture |  89771
+     journals.sagepub.com         | no-capture |  79297
+     www.cell.com                 | no-capture |  64242
+     deepblue.lib.umich.edu       | no-capture |  58080
+     babel.hathitrust.org         | no-capture |  52286
+     hal.archives-ouvertes.fr     | no-capture |  48549
+     iopscience.iop.org           | no-capture |  42591
+     dash.harvard.edu             | no-capture |  40767
+     www.tandfonline.com          | no-capture |  40638
+     discovery.ucl.ac.uk          | no-capture |  40633
+     www.jstage.jst.go.jp         | no-capture |  39780
+     www.doiserbia.nb.rs          | no-capture |  39261
+     dspace.mit.edu               | no-capture |  37703
+     zookeys.pensoft.net          | no-capture |  34562
+     repositorio.unesp.br         | no-capture |  34437
+     ashpublications.org          | no-capture |  34112
+     www.cambridge.org            | no-capture |  33959
+     kclpure.kcl.ac.uk            | no-capture |  31455
+     society.kisti.re.kr          | no-capture |  30427
+     pure.mpg.de                  | no-capture |  27650
+     download.atlantis-press.com  | no-capture |  27253
+     dialnet.unirioja.es          | no-capture |  26886
+     link.springer.com            | no-capture |  26257
+     www.valueinhealthjournal.com | no-capture |  24798
+     dspace.library.uu.nl         | no-capture |  23234
+    (30 rows)
+
+    -- how many ingest requests not crawled at all?
+    SELECT count(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+        AND ingest_file_result.status IS NULL;
+    => 0
+
+    -- "cookie absent" terminal pages, by domain
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.ingest_type,
+            ingest_file_result.status,
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE 
+            ingest_file_result.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+    ) t1
+    WHERE t1.domain != ''
+        AND t1.status != 'success'
+        AND t1.status != 'no-capture'
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+                 domain             |     status     | count  
+    --------------------------------+----------------+--------
+     journals.sagepub.com           | no-pdf-link    | 126295
+     www.tandfonline.com            | no-pdf-link    | 116690
+     pubs.acs.org                   | no-pdf-link    |  94619
+     www.ahajournals.org            | no-pdf-link    |  84016
+     www.journal.csj.jp             | no-pdf-link    |  30881
+     aip.scitation.org              | wrong-mimetype |  22143
+     www.vr-elibrary.de             | no-pdf-link    |  17436
+     ajph.aphapublications.org      | no-pdf-link    |  15080
+     asa.scitation.org              | wrong-mimetype |  14261
+     www.nrcresearchpress.com       | no-pdf-link    |  14253
+     journals.ametsoc.org           | no-pdf-link    |  10500
+     www.journals.uchicago.edu      | no-pdf-link    |   6917
+     www.icevirtuallibrary.com      | no-pdf-link    |   6484
+     www.journals.uchicago.edu      | wrong-mimetype |   6191
+     www.healthaffairs.org          | no-pdf-link    |   5732
+     pubsonline.informs.org         | no-pdf-link    |   5672
+     pinnacle-secure.allenpress.com | no-pdf-link    |   5013
+     www.worldscientific.com        | no-pdf-link    |   4560
+     www.ajronline.org              | wrong-mimetype |   4523
+     ehp.niehs.nih.gov              | no-pdf-link    |   4514
+     www.future-science.com         | no-pdf-link    |   4091
+     pubs.acs.org                   | wrong-mimetype |   4015
+     aip.scitation.org              | no-pdf-link    |   3916
+     www.futuremedicine.com         | no-pdf-link    |   3821
+     asa.scitation.org              | no-pdf-link    |   3644
+     www.liebertpub.com             | no-pdf-link    |   3345
+     physicstoday.scitation.org     | no-pdf-link    |   3005
+     pubs.cif-ifc.org               | no-pdf-link    |   2761
+     epubs.siam.org                 | wrong-mimetype |   2583
+     www.ajronline.org              | no-pdf-link    |   2563
+    (30 rows)
+
+    -- "cookie absent" terminal pages, by domain
+    SELECT count(*)
+    FROM ingest_file_result
+    LEFT JOIN ingest_request
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_file_result.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+        AND ingest_file_result.status != 'success'
+        AND ingest_file_result.terminal_url LIKE '%/cookieAbsent';
+
+    => 654885
+
+    -- NOT "cookie absent" terminal page failures, total count
+    SELECT count(*)
+    FROM ingest_file_result
+    LEFT JOIN ingest_request
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_file_result.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+        AND ingest_file_result.status != 'success'
+        AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent';
+
+    => 1403837
+
+Looks like these domains are almost all "cookieAbsent" blocking:
+- journals.sagepub.com
+- pubs.acs.org
+- ahajournals.org
+- www.journal.csj.jp
+- aip.scitation.org
+
+Grab some individual URLs to test:
+
+    SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url
+    FROM ingest_file_result
+    LEFT JOIN ingest_request
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_file_result.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+        AND ingest_file_result.status != 'success'
+        AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+    ORDER BY updated DESC
+    LIMIT 25;
+
+NOT cookieAbsent testing with regular ingest tool:
+- iopscience.iop.org, terminal-bad-status, SPNv2 fetch, success
+- academic.oup.com => silverchair, terminal-bad-status, SPNv2 fetch, succes
+- osf.io success
+
+    SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url
+    FROM ingest_file_result
+    LEFT JOIN ingest_request
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_file_result.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+        AND ingest_file_result.status != 'success'
+        AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+    ORDER BY updated DESC
+    LIMIT 25;
+
+cookieAbsent testing with regular ingest tool:
+- www.tandfonline.com failure (no-pdf-link via wayback), but force-recrawl works
+
+The main distinguisher is status. terminal-bad-status can be ingested (live)
+successfully, while no-pdf-link, redirect-loop, etc need to be re-crawled.
+
+## Heritrix Plan
+
+Generate following ingest request batches:
+
+- no-capture status from unpaywall
+- all other failures except /cookieAbsent
+- /cookieAbsent failures
+
+Plan will be to crawl no-capture first (to completion), then try the other
+non-/cookieAbsent failures. /cookieAbsent means we'll need to use SPNv2.
+
+Because there are so few "no-capture on second hop" cases, will not enqueue
+both terminal urls and base urls, only base urls.
+
+Should definitely skip/filter:
+
+- www.ncbi.nlm.nih.gov
+
+## Ingest Request Export
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND ingest_file_result.status = 'no-capture'
+    ) TO '/grande/snapshots/unpaywall_nocapture_20200304.rows.json';
+    => 4,855,142
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND ingest_file_result.status != 'success'
+            AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+    ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json';
+    => 1,403,837
+
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200304.rows.json > unpaywall_nocapture_20200304.json
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json > unpaywall_fail_nocookie_20200304.json
+
+Note: will probably end up re-running the below after crawling+ingesting the above:
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND ingest_file_result.status != 'success'
+            AND ingest_file_result.status = 'terminal-bad-status'
+            AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+    ) TO '/grande/snapshots/unpaywall_fail_cookie_badstatus_20200304.rows.json';
+    => 0
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND ingest_file_result.status != 'success'
+            AND ingest_file_result.status != 'terminal-bad-status'
+            AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+    ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json';
+    => 654,885
+
+## Batch Ingest
+
+Test small batch:
+
+    head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full batch:
+
+    cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+    # there was a broken line in there, so...
+    # parse error: Expected separator between values at line 1367873, column 175
+    # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null
+    tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Note that the crawl is not entirely complete and not all CDX seem to have been
+loaded, so may need to iterate. About 10% are still "no capture". May want or
+need to additionally crawl the terminal URLs, not the base URLs.
+
+## Post-ingest stats
+
+Overall status:
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+             status          |  count
+    -------------------------+----------
+     success                 | 17354494
+     no-pdf-link             |  1471076
+     no-capture              |  1135992
+     redirect-loop           |   837842
+     terminal-bad-status     |   803081
+     cdx-error               |   219746
+     wrong-mimetype          |   100723
+     link-loop               |    16013
+     wayback-error           |    12448
+     null-body               |     9444
+     redirects-exceeded      |      600
+     petabox-error           |      411
+     bad-redirect            |       17
+     bad-gzip-encoding       |        4
+     spn2-cdx-lookup-failure |        3
+     gateway-timeout         |        1
+     spn2-error:job-failed   |        1
+     spn2-error              |        1
+    (18 rows)
+
+Failures by domain:
+
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.ingest_type,
+            ingest_file_result.status,
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE 
+            ingest_file_result.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+    ) t1
+    WHERE t1.domain != ''
+        AND t1.status != 'success'
+        AND t1.status != 'no-capture'
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+                  domain               |       status        | count
+    -----------------------------------+---------------------+--------
+     academic.oup.com                  | no-pdf-link         | 330211
+     watermark.silverchair.com         | terminal-bad-status | 324599
+     www.tandfonline.com               | no-pdf-link         | 242724
+     journals.sagepub.com              | no-pdf-link         | 202050
+     iopscience.iop.org                | terminal-bad-status | 144063
+     files-journal-api.frontiersin.org | terminal-bad-status | 121719
+     pubs.acs.org                      | no-pdf-link         | 104535
+     www.ahajournals.org               | no-pdf-link         | 102653
+     society.kisti.re.kr               | no-pdf-link         | 101787
+     www.degruyter.com                 | redirect-loop       |  95130
+     www.nature.com                    | redirect-loop       |  87534
+     onlinelibrary.wiley.com           | no-pdf-link         |  84432
+     www.cell.com                      | redirect-loop       |  61496
+     www.degruyter.com                 | terminal-bad-status |  42919
+     babel.hathitrust.org              | terminal-bad-status |  41813
+     www.ncbi.nlm.nih.gov              | redirect-loop       |  40488
+     scialert.net                      | no-pdf-link         |  38341
+     ashpublications.org               | no-pdf-link         |  34889
+     dialnet.unirioja.es               | terminal-bad-status |  32076
+     www.journal.csj.jp                | no-pdf-link         |  30881
+     pure.mpg.de                       | redirect-loop       |  26163
+     www.jci.org                       | redirect-loop       |  24701
+     espace.library.uq.edu.au          | redirect-loop       |  24591
+     www.valueinhealthjournal.com      | redirect-loop       |  23740
+     www.vr-elibrary.de                | no-pdf-link         |  23332
+     aip.scitation.org                 | wrong-mimetype      |  22144
+     osf.io                            | redirect-loop       |  18513
+     www.journals.elsevier.com         | no-pdf-link         |  16710
+     www.spandidos-publications.com    | redirect-loop       |  15711
+     www.biorxiv.org                   | wrong-mimetype      |  15513
+    (30 rows)
+
+Dump lists for another iteration of bulk ingest:
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND ingest_file_result.status = 'no-capture'
+    ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json';
+    => 278,876
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND ingest_file_result.status != 'success'
+            AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+    ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json';
+    =>
+
+
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json
+
+    cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
-- 
cgit v1.2.3