diff options
Diffstat (limited to 'notes/ingest/2020-09_oa_doi.md')
-rw-r--r-- | notes/ingest/2020-09_oa_doi.md | 352 |
1 files changed, 352 insertions, 0 deletions
diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md new file mode 100644 index 0000000..f5c853d --- /dev/null +++ b/notes/ingest/2020-09_oa_doi.md @@ -0,0 +1,352 @@ + +It seems that many gold OA DOIs were not ingesting simply because the HTML +url extraction was not working for a particular version of OJS. + +Let's re-try all ~2.5 million of these in bulk mode and see how many are +'no-capture' vs. other errors, then possibly re-crawl a large number. + +## Bulk Ingest + +Dump ingest requests + + ./fatcat_ingest.py query 'is_oa:true preservation:none !arxiv_id:* !pmcid:* !publisher_type:big5 type:article-journal' | pv -l > /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json + Expecting 2569876 release objects in search queries + Counter({'elasticsearch_release': 2569880, 'estimate': 2569880, 'ingest_request': 2063034}) + +Enqueue + + cat /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Started at about: + + Thu Sep 17 00:15:00 UTC 2020 + 2020-09-17T00:15:00Z + +## Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND ingest_file_result.updated >= '2020-09-16' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 513462 + success | 206042 + no-pdf-link | 186779 + terminal-bad-status | 40372 + redirect-loop | 33103 + cdx-error | 24078 + link-loop | 13494 + spn2-cdx-lookup-failure | 10247 + gateway-timeout | 4407 + wrong-mimetype | 3213 + petabox-error | 866 + null-body | 449 + spn2-error | 217 + wayback-error | 129 + spn2-error:job-failed | 
64 + bad-redirect | 6 + spn2-error:soft-time-limit-exceeded | 1 + (17 rows) + +This was only about half the requests. Try... broader? + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 579952 + success | 387325 + no-pdf-link | 380406 + terminal-bad-status | 63743 + redirect-loop | 53893 + cdx-error | 46024 + spn2-cdx-lookup-failure | 28347 + link-loop | 22573 + gateway-timeout | 11686 + wrong-mimetype | 6294 + null-body | 3509 + petabox-error | 2388 + spn2-error | 1023 + spn2-error:job-failed | 462 + wayback-error | 347 + spn2-error:soft-time-limit-exceeded | 20 + bad-redirect | 11 + (17 rows) + +What top domains for those `no-pdf-link` (or similar)? 
+ + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 56488 + figshare.com | no-pdf-link | 55337 + www.egms.de | redirect-loop | 22686 + zenodo.org | terminal-bad-status | 22128 + tandf.figshare.com | no-pdf-link | 20027 + springernature.figshare.com | no-pdf-link | 17181 + cairn.info | terminal-bad-status | 13836 + www.persee.fr | terminal-bad-status | 7565 + projecteuclid.org | link-loop | 7449 + www.cairn.info | no-pdf-link | 6992 + scialert.net | no-pdf-link | 6621 + www.cairn.info | link-loop | 5870 + utpjournals.press | no-pdf-link | 5772 + journals.openedition.org | redirect-loop | 5464 + www.egms.de | no-pdf-link | 5223 + archaeologydataservice.ac.uk | no-pdf-link | 4881 + rs.figshare.com | no-pdf-link | 4773 + www.degruyter.com | spn2-cdx-lookup-failure | 4763 + koreascience.or.kr | no-pdf-link | 4487 + cancerres.aacrjournals.org | no-pdf-link | 4124 + cms.math.ca | no-pdf-link | 3441 + volcano.si.edu | no-pdf-link | 3424 + www.mathnet.ru | no-pdf-link | 3229 + tidsskriftet.no | no-pdf-link | 3012 + journals.plos.org | no-pdf-link | 3005 + 
tudigit.ulb.tu-darmstadt.de | no-pdf-link | 2796 + www.cairn.info:80 | link-loop | 2647 + hammer.figshare.com | no-pdf-link | 2627 + www.psychosocial.com | no-pdf-link | 2457 + osf.io | terminal-bad-status | 2388 + (30 rows) + +Should look at link extraction for: + +- scialert.net +- utpjournals.press +- koreascience.or.kr +- cancerres.aacrjournals.org +- cms.math.ca +- volcano.si.edu +- www.mathnet.ru +- www.psychosocial.com + +## Re-Ingest + +Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + AND ingest_file_result.status = 'no-capture' + -- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + ) TO '/grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json'; + => COPY 579952 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json | pv -l | shuf > /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json + => 579k 0:00:22 [25.9k/s] + + cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Resuming progress on this in early December 2020. 
+ +Filtered requests to re-crawl: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20') + OR (ingest_file_result.updated >= '2020-10-11')) + AND ingest_file_result.status != 'success' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO 
'/grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json'; + => COPY 2352614 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | pv -l > /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json + +And actually dump seedlist(s): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | jq -r .base_url | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + + wc -l /grande/snapshots/oa_doi_seedlist_2020-12-08.*.txt + 2352614 /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + 481910 /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + +Top DOI prefixes (same old usual suspects): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg "://doi.org/" | cut -f4 -d/ | sort | uniq -c | sort -nr | head -n20 + 353695 10.5281 zenodo.org + 121888 10.6084 figshare.org + 115093 10.3917 cairn.info + 113252 10.3406 persee.fr + 95414 10.1515 degruyter.com + 90448 10.4324 taylorfrancis.com + 83927 10.1016 elsevier + 60303 10.1109 IEEE + 48490 10.4000 openedition.org + 28498 10.3205 egms.de + 23433 10.1163 brill.com + 23276 10.17615 cdr.lib.unc.edu + 21386 10.1093 oup.com + 20783 10.3138 utpjournals.press + 19987 10.1201 tandfonline.com + 17916 10.34847 cocoon.huma-num.fr + 16970 10.1002 wiley.com + 15958 10.1097 lww.com (and others?) 
+ 15835 10.1017 cambridge.org + 15466 10.24355 publikationsserver.tu-braunschweig.de (IR) + +Top domains (not doi.org): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg -v "://doi.org/" | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 104148 zenodo.org + 85245 www.persee.fr + 52931 www.cairn.info + 4791 www.jstage.jst.go.jp + 4411 archive.monthlyreview.org + 4129 osf.io + 2841 www.indianjournals.com + 2746 www.impan.pl + 2620 platform.almanhal.com + 2019 www.nomos-elibrary.de + 1209 dergipark.org.tr + 1027 pubs.geoscienceworld.org + 973 www.pdcnet.org + 923 www.hanspub.org + 914 www.repository.cam.ac.uk + 863 mediarep.org + 812 www.cartographicperspectives.org + 687 www.degruyter.com + 578 192.168.7.24 + 566 journals.eco-vector.com + +TODO: infer `publisher_type` and platform from DOI prefix in more cases + +## Re-Ingest + +Crawl has completed. Starting this bulk ingest on 2020-12-31; roughly 2.3 +million requests. Note these are all `pdf` requests, but crawl was done in an +HTML-friendly way, so should be able to do domain/journal-specific HTML ingests +in the future. + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Stats, for this ingest period (fuzzy; will have some daily ingest stuff): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-12-28' + AND ingest_request.created <= '2020-12-09' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -----------------------+-------- + no-pdf-link | 962714 + success | 539305 + no-capture | 306590 + redirect-loop | 192149 + link-loop | 184797 + terminal-bad-status | 141721 + wrong-mimetype | 10362 + null-body | 10277 + skip-url-blocklist | 1985 + wayback-content-error | 1300 + cdx-error | 869 + petabox-error | 160 + bad-redirect | 72 + wayback-error | 46 + bad-gzip-encoding | 7 + timeout | 1 + max-hops-exceeded | 1 + (17 rows) + |