From 6de98c221cc9fe1e5410c52a08b1d3b7470cd6ea Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 28 Apr 2020 10:33:28 -0700
Subject: update MAG crawl notes

---
 notes/ingest/2020-03-04_mag.md | 71 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

(limited to 'notes')

diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md
index 97594c8..9b000a3 100644
--- a/notes/ingest/2020-03-04_mag.md
+++ b/notes/ingest/2020-03-04_mag.md
@@ -406,3 +406,74 @@ Full run:
 
     2020-04-07 12:19 (pacific): 11,703,871
 
+## Post-bulk-ingest
+
+As of around 2020-04-28, the main wave of bulk ingest seems to be complete. Will
+need to retry transient failure statuses like cdx-error.
+
+Current status:
+
+                status             |  count
+    -------------------------------+----------
+     success                       | 18491799
+     redirect-loop                 |  1968530
+     no-capture                    |  1373657
+     no-pdf-link                   |  1311842
+     link-loop                     |  1296439
+     terminal-bad-status           |   627577
+     cdx-error                     |   418278
+     wrong-mimetype                |    50141
+     wayback-error                 |    37159
+     petabox-error                 |    11249
+     null-body                     |     6295
+     gateway-timeout               |     3051
+     spn2-cdx-lookup-failure       |      328
+     spn2-error:invalid-url-syntax |       93
+     bad-redirect                  |       75
+                                   |       47
+     invalid-host-resolution       |       28
+     spn2-error                    |       10
+     bad-gzip-encoding             |        7
+     redirects-exceeded            |        2
+    (20 rows)
+
+Lots of cdx-error to retry.
+
+The no-capture links are probably a mix of domain-blocklist and things that
+failed in bulk mode. Will dump and re-attempt them:
+
+
+    COPY (  -- dump MAG PDF ingest requests whose result status is 'no-capture', one JSON row per line
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        INNER JOIN ingest_file_result  -- the status filter below requires a matching result row, so the join is inner
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'mag'
+            AND ingest_file_result.status = 'no-capture'
+            AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'  -- domains below are skipped for this retry
+            AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+            AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+            AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+            AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+            AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+            AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+    ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json';
+    => 859849
+
+What domains are these?
+
+    jq -r .base_url mag_nocapture_20200420.rows.json | cut -d/ -f3 | sort | uniq -c | sort -rn | head -n 30
+
+Let's filter down more:
+
+    rg -v -F -e 'www.researchgate.net' -e 'muse.jhu.edu' -e 'www.omicsonline.org' -e 'link.springer.com' -e 'iopscience.iop.org' -e 'ieeexplore.ieee.org' mag_nocapture_20200420.rows.json | shuf > mag_nocapture_20200420.rows.filtered.json  # -F: match domains literally ('.' is not a wildcard)
+
+    wc -l mag_nocapture_20200420.rows.filtered.json
+    423085 mag_nocapture_20200420.rows.filtered.json
+
+Ok, enqueue!
+
+    shuf mag_nocapture_20200420.rows.filtered.json | jq -c . | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
-- 
cgit v1.2.3