diff options
Diffstat (limited to 'notes/ingest/2020-07_mag.md')
-rw-r--r-- | notes/ingest/2020-07_mag.md | 353 |
1 file changed, 353 insertions, 0 deletions
diff --git a/notes/ingest/2020-07_mag.md b/notes/ingest/2020-07_mag.md new file mode 100644 index 0000000..1d33162 --- /dev/null +++ b/notes/ingest/2020-07_mag.md @@ -0,0 +1,353 @@ + +Using 2020-06-25 upstream MAG corpus snapshot. + +Ran munging from `scratch:ingest/mag` notes first. + +Expecting a couple million new ingest request URLs; this is the first "patch" +MAG ingest on top of existing already-run requests. + +Planning to skip the initial bulk ingest step, on the assumption that new URLs +have either been ingested already (eg, via continuous ingest pipeline) or need +crawling. + +## Generate Requests + + export LC_ALL=C + cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 | pv -l > ingest_requests_mag-2020-06-25.json + => 28.7M 2:36:48 [3.06k/s] + + export LC_ALL=C + zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 --pmid | pv -l > ingest_requests_mag-2020-06-25.pmid.json + => 5.66M 0:29:28 [ 3.2k/s] + +## Persist Ingest Requests + + # small sample + head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 319, 'update-requests': 0}) + + head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request - + Worker: Counter({'total': 1000, 'insert-requests': 304, 'update-requests': 0}) + + cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 5662486, 'insert-requests': 1984605, 'update-requests': 0}) + + cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 28743819, 'insert-requests': 7433465, 'update-requests': 0}) + +## Crawl/Dupe Status + +Overall status for old and new seeds, filtering out large (blocking) 
+publishers: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 19477651 + | 8238898 + redirect-loop | 2036494 + link-loop | 1330036 + no-pdf-link | 1304820 + terminal-bad-status | 648150 + no-capture | 545785 + gateway-timeout | 200143 + cdx-error | 149995 + spn2-cdx-lookup-failure | 80010 + wrong-mimetype | 57052 + wayback-error | 41032 + invalid-host-resolution | 37203 + petabox-error | 11167 + null-body | 6662 + spn2-error | 1698 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 77 + (20 rows) + +Just the new seeds: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND 
ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.created > '2020-06-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + | 8238851 + success | 787174 + no-capture | 42864 + redirect-loop | 31718 + terminal-bad-status | 31493 + no-pdf-link | 13025 + cdx-error | 11275 + wrong-mimetype | 6238 + link-loop | 3365 + wayback-error | 748 + gateway-timeout | 506 + null-body | 191 + spn2-cdx-lookup-failure | 99 + petabox-error | 89 + invalid-host-resolution | 70 + spn2-error | 7 + spn2-error:job-failed | 2 + spn2-error:soft-time-limit-exceeded | 1 + bad-gzip-encoding | 1 + (19 rows) + +Where are no-capture results terminating? May need to add or update heritrix +crawl config so that we get better yield without needing to do SPNv2 crawling. + + SELECT initial_domain, terminal_domain, COUNT(*) + FROM ( + SELECT + ingest_file_result.status as status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS initial_domain, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS terminal_domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + ) t1 + GROUP BY initial_domain, terminal_domain + ORDER BY COUNT DESC + LIMIT 25; + + initial_domain | terminal_domain | count + ---------------------------------+---------------------+-------- + www.researchgate.net | | 334145 + academic.oup.com | | 205820 + www.tandfonline.com | | 148638 + journals.sagepub.com | | 144196 + muse.jhu.edu | | 55957 + hrcak.srce.hr | | 25317 + www.omicsonline.org | | 22426 + link.springer.com | | 21044 + 
iopscience.iop.org | | 12385 + bioone.org | | 9097 + tandfonline.com | | 8512 + or.nsfc.gov.cn | | 4823 + ieeexplore.ieee.org | ieeexplore.ieee.org | 4398 + pubs.acs.org | | 3708 + archive-ouverte.unige.ch | | 2743 + dergipark.ulakbim.gov.tr | | 2677 + hal.archives-ouvertes.fr | | 1258 + dergipark.org.tr | | 1207 + apo.org.au | | 1186 + spire.sciencespo.fr | | 989 + cyberleninka.ru | | 895 + lirias.kuleuven.be | | 855 + tel.archives-ouvertes.fr | | 786 + pub.uni-bielefeld.de | | 728 + www.research-collection.ethz.ch | | 670 + (25 rows) + +## Heritrix Seedlist Generation + +Dump ingest requests (filtered for some domains that don't expect to crawl via +heritrix): + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status IS NULL) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200708.rows.json'; + => 8784683 + + # in sandcrawler pipenv + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200708.rows.json > /grande/snapshots/mag_nocapture_20200708.json + +Seedlist transform from here on covered in MAG crawl notes. 
+
+## Bulk Ingest
+
+Run ingest requests on everything we crawled:
+
+    cat /grande/snapshots/mag_nocapture_20200708.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Small sample:
+
+    head -n1000 /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+    cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Updated Overall Stats
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'mag'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+                   status                |  count
+    -------------------------------------+----------
+     success                             | 24574294
+     redirect-loop                       |  2633731
+     no-capture                          |  2458694
+     no-pdf-link                         |  1896871
+     link-loop                           |  1510899
+     terminal-bad-status                 |   878821
+     cdx-error                           |   387574
+     gateway-timeout                     |   200246
+                                         |   170304
+     wayback-error                       |    97572
+     spn2-cdx-lookup-failure             |    80284
+     wrong-mimetype                      |    65097
+     invalid-host-resolution             |    37204
+     petabox-error                       |    12097
+     null-body                           |     8549
+     spn2-error                          |     1706
+     spn2-error:job-failed               |      775
+     spn2-error:invalid-url-syntax       |      335
+     spn2-error:soft-time-limit-exceeded |      191
+     bad-redirect                        |       90
+    (20 rows)
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'mag'
+        AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+        AND 
ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 24557382 + redirect-loop | 2630582 + no-capture | 1947066 + no-pdf-link | 1778206 + link-loop | 1510790 + terminal-bad-status | 857173 + cdx-error | 384525 + gateway-timeout | 200143 + wayback-error | 96390 + spn2-cdx-lookup-failure | 80010 + wrong-mimetype | 64908 + invalid-host-resolution | 37203 + petabox-error | 12087 + null-body | 8548 + spn2-error | 1698 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 90 + | 69 + (20 rows) + +Just the new seeds: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.created > '2020-06-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+--------- + success | 5860601 + no-capture | 1489959 + redirect-loop | 619121 + 
no-pdf-link | 473703 + terminal-bad-status | 234753 + cdx-error | 231575 + link-loop | 184093 + wayback-error | 56068 + wrong-mimetype | 14046 + null-body | 2068 + petabox-error | 1006 + gateway-timeout | 506 + spn2-cdx-lookup-failure | 99 + invalid-host-resolution | 70 + | 22 + bad-redirect | 13 + spn2-error | 7 + timeout | 3 + spn2-error:job-failed | 2 + spn2-error:soft-time-limit-exceeded | 1 + (20 rows) + |