author    Bryan Newbold <bnewbold@archive.org>      2020-04-06 20:01:15 -0700
committer Bryan Newbold <bnewbold@archive.org>      2020-04-06 20:01:15 -0700
commit    5dd9e8f6790de403376811a966496b8f612f192e (patch)
tree      aa7f3698edcbf8faa1fd1cd620aff44dbd80d33a /notes
parent    35ebee00c38436ea8c8a075689d987d80841255a (diff)
MAG 2020-03-04 ingest notes to date
Diffstat (limited to 'notes')

-rw-r--r--  notes/ingest/2020-03-04_mag.md  395
1 file changed, 395 insertions, 0 deletions
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md
new file mode 100644
index 0000000..a5624c2
--- /dev/null
+++ b/notes/ingest/2020-03-04_mag.md
@@ -0,0 +1,395 @@

Rough plan:

- run bulk and/or regular ingest requests for just those of AIT partners (200k?)
- persist ingest requests (22 million or so)
- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall)
- crawl those which are no-capture

## Generate Requests

The newer version of the `mag_ingest_request.sh` script requires a venv with
urlcanon installed.

Starting with the 2020-01-23 MAG dump, generate a full ingest request set
(including the DOI `ext_id` when available), with any dominant domains
removed (eg, arxiv.org):

    export LC_ALL=C
    cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json
    => previously 25.6M
    => 25.6M 2:29:43 [2.85k/s]

    export LC_ALL=C
    zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json
    => 4.3M 0:25:45 [2.78k/s]

    export LC_ALL=C
    cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id

Sanity-check row counts:

    zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l
    => 6,504,907

    zcat PaperUrls_mag_url_pmid.txt.gz | wc -l
    => 4,369,832

    cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l
    => previously 15,707,405
    => 15,702,581

    cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l
    => 0

No spaces in any `base_url`, so URL encoding seems to be working.

## Persist Ingest Requests

First the PMID ingest requests, then the all/DOI file. The reason for this
order is that the all/DOI file will have some rows with no DOI (and thus no
`ext_id`), while the PMID file will not.
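Before persisting, a quick sanity check in the same style as the DOI null
check above (a sketch, not from the original run; assumes the PMID requests
carry the identifier under `.ext_ids.pmid`):

    # hypothetical check: every row of the PMID file should have an ext_id
    cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | jq .ext_ids.pmid -r | rg -a '^null$' | wc -l
    # expect: 0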
    # small sample
    head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
    Worker: Counter({'total': 10, 'skip-result-fields': 10})
    JSON lines pushed: Counter({'total': 10, 'pushed': 10})

    cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
    => 4.3M 0:16:46 [4.27k/s]
    Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0})
    JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026})
    => hit a bug on first attempt, which is why total/insert counts don't match

    cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request -
    => 25.6M 2:21:54 [3.01k/s]
    Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0})
    JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559})

## Crawl/Dupe Status

    SELECT ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.link_source = 'mag'
    GROUP BY status
    ORDER BY COUNT DESC
    LIMIT 20;

After just PMID links:

            status       |  count
    ---------------------+---------
                         | 3000115
     success             | 1126881
     no-capture          |   69459
     terminal-bad-status |   30259
     redirect-loop       |   11656
     no-pdf-link         |    2836
     wrong-mimetype      |    1456
     link-loop           |    1259
     wayback-error       |    1232
     cdx-error           |     932
     null-body           |      85
     petabox-error       |      50
     bad-redirect        |       1
    (13 rows)

After all links:

    SELECT COUNT(*)
    FROM ingest_request
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.link_source = 'mag';
    => 25596563

            status       |   count
    ---------------------+----------
                         | 21130841
     success             |  3915682
     no-capture          |   391813
     terminal-bad-status |    76488
     redirect-loop       |    44202
     wrong-mimetype      |    16418
     no-pdf-link         |    10995
     wayback-error       |     3679
     cdx-error           |     3414
     link-loop           |     2098
     null-body           |      709
     petabox-error       |      221
     bad-gzip-encoding   |        2
     bad-redirect        |        1
    (14 rows)

Somewhat more un-ingested requests than expected.

Dump the un-ingested requests:

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_request.ingest_type = 'pdf'
            AND ingest_request.link_source = 'mag'
            AND ingest_file_result.status IS NULL
    ) TO '/grande/snapshots/mag_noingest_20200305.rows.json';
    => COPY 21,130,841

Transform and shuffle:

    ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz
    => 21.1M 0:18:57 [18.6k/s]

## Bulk Ingest Partner Output

These are subsets of the full list from potential AIT-S partners; we want to
run these through the pipeline before the full batch. Duplication against the
full batch should be minimal.
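How the per-partner subset files were generated is not recorded in these
notes; a hypothetical sketch of pulling one partner's subset out of the full
request file (the domain pattern here is made up for illustration):

    # hypothetical: the real per-partner URL patterns were assembled elsewhere
    cat ingest_requests_mag-2020-01-23.doi.json | rg -a 'cornell.edu' > ingest_requests_mag-2020-01-23.cornell.json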
Size:

    bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l
    29007
    bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json
    34265 ingest_requests_mag-2020-01-23.cornell.json

Test ingest:

    head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

Full ingests:

    cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

## Bulk Ingest

Shard the un-ingested requests into batches of roughly 1 million:

    cd /grande/snapshots/
    zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json

Add a single batch like:

    cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

Progress, by batch:

    partner ingests (see above)
    => 2020-03-05 12:49: 118,396
    1056543 mag_noingest_20200305.ingest_request.split_00.json
    => 2020-03-05 14:34: 1,055,224
    => check on stats/ratios; filter by ingest update time?
    1056542 mag_noingest_20200305.ingest_request.split_01.json
    1056542 mag_noingest_20200305.ingest_request.split_02.json
    1056542 mag_noingest_20200305.ingest_request.split_03.json
    1056542 mag_noingest_20200305.ingest_request.split_04.json
    1056542 mag_noingest_20200305.ingest_request.split_05.json
    1056542 mag_noingest_20200305.ingest_request.split_06.json
    1056542 mag_noingest_20200305.ingest_request.split_07.json
    1056542 mag_noingest_20200305.ingest_request.split_08.json
    1056542 mag_noingest_20200305.ingest_request.split_09.json
    => 2020-03-05 18:04: 10,009,297
    => 2020-03-06 16:53: 6,553,946
    1056542 mag_noingest_20200305.ingest_request.split_10.json
    1056542 mag_noingest_20200305.ingest_request.split_11.json
    1056542 mag_noingest_20200305.ingest_request.split_12.json
    1056542 mag_noingest_20200305.ingest_request.split_13.json
    1056542 mag_noingest_20200305.ingest_request.split_14.json
    1056542 mag_noingest_20200305.ingest_request.split_15.json
    1056542 mag_noingest_20200305.ingest_request.split_16.json
    1056542 mag_noingest_20200305.ingest_request.split_17.json
    1056542 mag_noingest_20200305.ingest_request.split_18.json
    1056542 mag_noingest_20200305.ingest_request.split_19.json
    => 2020-03-06 16:59: 17,001,032

Stats from bulk ingest:

    SELECT ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.link_source = 'mag'
    GROUP BY status
    ORDER BY COUNT DESC
    LIMIT 20;

            status       |  count
    ---------------------+----------
     no-capture          | 12237193
     success             | 11991293
     no-pdf-link         |   521691
     redirect-loop       |   437192
     terminal-bad-status |   231181
     link-loop           |    92633
     cdx-error           |    33631
     wrong-mimetype      |    28638
     wayback-error       |    19651
     null-body           |     2682
     petabox-error       |      727
                         |       47
     bad-redirect        |       44
     bad-gzip-encoding   |        7
    (14 rows)

Failures by domain:

    SELECT domain, status, COUNT((domain, status))
    FROM (
        SELECT
            ingest_file_result.ingest_type,
            ingest_file_result.status,
            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result
        LEFT JOIN ingest_request
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_file_result.ingest_type = 'pdf'
            AND ingest_request.link_source = 'mag'
    ) t1
    WHERE t1.domain != ''
        AND t1.status != 'success'
        AND t1.status != 'no-capture'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 30;

                   domain                 |       status        | count
    --------------------------------------+---------------------+--------
     dialnet.unirioja.es                  | redirect-loop       | 240967
     onlinelibrary.wiley.com              | no-pdf-link         | 147696
     agupubs.onlinelibrary.wiley.com      | no-pdf-link         |  72639
     iopscience.iop.org                   | terminal-bad-status |  69591
     febs.onlinelibrary.wiley.com         | no-pdf-link         |  49874
     www.researchgate.net                 | redirect-loop       |  42859
     journals.sagepub.com                 | no-pdf-link         |  27448
     papers.ssrn.com                      | redirect-loop       |  27328
     dialnet.unirioja.es                  | terminal-bad-status |  20320
     physoc.onlinelibrary.wiley.com       | no-pdf-link         |  20232
     science.sciencemag.org               | link-loop           |  17811
     espace.library.uq.edu.au             | redirect-loop       |  17185
     bpspubs.onlinelibrary.wiley.com      | no-pdf-link         |  15785
     obgyn.onlinelibrary.wiley.com        | no-pdf-link         |  15301
     anthrosource.onlinelibrary.wiley.com | no-pdf-link         |  13746
     www.tandfonline.com                  | no-pdf-link         |  13303
     aasldpubs.onlinelibrary.wiley.com    | no-pdf-link         |  11070
     link.springer.com                    | redirect-loop       |  10594
     www.redalyc.org:9081                 | no-pdf-link         |  10515
     watermark.silverchair.com            | terminal-bad-status |   9739
     www.bmj.com                          | link-loop           |   9389
     www.repository.naturalis.nl          | redirect-loop       |   8213
     bjp.rcpsych.org                      | link-loop           |   8045
     aslopubs.onlinelibrary.wiley.com     | no-pdf-link         |   7814
     nph.onlinelibrary.wiley.com          | no-pdf-link         |   7801
     iopscience.iop.org                   | redirect-loop       |   7697
     journals.tubitak.gov.tr              | wrong-mimetype      |   7159
     www.biorxiv.org                      | wrong-mimetype      |   7067
     www.erudit.org                       | redirect-loop       |   6819
     besjournals.onlinelibrary.wiley.com  | no-pdf-link         |   6254
    (30 rows)

Domains to follow up (eg, sandcrawler ingest tests/tweaks):

- dialnet.unirioja.es | redirect-loop | 240967
- www.researchgate.net | redirect-loop | 42859
- www.redalyc.org:9081 | no-pdf-link | 10515
- www.repository.naturalis.nl | redirect-loop | 8213
- bjp.rcpsych.org | link-loop | 8045
- journals.tubitak.gov.tr | wrong-mimetype | 7159
- www.erudit.org | redirect-loop | 6819

The dialnet.unirioja.es failures may be worth re-crawling via heritrix?

Top uncrawled domains:

    SELECT domain, status, COUNT((domain, status))
    FROM (
        SELECT
            ingest_file_result.ingest_type,
            ingest_file_result.status,
            substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result
        LEFT JOIN ingest_request
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_file_result.ingest_type = 'pdf'
            AND ingest_request.link_source = 'mag'
    ) t1
    WHERE t1.domain != ''
        AND t1.status = 'no-capture'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 30;

                domain               |   status   | count
    ---------------------------------+------------+--------
     ieeexplore.ieee.org             | no-capture | 957835
     link.springer.com               | no-capture | 394121
     www.researchgate.net            | no-capture | 376974
     cyberleninka.ru                 | no-capture | 376012
     iopscience.iop.org              | no-capture | 348791
     papers.ssrn.com                 | no-capture | 286860
     dergipark.org.tr                | no-capture | 217556
     dialnet.unirioja.es             | no-capture | 214398
     academic.oup.com                | no-capture | 212364
     www.tandfonline.com             | no-capture | 148940
     journals.sagepub.com            | no-capture | 144695
     www.papersearch.net             | no-capture | 138986
     absimage.aps.org                | no-capture | 111976
     apps.dtic.mil                   | no-capture | 106984
     www.cambridge.org               | no-capture |  97533
     www.bmj.com                     | no-capture |  92437
     bioone.org                      | no-capture |  87573
     science.sciencemag.org          | no-capture |  75723
     shodhganga.inflibnet.ac.in:8080 | no-capture |  75395
     www.jstor.org                   | no-capture |  73230
     works.bepress.com               | no-capture |  68747
     www.scielo.org.co               | no-capture |  59650
     hrcak.srce.hr                   | no-capture |  59332
     muse.jhu.edu                    | no-capture |  57828
     onlinelibrary.wiley.com         | no-capture |  55621
     www.jbc.org                     | no-capture |  54608
     www.jstage.jst.go.jp            | no-capture |  53631
     www.redalyc.org                 | no-capture |  50406
     lup.lub.lu.se                   | no-capture |  47469
     www.dtic.mil                    | no-capture |  41820
    (30 rows)

## Heritrix Seedlist Generation

Dump ingest requests, filtered to exclude some domains we don't expect to
crawl via heritrix:

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_request.ingest_type = 'pdf'
            AND ingest_request.link_source = 'mag'
            AND ingest_file_result.status = 'no-capture'
            AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
            AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
            AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
            AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
            AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
            AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
            AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
    ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json';
    => COPY 11714199

    # in sandcrawler pipenv
    ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json
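From here, an actual seedlist could be extracted from the transformed
requests (a sketch, not from these notes; assumes the crawl seed is the
`base_url` field of each request):

    # sketch: pull unique seed URLs out of the transformed ingest requests
    cat /grande/snapshots/mag_nocapture_20200313.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/mag_nocapture_20200313.seedlist.txt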