author    Bryan Newbold <bnewbold@archive.org>      2020-04-06 20:01:15 -0700
committer Bryan Newbold <bnewbold@archive.org>      2020-04-06 20:01:15 -0700
commit    5dd9e8f6790de403376811a966496b8f612f192e (patch)
tree      aa7f3698edcbf8faa1fd1cd620aff44dbd80d33a /notes
parent    35ebee00c38436ea8c8a075689d987d80841255a (diff)
MAG 2020-03-04 ingest notes to date
Diffstat (limited to 'notes')

-rw-r--r--  notes/ingest/2020-03-04_mag.md  395
1 file changed, 395 insertions, 0 deletions
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md
new file mode 100644
index 0000000..a5624c2
--- /dev/null
+++ b/notes/ingest/2020-03-04_mag.md
@@ -0,0 +1,395 @@

Rough plan:

- run bulk and/or regular ingest requests for just those of AIT partners (200k?)
- persist ingest requests (22 million or so)
- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall)
- crawl those which are no-capture

## Generate Requests

The newer version of the `mag_ingest_request.sh` script requires a venv with
urlcanon installed.

Starting with the 2020-01-23 MAG dump, generate a full ingest request set
(including the DOI `ext_id` when available), with any dominant domains
removed (eg, arxiv.org):

    export LC_ALL=C
    cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json
    => previously 25.6M
    => 25.6M 2:29:43 [2.85k/s]

    export LC_ALL=C
    zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json
    => 4.3M 0:25:45 [2.78k/s]

    export LC_ALL=C
    cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id

Sanity-check row counts:

    zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l
    => 6,504,907

    zcat PaperUrls_mag_url_pmid.txt.gz | wc -l
    => 4,369,832

    cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l
    => previously 15,707,405
    => 15,702,581

    cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l
    => 0

No spaces in any `base_url`, so URL encoding seems to be working.

## Persist Ingest Requests

First the PMID ingest requests, then the all/DOI file. The reason for this
order is that the all/DOI file will have some rows with no DOI (and thus no
`ext_id`), while the PMID file will not.
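Before persisting, a quick sanity check in the same style as the DOI null
check above (a sketch, not from the original run; assumes the PMID requests
carry the identifier under `.ext_ids.pmid`):

    # hypothetical check: every row of the PMID file should have an ext_id
    cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | jq .ext_ids.pmid -r | rg -a '^null$' | wc -l
    # expect: 0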
    # small sample
    head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
    Worker: Counter({'total': 10, 'skip-result-fields': 10})
    JSON lines pushed: Counter({'total': 10, 'pushed': 10})

    cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
    => 4.3M 0:16:46 [4.27k/s]
    Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0})
    JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026})
    => hit a bug on first attempt, which is why total/insert counts don't match

    cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request -
    => 25.6M 2:21:54 [3.01k/s]
    Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0})
    JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559})

## Crawl/Dupe Status

    SELECT ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.link_source = 'mag'
    GROUP BY status
    ORDER BY COUNT DESC
    LIMIT 20;

After just PMID links:

            status       |  count
    ---------------------+---------
                         | 3000115
     success             | 1126881
     no-capture          |   69459
     terminal-bad-status |   30259
     redirect-loop       |   11656
     no-pdf-link         |    2836
     wrong-mimetype      |    1456
     link-loop           |    1259
     wayback-error       |    1232
     cdx-error           |     932
     null-body           |      85
     petabox-error       |      50
     bad-redirect        |       1
    (13 rows)

After all links:

    SELECT COUNT(*)
    FROM ingest_request
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.link_source = 'mag';
    => 25596563

            status       |   count
    ---------------------+----------
                         | 21130841
     success             |  3915682
     no-capture          |   391813
     terminal-bad-status |    76488
     redirect-loop       |    44202
     wrong-mimetype      |    16418
     no-pdf-link         |    10995
     wayback-error       |     3679
     cdx-error           |     3414
     link-loop           |     2098
     null-body           |      709
     petabox-error       |      221
     bad-gzip-encoding   |        2
     bad-redirect        |        1
    (14 rows)

Somewhat more un-ingested requests than expected.

Dump the un-ingested requests:

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_request.ingest_type = 'pdf'
            AND ingest_request.link_source = 'mag'
            AND ingest_file_result.status IS NULL
    ) TO '/grande/snapshots/mag_noingest_20200305.rows.json';
    => COPY 21,130,841

Transform and shuffle:

    ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz
    => 21.1M 0:18:57 [18.6k/s]

## Bulk Ingest Partner Output

These are subsets of the full list from potential AIT-S partners; we want to
run these through the pipeline before the full batch. Duplication against the
full batch should be minimal.
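How the per-partner subset files were generated is not recorded in these
notes; a hypothetical sketch of pulling one partner's subset out of the full
request file (the domain pattern here is made up for illustration):

    # hypothetical: the real per-partner URL patterns were assembled elsewhere
    cat ingest_requests_mag-2020-01-23.doi.json | rg -a 'cornell.edu' > ingest_requests_mag-2020-01-23.cornell.json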
Size:

    bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l
    29007
    bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json
    34265 ingest_requests_mag-2020-01-23.cornell.json

Test ingest:

    head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

Full ingests:

    cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

## Bulk Ingest

Shard the un-ingested requests into batches of roughly 1 million:

    cd /grande/snapshots/
    zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json

Add a single batch like:

    cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

Progress, by batch:

    partner ingests (see above)
    => 2020-03-05 12:49: 118,396
    1056543 mag_noingest_20200305.ingest_request.split_00.json
    => 2020-03-05 14:34: 1,055,224
    => check on stats/ratios; filter by ingest update time?
    1056542 mag_noingest_20200305.ingest_request.split_01.json
    1056542 mag_noingest_20200305.ingest_request.split_02.json
    1056542 mag_noingest_20200305.ingest_request.split_03.json
    1056542 mag_noingest_20200305.ingest_request.split_04.json
    1056542 mag_noingest_20200305.ingest_request.split_05.json
    1056542 mag_noingest_20200305.ingest_request.split_06.json
    1056542 mag_noingest_20200305.ingest_request.split_07.json
    1056542 mag_noingest_20200305.ingest_request.split_08.json
    1056542 mag_noingest_20200305.ingest_request.split_09.json
    => 2020-03-05 18:04: 10,009,297
    => 2020-03-06 16:53: 6,553,946
    1056542 mag_noingest_20200305.ingest_request.split_10.json
    1056542 mag_noingest_20200305.ingest_request.split_11.json
    1056542 mag_noingest_20200305.ingest_request.split_12.json
    1056542 mag_noingest_20200305.ingest_request.split_13.json
    1056542 mag_noingest_20200305.ingest_request.split_14.json
    1056542 mag_noingest_20200305.ingest_request.split_15.json
    1056542 mag_noingest_20200305.ingest_request.split_16.json
    1056542 mag_noingest_20200305.ingest_request.split_17.json
    1056542 mag_noingest_20200305.ingest_request.split_18.json
    1056542 mag_noingest_20200305.ingest_request.split_19.json
    => 2020-03-06 16:59: 17,001,032

Stats from bulk ingest:

    SELECT ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.link_source = 'mag'
    GROUP BY status
    ORDER BY COUNT DESC
    LIMIT 20;

            status       |  count
    ---------------------+----------
     no-capture          | 12237193
     success             | 11991293
     no-pdf-link         |   521691
     redirect-loop       |   437192
     terminal-bad-status |   231181
     link-loop           |    92633
     cdx-error           |    33631
     wrong-mimetype      |    28638
     wayback-error       |    19651
     null-body           |     2682
     petabox-error       |      727
                         |       47
     bad-redirect        |       44
     bad-gzip-encoding   |        7
    (14 rows)

Failures by domain:

    SELECT domain, status, COUNT((domain, status))
    FROM (
        SELECT
            ingest_file_result.ingest_type,
            ingest_file_result.status,
            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result
        LEFT JOIN ingest_request
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_file_result.ingest_type = 'pdf'
            AND ingest_request.link_source = 'mag'
    ) t1
    WHERE t1.domain != ''
        AND t1.status != 'success'
        AND t1.status != 'no-capture'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 30;

                   domain                 |       status        | count
    --------------------------------------+---------------------+--------
     dialnet.unirioja.es                  | redirect-loop       | 240967
     onlinelibrary.wiley.com              | no-pdf-link         | 147696
     agupubs.onlinelibrary.wiley.com      | no-pdf-link         |  72639
     iopscience.iop.org                   | terminal-bad-status |  69591
     febs.onlinelibrary.wiley.com         | no-pdf-link         |  49874
     www.researchgate.net                 | redirect-loop       |  42859
     journals.sagepub.com                 | no-pdf-link         |  27448
     papers.ssrn.com                      | redirect-loop       |  27328
     dialnet.unirioja.es                  | terminal-bad-status |  20320
     physoc.onlinelibrary.wiley.com       | no-pdf-link         |  20232
     science.sciencemag.org               | link-loop           |  17811
     espace.library.uq.edu.au             | redirect-loop       |  17185
     bpspubs.onlinelibrary.wiley.com      | no-pdf-link         |  15785
     obgyn.onlinelibrary.wiley.com        | no-pdf-link         |  15301
     anthrosource.onlinelibrary.wiley.com | no-pdf-link         |  13746
     www.tandfonline.com                  | no-pdf-link         |  13303
     aasldpubs.onlinelibrary.wiley.com    | no-pdf-link         |  11070
     link.springer.com                    | redirect-loop       |  10594
     www.redalyc.org:9081                 | no-pdf-link         |  10515
     watermark.silverchair.com            | terminal-bad-status |   9739
     www.bmj.com                          | link-loop           |   9389
     www.repository.naturalis.nl          | redirect-loop       |   8213
     bjp.rcpsych.org                      | link-loop           |   8045
     aslopubs.onlinelibrary.wiley.com     | no-pdf-link         |   7814
     nph.onlinelibrary.wiley.com          | no-pdf-link         |   7801
     iopscience.iop.org                   | redirect-loop       |   7697
     journals.tubitak.gov.tr              | wrong-mimetype      |   7159
     www.biorxiv.org                      | wrong-mimetype      |   7067
     www.erudit.org                       | redirect-loop       |   6819
     besjournals.onlinelibrary.wiley.com  | no-pdf-link         |   6254
    (30 rows)

Domains to follow up (eg, sandcrawler ingest tests/tweaks):

- dialnet.unirioja.es | redirect-loop | 240967
- www.researchgate.net | redirect-loop | 42859
- www.redalyc.org:9081 | no-pdf-link | 10515
- www.repository.naturalis.nl | redirect-loop | 8213
- bjp.rcpsych.org | link-loop | 8045
- journals.tubitak.gov.tr | wrong-mimetype | 7159
- www.erudit.org | redirect-loop | 6819

The dialnet.unirioja.es failures may be worth re-crawling via heritrix?

Top uncrawled domains:

    SELECT domain, status, COUNT((domain, status))
    FROM (
        SELECT
            ingest_file_result.ingest_type,
            ingest_file_result.status,
            substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result
        LEFT JOIN ingest_request
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_file_result.ingest_type = 'pdf'
            AND ingest_request.link_source = 'mag'
    ) t1
    WHERE t1.domain != ''
        AND t1.status = 'no-capture'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 30;

                domain               |   status   | count
    ---------------------------------+------------+--------
     ieeexplore.ieee.org             | no-capture | 957835
     link.springer.com               | no-capture | 394121
     www.researchgate.net            | no-capture | 376974
     cyberleninka.ru                 | no-capture | 376012
     iopscience.iop.org              | no-capture | 348791
     papers.ssrn.com                 | no-capture | 286860
     dergipark.org.tr                | no-capture | 217556
     dialnet.unirioja.es             | no-capture | 214398
     academic.oup.com                | no-capture | 212364
     www.tandfonline.com             | no-capture | 148940
     journals.sagepub.com            | no-capture | 144695
     www.papersearch.net             | no-capture | 138986
     absimage.aps.org                | no-capture | 111976
     apps.dtic.mil                   | no-capture | 106984
     www.cambridge.org               | no-capture |  97533
     www.bmj.com                     | no-capture |  92437
     bioone.org                      | no-capture |  87573
     science.sciencemag.org          | no-capture |  75723
     shodhganga.inflibnet.ac.in:8080 | no-capture |  75395
     www.jstor.org                   | no-capture |  73230
     works.bepress.com               | no-capture |  68747
     www.scielo.org.co               | no-capture |  59650
     hrcak.srce.hr                   | no-capture |  59332
     muse.jhu.edu                    | no-capture |  57828
     onlinelibrary.wiley.com         | no-capture |  55621
     www.jbc.org                     | no-capture |  54608
     www.jstage.jst.go.jp            | no-capture |  53631
     www.redalyc.org                 | no-capture |  50406
     lup.lub.lu.se                   | no-capture |  47469
     www.dtic.mil                    | no-capture |  41820
    (30 rows)

## Heritrix Seedlist Generation

Dump ingest requests, filtered to exclude some domains we don't expect to
crawl via heritrix:

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_request.ingest_type = 'pdf'
            AND ingest_request.link_source = 'mag'
            AND ingest_file_result.status = 'no-capture'
            AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
            AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
            AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
            AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
            AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
            AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
            AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
    ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json';
    => COPY 11714199

    # in sandcrawler pipenv
    ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json
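From here, an actual seedlist could be extracted from the transformed
requests (a sketch, not from these notes; assumes the crawl seed is the
`base_url` field of each request):

    # sketch: pull unique seed URLs out of the transformed ingest requests
    cat /grande/snapshots/mag_nocapture_20200313.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/mag_nocapture_20200313.seedlist.txt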