diff options
Diffstat (limited to 'notes/ingest/2020-07_mag.md')
-rw-r--r-- | notes/ingest/2020-07_mag.md | 353 |
1 file changed, 353 insertions, 0 deletions
diff --git a/notes/ingest/2020-07_mag.md b/notes/ingest/2020-07_mag.md new file mode 100644 index 0000000..1d33162 --- /dev/null +++ b/notes/ingest/2020-07_mag.md @@ -0,0 +1,353 @@ + +Using 2020-06-25 upstream MAG corpus snapshot. + +Ran munging from `scratch:ingest/mag` notes first. + +Expecting a couple million new ingest request URLs; this is the first "patch" +MAG ingest on top of existing already-run requests. + +Planning to skip the initial bulk ingest step, on the assumption that new URLs +have either been ingested already (eg, via continuous ingest pipeline) or need +crawling. + +## Generate Requests + + export LC_ALL=C + cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 | pv -l > ingest_requests_mag-2020-06-25.json + => 28.7M 2:36:48 [3.06k/s] + + export LC_ALL=C + zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 --pmid | pv -l > ingest_requests_mag-2020-06-25.pmid.json + => 5.66M 0:29:28 [ 3.2k/s] + +## Persist Ingest Requests + + # small sample + head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 319, 'update-requests': 0}) + + head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request - + Worker: Counter({'total': 1000, 'insert-requests': 304, 'update-requests': 0}) + + cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 5662486, 'insert-requests': 1984605, 'update-requests': 0}) + + cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 28743819, 'insert-requests': 7433465, 'update-requests': 0}) + +## Crawl/Dupe Status + +Overall status for old and new seeds, filtering out large (blocking) 
+publishers: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 19477651 + | 8238898 + redirect-loop | 2036494 + link-loop | 1330036 + no-pdf-link | 1304820 + terminal-bad-status | 648150 + no-capture | 545785 + gateway-timeout | 200143 + cdx-error | 149995 + spn2-cdx-lookup-failure | 80010 + wrong-mimetype | 57052 + wayback-error | 41032 + invalid-host-resolution | 37203 + petabox-error | 11167 + null-body | 6662 + spn2-error | 1698 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 77 + (20 rows) + +Just the new seeds: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND 
ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.created > '2020-06-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + | 8238851 + success | 787174 + no-capture | 42864 + redirect-loop | 31718 + terminal-bad-status | 31493 + no-pdf-link | 13025 + cdx-error | 11275 + wrong-mimetype | 6238 + link-loop | 3365 + wayback-error | 748 + gateway-timeout | 506 + null-body | 191 + spn2-cdx-lookup-failure | 99 + petabox-error | 89 + invalid-host-resolution | 70 + spn2-error | 7 + spn2-error:job-failed | 2 + spn2-error:soft-time-limit-exceeded | 1 + bad-gzip-encoding | 1 + (19 rows) + +Where are no-capture results terminating? May need to add or update heritrix +crawl config so that we get better yield without needing to do SPNv2 crawling. + + SELECT initial_domain, terminal_domain, COUNT(*) + FROM ( + SELECT + ingest_file_result.status as status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS initial_domain, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS terminal_domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + ) t1 + GROUP BY initial_domain, terminal_domain + ORDER BY COUNT DESC + LIMIT 25; + + initial_domain | terminal_domain | count + ---------------------------------+---------------------+-------- + www.researchgate.net | | 334145 + academic.oup.com | | 205820 + www.tandfonline.com | | 148638 + journals.sagepub.com | | 144196 + muse.jhu.edu | | 55957 + hrcak.srce.hr | | 25317 + www.omicsonline.org | | 22426 + link.springer.com | | 21044 + 
iopscience.iop.org | | 12385 + bioone.org | | 9097 + tandfonline.com | | 8512 + or.nsfc.gov.cn | | 4823 + ieeexplore.ieee.org | ieeexplore.ieee.org | 4398 + pubs.acs.org | | 3708 + archive-ouverte.unige.ch | | 2743 + dergipark.ulakbim.gov.tr | | 2677 + hal.archives-ouvertes.fr | | 1258 + dergipark.org.tr | | 1207 + apo.org.au | | 1186 + spire.sciencespo.fr | | 989 + cyberleninka.ru | | 895 + lirias.kuleuven.be | | 855 + tel.archives-ouvertes.fr | | 786 + pub.uni-bielefeld.de | | 728 + www.research-collection.ethz.ch | | 670 + (25 rows) + +## Heritrix Seedlist Generation + +Dump ingest requests (filtered for some domains that don't expect to crawl via +heritrix): + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status IS NULL) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200708.rows.json'; + => 8784683 + + # in sandcrawler pipenv + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200708.rows.json > /grande/snapshots/mag_nocapture_20200708.json + +Seedlist transform from here on covered in MAG crawl notes. 
+
+## Bulk Ingest
+
+Run ingest requests on everything we crawled:
+
+    cat /grande/snapshots/mag_nocapture_20200708.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Small sample:
+
+    head -n1000 /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+    cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Updated Overall Stats
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'mag'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+                   status                |  count
+    -------------------------------------+----------
+     success                             | 24574294
+     redirect-loop                       |  2633731
+     no-capture                          |  2458694
+     no-pdf-link                         |  1896871
+     link-loop                           |  1510899
+     terminal-bad-status                 |   878821
+     cdx-error                           |   387574
+     gateway-timeout                     |   200246
+                                         |   170304
+     wayback-error                       |    97572
+     spn2-cdx-lookup-failure             |    80284
+     wrong-mimetype                      |    65097
+     invalid-host-resolution             |    37204
+     petabox-error                       |    12097
+     null-body                           |     8549
+     spn2-error                          |     1706
+     spn2-error:job-failed               |      775
+     spn2-error:invalid-url-syntax       |      335
+     spn2-error:soft-time-limit-exceeded |      191
+     bad-redirect                        |       90
+    (20 rows)
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'mag'
+        AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+        AND 
ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 24557382 + redirect-loop | 2630582 + no-capture | 1947066 + no-pdf-link | 1778206 + link-loop | 1510790 + terminal-bad-status | 857173 + cdx-error | 384525 + gateway-timeout | 200143 + wayback-error | 96390 + spn2-cdx-lookup-failure | 80010 + wrong-mimetype | 64908 + invalid-host-resolution | 37203 + petabox-error | 12087 + null-body | 8548 + spn2-error | 1698 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 90 + | 69 + (20 rows) + +Just the new seeds: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.created > '2020-06-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+--------- + success | 5860601 + no-capture | 1489959 + redirect-loop | 619121 + 
no-pdf-link | 473703 + terminal-bad-status | 234753 + cdx-error | 231575 + link-loop | 184093 + wayback-error | 56068 + wrong-mimetype | 14046 + null-body | 2068 + petabox-error | 1006 + gateway-timeout | 506 + spn2-cdx-lookup-failure | 99 + invalid-host-resolution | 70 + | 22 + bad-redirect | 13 + spn2-error | 7 + timeout | 3 + spn2-error:job-failed | 2 + spn2-error:soft-time-limit-exceeded | 1 + (20 rows) + |