diff options
Diffstat (limited to 'notes/ingest/2020-09_oa_doi.md')
-rw-r--r-- | notes/ingest/2020-09_oa_doi.md | 352 |
1 files changed, 352 insertions, 0 deletions
diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md new file mode 100644 index 0000000..f5c853d --- /dev/null +++ b/notes/ingest/2020-09_oa_doi.md @@ -0,0 +1,352 @@ + +It seems that many gold OA DOIs were not ingesting simply because the HTML +url extraction was not working for a particular version of OJS. + +Let's re-try all ~2.5 million of these in bulk mode and see how many are +'no-capture' vs. other errors, then possibly re-crawl a large number. + +## Bulk Ingest + +Dump ingest requests + + ./fatcat_ingest.py query 'is_oa:true preservation:none !arxiv_id:* !pmcid:* !publisher_type:big5 type:article-journal' | pv -l > /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json + Expecting 2569876 release objects in search queries + Counter({'elasticsearch_release': 2569880, 'estimate': 2569880, 'ingest_request': 2063034}) + +Enqueue + + cat /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Started at about: + + Thu Sep 17 00:15:00 UTC 2020 + 2020-09-17T00:15:00Z + +## Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND ingest_file_result.updated >= '2020-09-16' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 513462 + success | 206042 + no-pdf-link | 186779 + terminal-bad-status | 40372 + redirect-loop | 33103 + cdx-error | 24078 + link-loop | 13494 + spn2-cdx-lookup-failure | 10247 + gateway-timeout | 4407 + wrong-mimetype | 3213 + petabox-error | 866 + null-body | 449 + spn2-error | 217 + wayback-error | 129 + spn2-error:job-failed | 
64 + bad-redirect | 6 + spn2-error:soft-time-limit-exceeded | 1 + (17 rows) + +This was only about half the requests. Try... broader? + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 579952 + success | 387325 + no-pdf-link | 380406 + terminal-bad-status | 63743 + redirect-loop | 53893 + cdx-error | 46024 + spn2-cdx-lookup-failure | 28347 + link-loop | 22573 + gateway-timeout | 11686 + wrong-mimetype | 6294 + null-body | 3509 + petabox-error | 2388 + spn2-error | 1023 + spn2-error:job-failed | 462 + wayback-error | 347 + spn2-error:soft-time-limit-exceeded | 20 + bad-redirect | 11 + (17 rows) + +What top domains for those `no-pdf-link` (or similar)? 
+ + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 56488 + figshare.com | no-pdf-link | 55337 + www.egms.de | redirect-loop | 22686 + zenodo.org | terminal-bad-status | 22128 + tandf.figshare.com | no-pdf-link | 20027 + springernature.figshare.com | no-pdf-link | 17181 + cairn.info | terminal-bad-status | 13836 + www.persee.fr | terminal-bad-status | 7565 + projecteuclid.org | link-loop | 7449 + www.cairn.info | no-pdf-link | 6992 + scialert.net | no-pdf-link | 6621 + www.cairn.info | link-loop | 5870 + utpjournals.press | no-pdf-link | 5772 + journals.openedition.org | redirect-loop | 5464 + www.egms.de | no-pdf-link | 5223 + archaeologydataservice.ac.uk | no-pdf-link | 4881 + rs.figshare.com | no-pdf-link | 4773 + www.degruyter.com | spn2-cdx-lookup-failure | 4763 + koreascience.or.kr | no-pdf-link | 4487 + cancerres.aacrjournals.org | no-pdf-link | 4124 + cms.math.ca | no-pdf-link | 3441 + volcano.si.edu | no-pdf-link | 3424 + www.mathnet.ru | no-pdf-link | 3229 + tidsskriftet.no | no-pdf-link | 3012 + journals.plos.org | no-pdf-link | 3005 + 
tudigit.ulb.tu-darmstadt.de | no-pdf-link | 2796 + www.cairn.info:80 | link-loop | 2647 + hammer.figshare.com | no-pdf-link | 2627 + www.psychosocial.com | no-pdf-link | 2457 + osf.io | terminal-bad-status | 2388 + (30 rows) + +Should look at link extraction for: + +- scialert.net +- utpjournals.press +- koreascience.or.kr +- cancerres.aacrjournals.org +- cms.math.ca +- volcano.si.edu +- www.mathnet.ru +- www.psychosocial.com + +## Re-Ingest + +Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + AND ingest_file_result.status = 'no-capture' + -- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + ) TO '/grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json'; + => COPY 579952 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json | pv -l | shuf > /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json + => 579k 0:00:22 [25.9k/s] + + cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Resuming progress on this in early December 2020. 
+ +Filtered requests to re-crawl: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20') + OR (ingest_file_result.updated >= '2020-10-11')) + AND ingest_file_result.status != 'success' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO 
'/grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json'; + => COPY 2352614 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | pv -l > /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json + +And actually dump seedlist(s): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | jq -r .base_url | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + + wc -l /grande/snapshots/oa_doi_seedlist_2020-12-08.*.txt + 2352614 /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + 481910 /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + +Top DOI prefixes (same old usual suspects): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg "://doi.org/" | cut -f4 -d/ | sort | uniq -c | sort -nr | head -n20 + 353695 10.5281 zenodo.org + 121888 10.6084 figshare.org + 115093 10.3917 cairn.info + 113252 10.3406 persee.fr + 95414 10.1515 degruyter.com + 90448 10.4324 taylorfrancis.com + 83927 10.1016 elsevier + 60303 10.1109 IEEE + 48490 10.4000 openedition.org + 28498 10.3205 egms.de + 23433 10.1163 brill.com + 23276 10.17615 cdr.lib.unc.edu + 21386 10.1093 oup.com + 20783 10.3138 utpjournals.press + 19987 10.1201 tandfonline.com + 17916 10.34847 cocoon.huma-num.fr + 16970 10.1002 wiley.com + 15958 10.1097 lww.com (and others?) 
+ 15835 10.1017 cambridge.org + 15466 10.24355 publikationsserver.tu-braunschweig.de (IR) + +Top domains (not doi.org): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg -v "://doi.org/" | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 104148 zenodo.org + 85245 www.persee.fr + 52931 www.cairn.info + 4791 www.jstage.jst.go.jp + 4411 archive.monthlyreview.org + 4129 osf.io + 2841 www.indianjournals.com + 2746 www.impan.pl + 2620 platform.almanhal.com + 2019 www.nomos-elibrary.de + 1209 dergipark.org.tr + 1027 pubs.geoscienceworld.org + 973 www.pdcnet.org + 923 www.hanspub.org + 914 www.repository.cam.ac.uk + 863 mediarep.org + 812 www.cartographicperspectives.org + 687 www.degruyter.com + 578 192.168.7.24 + 566 journals.eco-vector.com + +TODO: infer `publisher_type` and platform from DOI prefix in more cases + +## Re-Ingest + +Crawl has completed. Starting this bulk ingest on 2020-12-31; roughly 2.3 +million requests. Note these are all `pdf` requests, but crawl was done in an +HTML-friendly way, so should be able to do domain/journal-specific HTML ingests +in the future. + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Stats, for this ingest period (fuzzy; will have some daily ingest stuff): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-12-28' + AND ingest_request.created <= '2020-12-09' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -----------------------+-------- + no-pdf-link | 962714 + success | 539305 + no-capture | 306590 + redirect-loop | 192149 + link-loop | 184797 + terminal-bad-status | 141721 + wrong-mimetype | 10362 + null-body | 10277 + skip-url-blocklist | 1985 + wayback-content-error | 1300 + cdx-error | 869 + petabox-error | 160 + bad-redirect | 72 + wayback-error | 46 + bad-gzip-encoding | 7 + timeout | 1 + max-hops-exceeded | 1 + (17 rows) + |