diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-17 17:12:14 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-17 17:12:14 -0700 |
commit | 0c924003291ea1a41cf0543463d2b9ce43d123fd (patch) | |
tree | cb2b172861e0b6d724e8b56987b0c108f5156a92 /notes | |
parent | 1695c1bcd0d084a2004ce24bc01fd4515b49c06d (diff) | |
download | sandcrawler-0c924003291ea1a41cf0543463d2b9ce43d123fd.tar.gz sandcrawler-0c924003291ea1a41cf0543463d2b9ce43d123fd.zip |
OA DOIs: partial notes
Diffstat (limited to 'notes')
-rw-r--r-- | notes/ingest/2020-09_oa_doi.md | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md new file mode 100644 index 0000000..c8ed6e9 --- /dev/null +++ b/notes/ingest/2020-09_oa_doi.md @@ -0,0 +1,218 @@ + +It seems that many gold OA DOIs on were not ingesting simply because the HTML +url extraction was not working for a particular version of OJS. + +Let's re-try all ~2.5 million of these in bulk mode and see how many are +'no-capture' vs. other errors, then possibly re-crawl a large number. + +## Bulk Ingest + +Dump ingest requests + + ./fatcat_ingest.py query 'is_oa:true preservation:none !arxiv_id:* !pmcid:* !publisher_type:big5 type:article-journal' | pv -l > /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json + Expecting 2569876 release objects in search queries + Counter({'elasticsearch_release': 2569880, 'estimate': 2569880, 'ingest_request': 2063034}) + +Enqueue + + cat /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Started at about: + + Thu Sep 17 00:15:00 UTC 2020 + 2020-09-17T00:15:00Z + +## Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND ingest_file_result.updated >= '2020-09-16' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 513462 + success | 206042 + no-pdf-link | 186779 + terminal-bad-status | 40372 + redirect-loop | 33103 + cdx-error | 24078 + link-loop | 13494 + spn2-cdx-lookup-failure | 10247 + gateway-timeout | 4407 + wrong-mimetype | 3213 + petabox-error | 866 + null-body | 449 + spn2-error | 217 + wayback-error | 129 + spn2-error:job-failed | 64 + bad-redirect | 6 + spn2-error:soft-time-limit-exceeded | 1 + (17 rows) + +This was only about half the requests. Try... broader? + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 579952 + success | 387325 + no-pdf-link | 380406 + terminal-bad-status | 63743 + redirect-loop | 53893 + cdx-error | 46024 + spn2-cdx-lookup-failure | 28347 + link-loop | 22573 + gateway-timeout | 11686 + wrong-mimetype | 6294 + null-body | 3509 + petabox-error | 2388 + spn2-error | 1023 + spn2-error:job-failed | 462 + wayback-error | 347 + spn2-error:soft-time-limit-exceeded | 20 + bad-redirect | 11 + (17 rows) + +What top domains for those `no-pdf-link` (or similar)? + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 56488 + figshare.com | no-pdf-link | 55337 + www.egms.de | redirect-loop | 22686 + zenodo.org | terminal-bad-status | 22128 + tandf.figshare.com | no-pdf-link | 20027 + springernature.figshare.com | no-pdf-link | 17181 + cairn.info | terminal-bad-status | 13836 + www.persee.fr | terminal-bad-status | 7565 + projecteuclid.org | link-loop | 7449 + www.cairn.info | no-pdf-link | 6992 + scialert.net | no-pdf-link | 6621 + www.cairn.info | link-loop | 5870 + utpjournals.press | no-pdf-link | 5772 + journals.openedition.org | redirect-loop | 5464 + www.egms.de | no-pdf-link | 5223 + archaeologydataservice.ac.uk | no-pdf-link | 4881 + rs.figshare.com | no-pdf-link | 4773 + www.degruyter.com | spn2-cdx-lookup-failure | 4763 + koreascience.or.kr | no-pdf-link | 4487 + cancerres.aacrjournals.org | no-pdf-link | 4124 + cms.math.ca | no-pdf-link | 3441 + volcano.si.edu | no-pdf-link | 3424 + www.mathnet.ru | no-pdf-link | 3229 + tidsskriftet.no | no-pdf-link | 3012 + journals.plos.org | no-pdf-link | 3005 + tudigit.ulb.tu-darmstadt.de | no-pdf-link | 2796 + www.cairn.info:80 | link-loop | 2647 + hammer.figshare.com | no-pdf-link | 2627 + www.psychosocial.com | no-pdf-link | 2457 + osf.io | terminal-bad-status | 2388 + (30 rows) + +Will look at link extraction for: + +- scialert.net +- utpjournals.press +- koreascience.or.kr +- cancerres.aacrjournals.org +- cms.math.ca +- volcano.si.edu +- www.mathnet.ru +- www.psychosocial.com + +## Re-Ingest + +Going to re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + AND ingest_file_result.status = 'no-capture' + -- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + ) TO '/grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json'; + => COPY 579952 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json | pv -l | shuf > /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json + => 579k 0:00:22 [25.9k/s] + + cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +After that, will re-crawl somewhat broadly: + + COPY ( + SELECT row_to_json(r) FROM ( + SELECT ingest_request.*, ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20') + OR (ingest_file_result.updated >= '2020-10-11')) + AND ingest_file_result.status != 'success' + ) r + ) TO '/grande/snapshots/oa_doi_reingest_recrawl_20201014.rows.json'; + |