From c575bbd728f518e5f783c48d14d69b01d2356509 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 9 Feb 2022 17:18:48 -0800 Subject: bulk cleanups: NCI chem entries; IRs with container_id; PLOS non-articles --- .../2022-02-08_nci_cambridge_datasets.md | 56 ++++++ extra/bulk_edits/2022-02-09_plos_non_articles.md | 69 +++++++ .../2022-02-09_repo_dois_with_containerid.md | 200 +++++++++++++++++++++ extra/bulk_edits/CHANGELOG.md | 5 + 4 files changed, 330 insertions(+) create mode 100644 extra/bulk_edits/2022-02-08_nci_cambridge_datasets.md create mode 100644 extra/bulk_edits/2022-02-09_plos_non_articles.md create mode 100644 extra/bulk_edits/2022-02-09_repo_dois_with_containerid.md (limited to 'extra') diff --git a/extra/bulk_edits/2022-02-08_nci_cambridge_datasets.md b/extra/bulk_edits/2022-02-08_nci_cambridge_datasets.md new file mode 100644 index 00000000..3172f16f --- /dev/null +++ b/extra/bulk_edits/2022-02-08_nci_cambridge_datasets.md @@ -0,0 +1,56 @@ + +Spectra DSpace Instance Cleanups +================================ + +Basic query: + + doi_prefix:10.14469 + +There was a big spike of these in 2014, marked as `article`, but should be +`dataset` (or `entry`). On the order of 150k releases. In particular, causes a +weird bump in unarchived OA papers in coverage plots for the year 2014. + +This is technically a dspace instance and might have various types of content +in it, so might want to narrow down the filter in some way. Eg, title prefix, +DOI pattern, etc. + + fatcat-cli search releases doi_prefix:10.14469 type:article --count + 196236 + + fatcat-cli search releases doi_prefix:10.14469 type:article 'title:NSC*' --count + 158380 + + fatcat-cli search releases doi_prefix:10.14469 type:article 'title:NSC*' author:"Imperial College High Performance Computing Service" --count + 158380 + +That seems to nail it down pretty well; these only fall under 2014 and a bit in +2015. 
+ +Want to just mark these as `release_type:entry` (they are sort of datasets, but +really it is all one big database and these are individual entries within +that). + +Commands: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + # start small + fatcat-cli search releases doi_prefix:10.14469 type:article 'title:NSC*' author:"Imperial College High Performance Computing Service" --entity-json --limit 50 \ + | jq 'select(.release_type == "article")' -c \ + | pv -l \ + | fatcat-cli batch update release release_type=entry --description "Correct release_type for 'Revised Cambridge NCI database' entries" + # Got 158380 hits + # editgroup_mwuqpc5j3fhtjg5vxvr2xnitda + +Looks good, do the full batch (!): + + fatcat-cli search releases doi_prefix:10.14469 type:article 'title:NSC*' author:"Imperial College High Performance Computing Service" --entity-json --limit 160000 \ + | jq 'select(.release_type == "article")' -c \ + | pv -l \ + | fatcat-cli batch update release release_type=entry --description "Correct release_type for 'Revised Cambridge NCI database' entries" --auto-accept + # 158k 1:00:21 [43.7 /s] + +Off it goes! + +There are more patterns from this repository, but this is a good start. diff --git a/extra/bulk_edits/2022-02-09_plos_non_articles.md b/extra/bulk_edits/2022-02-09_plos_non_articles.md new file mode 100644 index 00000000..5deadf22 --- /dev/null +++ b/extra/bulk_edits/2022-02-09_plos_non_articles.md @@ -0,0 +1,69 @@ + +PLOS publishes a number of non-articles, and many are not correctly marked in +metadata. + +## Issue Images + + fatcat-cli search releases doi_prefix:10.1371 title:image --index-json -n0 | rg '10.1371/image.' | wc -l + # Got 1142 hits in 92ms + # 348 + + fatcat-cli search releases doi_prefix:10.1371 title:"issue image" --count + # 348 + + export FATCAT_AUTH_WORKER_CLEANUP=[...] 
+ export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + # start small + fatcat-cli search releases doi_prefix:10.1371 title:"issue image" release_type:article-journal --entity-json -n 400 \ + | jq 'select(.release_type == "article-journal")' -c \ + | rg '10.1371/image.' \ + | head -n50 \ + | fatcat-cli batch update release release_type=graphic --description "PLoS Issue Images as type 'graphic'" + # Got 348 hits in 121ms + # editgroup_cq5cch7pmjglpehojhmza5hvxq + + # the rest + fatcat-cli search releases doi_prefix:10.1371 title:"issue image" release_type:article-journal --entity-json -n 400 \ + | jq 'select(.release_type == "article-journal")' -c \ + | rg '10.1371/image.' \ + | fatcat-cli batch update release release_type=graphic --description "PLoS Issue Images as type 'graphic'" --auto-accept + # Got 298 hits in 105ms + +## Non-PLOS DOI Releases + + !doi_prefix:10.1371 container_id:iznnn644szdwva7khyxqzc73bi + # 10 + +Some of these are "repo DOIs with `container_id`", some are DOAJ. The DOAJ ones +did not fuzzy-match mostly because of greek characters, and should be merged... +manually? In this case there are only a handful, but there will be more +elsewhere. 
+ + fatcat-cli search releases title:"authors reply" 'container_id:*' 'doaj_id:*' --count + # 275 + + fatcat-cli search releases title:"authors reply" 'container_id:*' 'doaj_id:*' plos --count + # 5 + + fatcat-cli search releases '!doi_prefix:10.1371' '!pmid:*' '!doi:*' 'container_id:*' journal:plos 'doaj_id:*' --count + # 1511 + + fatcat-cli search releases '!doi_prefix:10.1371' '!pmid:*' '!doi:*' 'container_id:*' journal:plos 'doaj_id:*' '!title:correction' --count + # 35 + + fatcat-cli search releases '!doi_prefix:10.1371' 'container_id:*' journal:plos --count + # 2012 + +(note: the above were run while in the process of removing a lot of "RWTH" repo DOIs) + +Ok, after the batch fixups: + + fatcat-cli search releases '!doi_prefix:10.1371' 'container_id:*' journal:plos --count + 1507 + + fatcat-cli search releases '!doi_prefix:10.1371' 'container_id:*' journal:plos '!doaj_id:*' --count + 4 + +Will fix these up manually. The DOAJ cleanups will be more involved... should +probably add a simple blocklist in DOAJ article importer to skip attempts. diff --git a/extra/bulk_edits/2022-02-09_repo_dois_with_containerid.md b/extra/bulk_edits/2022-02-09_repo_dois_with_containerid.md new file mode 100644 index 00000000..25f74db9 --- /dev/null +++ b/extra/bulk_edits/2022-02-09_repo_dois_with_containerid.md @@ -0,0 +1,200 @@ + +Some institutional repositories register DOIs for pre-prints with the metadata +for the version of record included, including an ISSN number. This results in +the release entities getting the `container_id` of the actual journal, and showing +up in preservation dashboards, etc. 
+ +## Columbia University + +Here is an example search query, showing two works, both marked today as "PLoS Medicine": + + https://fatcat.wiki/release/search?q=%22Contraceptive+use+among+adolescent+and+young+women+in+North+and+South+Kivu%2C+Democratic+Republic+of+the+Congo%3A+A+cross-sectional+population-based+survey%22&generic=1 + +Some count queries: + + fatcat-cli search releases doi_prefix:10.7916 doi_registrar:datacite 'container_id:*' release_stage:published --count + # 10870 + + fatcat-cli search releases doi_prefix:10.7916 doi_registrar:datacite 'container_id:*' release_stage:published --entity-json -n0 \ + | rg '"Columbia University"' \ + | rg '"IsVariantFormOf"' \ + | pv -l \ + > /dev/null + # 10.7k 0:09:39 + +So, most of these. + +Let's update these to `release_stage=submitted` and `container_id=`. + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + # start small + fatcat-cli search releases doi_prefix:10.7916 doi_registrar:datacite 'container_id:*' release_stage:published --entity-json --limit 50 \ + | jq 'select(.container_id != null)' -c \ + | rg '"Columbia University"' \ + | rg '"IsVariantFormOf"' \ + | pv -l \ + | fatcat-cli batch update release release_stage=submitted container_id= --description "Remove container linkage for Columbia University repository deposits" + # editgroup_grxwpieqvvenxfaxwojnud4lla + + # full auto + fatcat-cli search releases doi_prefix:10.7916 doi_registrar:datacite 'container_id:*' release_stage:published --entity-json --limit 11000 \ + | jq 'select(.container_id != null)' -c \ + | rg '"Columbia University"' \ + | rg '"IsVariantFormOf"' \ + | pv -l \ + | fatcat-cli batch update release release_stage=submitted container_id= --description "Remove container linkage for Columbia University repository deposits" --auto-accept + +Also created a patch for fatcat datacite importer to not link these in the future. 
+ +## "RWTH Publications" + + https://fatcat.wiki/release/search?q=%22Predicting+survival+from+colorectal+cancer+histology+slides+using+deep+learning%3A+A+retrospective+multicenter+study%22&generic=1 + + doi_prefix:10.18154 + + + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' --count + # 11364 + + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite --count + # 11364 + + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite affiliation:RWTH --count + # 6257 + + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite --entity-json -n0 \ + | rg 'RWTH' \ + | rg '10.18154/rwth-' \ + | rg '"IsVariantFormOf"' \ + | pv -l \ + > /dev/null + # many/all? at least 5k, cut off there + +Ok, do updates: + + # start small + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite affiliation:RWTH --entity-json -n50 \ + | jq 'select(.container_id != null)' -c \ + | rg 'RWTH' \ + | rg '10.18154/rwth-' \ + | rg '"IsVariantFormOf"' \ + | fatcat-cli batch update release container_id= --description "Remove container linkage for RWTH repository deposits" + # Got 6257 hits in 1087ms + # editgroup_cb2vdn7npfg63muppawbhzrhjq + + # do the rest + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite affiliation:RWTH --entity-json -n12000 \ + | jq 'select(.container_id != null)' -c \ + | rg 'RWTH' \ + | rg '10.18154/rwth-' \ + | rg '"IsVariantFormOf"' \ + | pv -l \ + | fatcat-cli batch update release container_id= --description "Remove container linkage for RWTH repository deposits" --auto-accept + # Got 6207 hits in 696ms + # 6.00k 0:16:37 [6.01 /s] + +After that process, there were still many mis-matched DOIs, so relaxing +constraints. This repository *does* contain a bunch of publications from RWTH +itself (books, conference series, etc), so don't want to update everything. 
+ + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite '!journal:RWTH' '!container_id:m2cho7mmmbgxzdpfz7cmjgegbu' --count + # 3946 + + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite '!journal:RWTH' '!container_id:m2cho7mmmbgxzdpfz7cmjgegbu' --entity-json -n6000 \ + | jq 'select(.container_id != null)' -c \ + | rg 'RWTH' \ + | rg '10.18154/rwth-20' \ + | rg '"IsVariantFormOf"' \ + | pv -l \ + | fatcat-cli batch update release container_id= --description "Remove container linkage for RWTH repository deposits" --auto-accept + # Got 3946 hits in 77ms + +Specifically, some more PLOS ones: + + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite '!journal:RWTH' '!container_id:m2cho7mmmbgxzdpfz7cmjgegbu' journal:plos --count + # 338 + + fatcat-cli search releases doi_prefix:10.18154 'container_id:*' doi_registrar:datacite '!journal:RWTH' '!container_id:m2cho7mmmbgxzdpfz7cmjgegbu' 'journal:plos' --entity-json -n500 \ + | jq 'select(.container_id != null)' -c \ + | rg '10.18154/rwth-' \ + | pv -l \ + | fatcat-cli batch update release container_id= --description "Remove container linkage for RWTH repository deposits" --auto-accept + # Got 338 hits in 33ms + +## DESY Pre-Print Server (PUBDB) + + https://fatcat.wiki/release/search?q=%22viral+phosphatase+adaptor+that+promotes+herpes+simplex+virus+replication+and+spread%22+type%3Aarticle-journal+%21title%3Acorrection + + fatcat-cli search releases doi_prefix:10.3204 'container_id:*' doi_registrar:datacite publisher:DESY --count + # 313 + + fatcat-cli search releases doi_prefix:10.3204 'container_id:*' doi_registrar:datacite --count + # 6679 + + fatcat-cli search releases doi_prefix:10.3204 'container_id:*' doi_registrar:datacite --entity-json -n7000 \ + | jq 'select(.container_id != null)' -c \ + | rg '10.3204/(pubdb|phppubdb)-' \ + | rg '"IsVariantFormOf"' \ + | pv -l \ + > /dev/null + # at least hundreds + + # start 
small + fatcat-cli search releases doi_prefix:10.3204 'container_id:*' doi_registrar:datacite --entity-json -n50 \ + | jq 'select(.container_id != null)' -c \ + | rg '10.3204/(pubdb|phppubdb)-' \ + | rg '"IsVariantFormOf"' \ + | fatcat-cli batch update release container_id= --description "Remove container linkage for DESY repository deposits" + # Got 6679 hits in 368ms + # editgroup_vhcxvqjyinhxfplkoqjtprnxj4 + + fatcat-cli search releases doi_prefix:10.3204 'container_id:*' doi_registrar:datacite --entity-json -n7000 \ + | jq 'select(.container_id != null)' -c \ + | rg '10.3204/(pubdb|phppubdb)-' \ + | rg '"IsVariantFormOf"' \ + | fatcat-cli batch update release container_id= --description "Remove container linkage for DESY repository deposits" --auto-accept + + +## Kluedo: Publication Server of University of Kaiserslautern + +doi:10.26204/kluedo/6163 + + fatcat-cli search releases doi_prefix:10.26204 'container_id:*' --count + # 7 + +Whew, an easy one! + + fatcat-cli search releases doi_prefix:10.26204 'container_id:*' --entity-json -n50 \ + | jq 'select(.container_id != null)' -c \ + | rg '10.26204/kluedo/' \ + | fatcat-cli batch update release release_stage=submitted container_id= --description "Remove container linkage for 'Kluedo' repository deposits" + # Got 7 hits in 20ms + # editgroup_tmyyg4yl7vbg7mveyfcdxhptfu + +## Universitat Bayreuth + + doi:10.15495/epub_ubt_00005577 + + fatcat-cli search releases doi_prefix:10.15495 'container_id:*' --count + # 554 + +Great, also not very large. 
+ + # start small + fatcat-cli search releases doi_prefix:10.15495 'container_id:*' --entity-json -n50 \ + | jq 'select(.container_id != null)' -c \ + | rg '10.15495/epub_ubt_' \ + | fatcat-cli batch update release container_id= --description "Remove container linkage for University of Bayreuth repository deposits" + # 554 + # editgroup_6oubgez7jrfabprdckijvijsa4 + + fatcat-cli search releases doi_prefix:10.15495 'container_id:*' --entity-json -n600 \ + | jq 'select(.container_id != null)' -c \ + | rg '10.15495/epub_ubt_' \ + | fatcat-cli batch update release container_id= --description "Remove container linkage for University of Bayreuth repository deposits" --auto-accept + # did a variant with `publisher:Bayreuth`, which only matched a single release + # Got 503 hits in 310ms + +Could also have filtered on publisher "University of Bayreuth", in the post-fetch part. diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md index 8e5c5284..278dc1d8 100644 --- a/extra/bulk_edits/CHANGELOG.md +++ b/extra/bulk_edits/CHANGELOG.md @@ -11,6 +11,11 @@ This file should not turn in to a TODO list! ## 2022-02 +- removed `container_id` linkage for some Datacite DOI releases which are + repository deposits of published papers (eg, PLOS OA papers mirrored in + IRs). a few tens of thousands of releases. +- just over 150k "Revised Cambridge NCI database" chemical database DOIs + updated from 'article' to 'entry' - a few tens of thousands of Zenodo releases marked as spam (and `stub`) - tens of thousands of no-longer-used Crossref DOIs marked as `stub` - hundreds of test/dummy/null releases marked as `stub` -- cgit v1.2.3