aboutsummaryrefslogtreecommitdiffstats
path: root/extra/bulk_edits
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2022-02-04 20:08:25 -0800
committerBryan Newbold <bnewbold@robocracy.org>2022-02-04 20:08:25 -0800
commit752be5426178f4730326c6ca89d1fc20135a2509 (patch)
tree4f36010504389fa6d2594210e232e00617d69087 /extra/bulk_edits
parentd0b9e988aa3a5ee3cc44ab1cb6f86b524c71d056 (diff)
downloadfatcat-752be5426178f4730326c6ca89d1fc20135a2509.tar.gz
fatcat-752be5426178f4730326c6ca89d1fc20135a2509.zip
bulk metadata edit log
Diffstat (limited to 'extra/bulk_edits')
-rw-r--r--extra/bulk_edits/2022-02-04_deleted_dois.md154
-rw-r--r--extra/bulk_edits/2022-02-04_zenodo_spam.md64
-rw-r--r--extra/bulk_edits/CHANGELOG.md5
3 files changed, 223 insertions, 0 deletions
diff --git a/extra/bulk_edits/2022-02-04_deleted_dois.md b/extra/bulk_edits/2022-02-04_deleted_dois.md
new file mode 100644
index 00000000..83c7d697
--- /dev/null
+++ b/extra/bulk_edits/2022-02-04_deleted_dois.md
@@ -0,0 +1,154 @@
+
+## Wild Volume/Issue Numbers
+
+ fatcat-cli search release --count 'volume:99999 author:xxxxxxxxxx doi:* !release_type:stub'
+ # 37
+
+A number of these have duplicated PMID/PMCID.
+
+Should update with:
+
+ release_type:stub release_stage: pmid: pmcid: wikidata_qid: volume: issue: pages:
+
+ export FATCAT_AUTH_WORKER_CLEANUP=[...]
+ export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP
+ fatcat-cli search releases 'volume:99999 author:xxxxxxxxxx doi:* !release_type:stub' --entity-json --limit 50 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub volume= issue= container_id= pages= pmid= pmcid= wikidata_qid= --description "Cleanup of de-registered/stub Crossref DOIs"
+ # editgroup_exitdv37d5h5zlhmnc6bkwpz6a
+
+This small batch seems to just be partial/bad metadata, but real releases:
+
+ fatcat-cli search release --count 'volume:9999 issue:9999 container_id:4ozjmpq3dvd2xjdnavdvvq3bam'
+ # 7
+
+ fatcat-cli search releases 'volume:9999 issue:9999 container_id:4ozjmpq3dvd2xjdnavdvvq3bam' --entity-json --limit 50 \
+ | jq 'select(.volume == "9999")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub volume= issue= container_id= pages= pmid= pmcid= wikidata_qid= --description "Cleanup of bad volume/issue numbers"
+ # editgroup_bba357rix5g4znbyyz5pu4tjki
+
+Oops, that was too aggressive, not merging.
+
+ fatcat-cli search releases 'volume:9999 issue:9999 container_id:4ozjmpq3dvd2xjdnavdvvq3bam' --entity-json --limit 50 \
+ | jq 'select(.volume == "9999")' -c \
+ | pv -l \
+ | fatcat-cli batch update release volume= issue= --description "Cleanup of bad volume/issue numbers"
+ # editgroup_vablvgsdpvexvf55zerugkcm6q
+
+Did some other manual cleanups.
+
+These are just bad metadata, not stubs:
+
+ fatcat-cli search release 'volume:999 issue:999' --count
+ # 456
+
+ # first limit 50 with no auto-merge, then ran the remainder
+ fatcat-cli search releases 'volume:999 issue:999' --entity-json --limit 50 \
+ | jq 'select(.volume == "999")' -c \
+ | pv -l \
+ | fatcat-cli batch update release volume= issue= --description "Cleanup of bad volume/issue numbers"
+ # editgroup_xsmvljqware4reixxw7xhuywqq
+
+ # ok, now auto for the rest
+ fatcat-cli search releases 'volume:999 issue:999' --entity-json --limit 500 \
+ | jq 'select(.volume == "999")' -c \
+ | pv -l \
+ | fatcat-cli batch update release volume= issue= --description "Cleanup of bad volume/issue numbers" --auto-accept
+
+## "CrossRef Listing Of Deleted DOIs"
+
+42 releases have the same container, which was misnamed: `container_5hsepvqrxrakvcg4to77yuhbdi`
+
+Updated that container manually.
+
+ fatcat-cli search releases 'publisher:"Test accounts" journal:"CrossRef Listing of Deleted DOIs" doi:* !release_type:stub' --count
+ # 52773
+
+ # start small
+ fatcat-cli search releases 'publisher:"Test accounts" journal:"CrossRef Listing of Deleted DOIs" doi:* !release_type:stub' --entity-json --limit 50 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub volume= issue= container_id= pages= pmid= pmcid= wikidata_qid= --description "Cleanup of de-registered/stub Crossref DOIs"
+ # editgroup_hhdr2ptknjemrjwx7kum6a4c6y
+
+Looks good, though there is not really any point in clearing volume/issue/pages
+if we are removing `container_id`, so I will leave those fields alone.
+
+ fatcat-cli search releases 'publisher:"Test accounts" journal:"CrossRef Listing of Deleted DOIs" doi:* !release_type:stub' --entity-json --limit 53000 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub container_id= pmid= pmcid= wikidata_qid= --description "Cleanup of de-registered/stub Crossref DOIs" --auto-accept
+
+
+## "Test Papers"
+
+ fatcat-cli search releases 'title:"test paper" title:ignore author:Alejandro container_id:tol7woxlqjeg5bmzadeg6qrg3e' --count
+ # 38
+
+ fatcat-cli search releases 'title:"test paper" title:ignore author:Alejandro container_id:tol7woxlqjeg5bmzadeg6qrg3e' --entity-json --limit 50 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub --description "Mark 'testing' / 'debug' works as stubs"
+ # editgroup_ikgq6flyhjds7mxe3pig3pvduu
+
+ fatcat-cli search releases 'title:ABCDEF author:EFGH container_id:"dem5zlrvj5fozg4qkmp46jeb4a" !release_type:stub' --count
+ 104
+
+ fatcat-cli search releases 'title:ABCDEF author:EFGH container_id:"dem5zlrvj5fozg4qkmp46jeb4a" !release_type:stub' --entity-json --limit 200 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub --description "Mark 'testing' / 'debug' works as stubs"
+ # editgroup_x2ki7xw35nablf3trjt43zxhpm
+ # editgroup_2kinca3wgbgujiu634l6g2bxpq
+ # editgroup_utkdkgvfcvbu5nr4ebiz35a5m4
+
+ fatcat-cli search releases 'doi_prefix:10.1254 title:ABCDEF author:EFGH !type:stub' --entity-json --limit 50 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub volume= release_stage= --description "Mark 'testing' / 'debug' works as stubs"
+ # editgroup_ge26fv3gqncvde47peu67dwkn4
+
+
+## Bogus DOIs (10.5555)
+
+ fatcat-cli search releases 'doi_prefix:10.5555 pmcid:* !type:stub' --count
+ 133
+
+ fatcat-cli search releases 'doi_prefix:10.5555 pmcid:* !type:stub container_id:w46j4of25bd4bjrfy4botn5ezi' --count
+ 119
+
+These seem to all be bogus, never-registered DOIs. Going to remove them from the release entities.
+
+ fatcat-cli search releases 'doi_prefix:10.5555 pmcid:* !type:stub container_id:w46j4of25bd4bjrfy4botn5ezi' --entity-json --limit 120 \
+ | pv -l \
+ | fatcat-cli batch update release doi= --description "Remove some non-existant DOIs from PMCID works"
+ # editgroup_5xzb5d2fh5goremlrrwtlp372i
+ # editgroup_rkcgwcideza4vguje2vonsyeua
+ # editgroup_nfh7yg4l75fjdbjsvle2otz4ee
+
+## PsycEXTRA
+
+ fatcat-cli search releases 'journal:PsycEXTRA publisher:"Test accounts" doi_registrar:crossref !type:stub' --count
+ 13354
+
+Not sure what the deal is. These seem to all have been de-registered? But not
+confident enough to run import. We have many of these crawled and archived.
+
+## null/null DOIs
+
+ fatcat-cli search releases 'title:null author:null !type:stub' --count
+ 16
+
+These are not all necessarily deleted. Went through manually. Many seemed to be withdrawn, not stubs.
+
+## Known Crossref Test Stuff
+
+For now, not going to remove or mark these.
+
+"The Journal Of Test Deposits": https://fatcat.wiki/container/7wqkwve2ezbtfn7gkorcuvjd3m
+
+"Journal of Psychoceramics": https://fatcat.wiki/container/u6q4326uzjak3jx7qjmj7742ea
+
+"Annals of Psychoceramics B": https://fatcat.wiki/container/ywwmljqajvam7gzhwpjmvahs5y
diff --git a/extra/bulk_edits/2022-02-04_zenodo_spam.md b/extra/bulk_edits/2022-02-04_zenodo_spam.md
new file mode 100644
index 00000000..a95f5180
--- /dev/null
+++ b/extra/bulk_edits/2022-02-04_zenodo_spam.md
@@ -0,0 +1,64 @@
+
+## Cleanup Zenodo Spam
+
+ fatcat-cli search releases 'year:2021 title:"DOWNLOAD MP3:" doi_prefix:10.5281 !journal:* in_ia:false' --count
+ # 29653
+
+ fatcat-cli search releases 'year:2021 title:"DOWNLOAD MP3:" title:"album download" doi_prefix:10.5281 !journal:* in_ia:false' --count
+ # 29043
+
+Let's nuke 'em:
+
+ # start small, not automatic
+ fatcat-cli search releases 'year:2021 title:"DOWNLOAD MP3:" title:"album download" doi_prefix:10.5281 !journal:* in_ia:false !release_type:stub' --entity-json --limit 50 \
+ | jq 'select(.release_type != "stub")' -c \
+ | fatcat-cli batch update release release_type=stub withdrawn_status=spam --description "Mark Zenodo spam as such"
+ # editgroup_yizvqtz24bfv3jd6vmawiqnojm
+
+ # ok, scale it up!
+ fatcat-cli search releases 'year:2021 title:"DOWNLOAD MP3:" title:"album download" doi_prefix:10.5281 !journal:* in_ia:false !release_type:stub' --entity-json --limit 30000 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub withdrawn_status=spam --description "Mark Zenodo spam as such" --auto-accept
+
+Another pattern:
+
+ fatcat-cli search releases 'year:2021 title:"download album" title:"zip mp3" author:download doi_prefix:10.5281 !journal:* in_ia:false !release_type:stub' --count
+ # 14376
+
+ fatcat-cli search releases 'year:2021 title:"download album" title:"zip mp3" author:download doi_prefix:10.5281 !journal:* in_ia:false !release_type:stub' --entity-json --limit 30000 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub withdrawn_status=spam --description "Mark Zenodo spam as such" --auto-accept
+
+Did some manual patterns.
+
+Another pattern; checking manually all looks like spam:
+
+ fatcat-cli search releases 'title:"live stream free" doi_prefix:10.5281 !type:stub' --count
+ # 176
+
+ fatcat-cli search releases 'title:"live stream free" doi_prefix:10.5281 !type:stub' --entity-json --limit 200 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub withdrawn_status=spam --description "Mark Zenodo spam as such" --auto-accept
+ # done
+
+Another large pattern:
+
+ fatcat-cli search releases 'year:2021 title:"Full Album Download" title:mp3 author:download doi_prefix:10.5281 !journal:* in_ia:false !release_type:stub' --count
+ # 14800
+
+ fatcat-cli search releases 'year:2021 title:"Full Album Download" title:mp3 author:download doi_prefix:10.5281 !journal:* in_ia:false !release_type:stub' --entity-json --limit 15000 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub withdrawn_status=spam --description "Mark Zenodo spam as such" --auto-accept
+
+Small pattern:
+
+ fatcat-cli search releases 'gomovies !release_type:stub' --entity-json --limit 50 \
+ | jq 'select(.release_type != "stub")' -c \
+ | pv -l \
+ | fatcat-cli batch update release release_type=stub withdrawn_status=spam --description "Mark Zenodo spam as such"
+ # editgroup_fzumcytmljfebldd5az643wqmi
+
diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md
index 6156721c..8e5c5284 100644
--- a/extra/bulk_edits/CHANGELOG.md
+++ b/extra/bulk_edits/CHANGELOG.md
@@ -9,6 +9,11 @@ this file should probably get merged into the guide at some point.
This file should not turn in to a TODO list!
+## 2022-02
+
+- a few tens of thousands of Zenodo releases marked as spam (and `stub`)
+- tens of thousands of no-longer-used Crossref DOIs marked as `stub`
+- hundreds of test/dummy/null releases marked as `stub`
## 2021-11