diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2022-07-12 20:43:21 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2022-07-12 20:43:21 -0700 |
commit | e9525e5a9fce2927048e10716fe86548e91824c6 (patch) | |
tree | f23e43f035a12d51f75e21de03de84792909f276 | |
parent | 1975b3267aa20925bb28598a57517a530458b538 (diff) | |
download | fatcat-e9525e5a9fce2927048e10716fe86548e91824c6.tar.gz fatcat-e9525e5a9fce2927048e10716fe86548e91824c6.zip |
cleanup: DOAJ missing container_id
-rw-r--r-- | extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md | 38 | ||||
-rw-r--r-- | extra/bulk_edits/CHANGELOG.md | 3 |
2 files changed, 41 insertions, 0 deletions
diff --git a/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md b/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md new file mode 100644 index 00000000..b17e799d --- /dev/null +++ b/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md @@ -0,0 +1,38 @@ + +There is a batch of about 480 releases with DOAJ identifiers but no container +linkage. These seem to all be from the same actual container: + + fatcat-cli search releases 'doaj_id:*' '!container_id:*' --count + # 486 + + fatcat-cli search releases 'doaj_id:*' '!container_id:*' --index-json -n 0 | jq .containe + # Got 486 hits in 138ms + # "Revista de Sistemas, Cibernética e Informática" + +Edit pipeline: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + # start small + fatcat-cli search releases 'doaj_id:*' '!container_id:*' 'journal:Cibernética' --entity-json --limit 50 \ + | jq 'select(.container_id == null)' -c \ + | rg 'Cibernética' \ + | fatcat-cli batch update release container_id=ubwuhr4obzgr7aadszhurhef5m --description "Add container linkage for DOAJ articles with ISSN 1690-8627" + # editgroup_g2zrm3wkmneoldtqfxpbkaoeh4 + +Looks good, merged. + + # full auto + fatcat-cli search releases 'doaj_id:*' '!container_id:*' 'journal:Cibernética' --entity-json --limit 500 \ + | jq 'select(.container_id == null)' -c \ + | rg 'Cibernética' \ + | fatcat-cli batch update release container_id=ubwuhr4obzgr7aadszhurhef5m --description "Add container linkage for DOAJ articles with ISSN 1690-8627" --auto-accept + +Verify: + + fatcat-cli search releases 'doaj_id:*' '!container_id:*' --count + # 0 + +Also planning to have DOAJ article importer 'skip' in the future for articles +with no `container_id` match. diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md index 9acf8ccd..f7b9e536 100644 --- a/extra/bulk_edits/CHANGELOG.md +++ b/extra/bulk_edits/CHANGELOG.md @@ -13,6 +13,9 @@ This file should not turn in to a TODO list! Ran a journal-level metadata update, using chocula. +Cleaned up just under 500 releases with missing `container_id` from an older +DOAJ article import. + ## 2022-04 |