From e9525e5a9fce2927048e10716fe86548e91824c6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 12 Jul 2022 20:43:21 -0700 Subject: cleanup: DOAJ missing container_id --- ...2022-07-12_cleanup_doaj_missing_container_id.md | 38 ++++++++++++++++++++++ extra/bulk_edits/CHANGELOG.md | 3 ++ 2 files changed, 41 insertions(+) create mode 100644 extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md diff --git a/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md b/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md new file mode 100644 index 00000000..b17e799d --- /dev/null +++ b/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md @@ -0,0 +1,38 @@ + +There is a batch of about 480 releases with DOAJ identifiers but no container +linkage. These seem to all be from the same actual container: + + fatcat-cli search releases 'doaj_id:*' '!container_id:*' --count + # 486 + + fatcat-cli search releases 'doaj_id:*' '!container_id:*' --index-json -n 0 | jq .container_name + # Got 486 hits in 138ms + # "Revista de Sistemas, Cibernética e Informática" + +Edit pipeline: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + # start small + fatcat-cli search releases 'doaj_id:*' '!container_id:*' 'journal:Cibernética' --entity-json --limit 50 \ + | jq 'select(.container_id == null)' -c \ + | rg 'Cibernética' \ + | fatcat-cli batch update release container_id=ubwuhr4obzgr7aadszhurhef5m --description "Add container linkage for DOAJ articles with ISSN 1690-8627" + # editgroup_g2zrm3wkmneoldtqfxpbkaoeh4 + +Looks good, merged. 
+ + # full auto + fatcat-cli search releases 'doaj_id:*' '!container_id:*' 'journal:Cibernética' --entity-json --limit 500 \ + | jq 'select(.container_id == null)' -c \ + | rg 'Cibernética' \ + | fatcat-cli batch update release container_id=ubwuhr4obzgr7aadszhurhef5m --description "Add container linkage for DOAJ articles with ISSN 1690-8627" --auto-accept + +Verify: + + fatcat-cli search releases 'doaj_id:*' '!container_id:*' --count + # 0 + +Also planning to have DOAJ article importer 'skip' in the future for articles +with no `container_id` match. diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md index 9acf8ccd..f7b9e536 100644 --- a/extra/bulk_edits/CHANGELOG.md +++ b/extra/bulk_edits/CHANGELOG.md @@ -13,6 +13,9 @@ This file should not turn in to a TODO list! Ran a journal-level metadata update, using chocula. +Cleaned up just under 500 releases with missing `container_id` from an older +DOAJ article import. + ## 2022-04 -- cgit v1.2.3