From 6bc3e4bbcdf9db420aab5a712a716c6c6a5bd6d7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 Jul 2022 14:26:26 -0700 Subject: container publisher_type cleanup notes --- extra/bulk_edits/CHANGELOG.md | 4 ++ extra/cleanups/container_publisher_type.md | 100 +++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 extra/cleanups/container_publisher_type.md diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md index 732cbb2f..3c7be454 100644 --- a/extra/bulk_edits/CHANGELOG.md +++ b/extra/bulk_edits/CHANGELOG.md @@ -27,6 +27,10 @@ more than a year. Imported at least 400 new dblp containers, and an unknown number of new dblp release entities. +Cleaned up about a thousand containers with incorrect `publisher_type`, based +on current publisher name. Further updates will populate after the next chocula +import. + ## 2022-04 diff --git a/extra/cleanups/container_publisher_type.md b/extra/cleanups/container_publisher_type.md new file mode 100644 index 00000000..dba800d3 --- /dev/null +++ b/extra/cleanups/container_publisher_type.md @@ -0,0 +1,100 @@ + +A bunch of MDPI journals are incorrectly listed as 'longtail'. + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --count + # 245 + +Because this is 'extra' metadata, need a little python script to change the +metadata (fatcat-cli doesn't have this feature yet): + + import sys + import json + + publisher_type = sys.argv[1].strip().lower() + #print(publisher_type, file=sys.stderr) + + for line in sys.stdin: + if not line.strip(): + continue + container = json.loads(line) + container["extra"]["publisher_type"] = publisher_type + print(json.dumps(container)) + +Run some cleanups: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] 
+ export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --entity-json --limit 50 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" + # editgroup_oum6mnkl2rbn3jaua4a2gdlj5q + +Looks good, run the rest: + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + +Some more cleanup patterns: + + fatcat-cli search container 'publisher:"Frontiers Media SA" publisher_type:* !publisher_type:oa' --count + # 84 + + fatcat-cli search container 'publisher:"Frontiers Media SA" publisher_type:* !publisher_type:oa' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"Walter de Gruyter" publisher_type:* !publisher_type:commercial !publisher_type:archive' --count + # 47 + + fatcat-cli search container 'publisher:"Walter de Gruyter" publisher_type:* !publisher_type:commercial !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "commercial")' -c \ + | python3 ./container_publisher_type.py commercial \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"springer" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 56 + + fatcat-cli search container 'publisher:"springer" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 
'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"elsevier" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 98 + + fatcat-cli search container 'publisher:"elsevier" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"wiley" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 37 + + fatcat-cli search container 'publisher:"wiley" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:taylor publisher:francis publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 558 + + fatcat-cli search container 'publisher:taylor publisher:francis publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:sage publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 28 + + fatcat-cli search container 'publisher:sage publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + 
| fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + +Overall, around a thousand containers were updated. The corresponding changes to releases will not be +reflected until those releases are re-indexed. -- cgit v1.2.3