From 4ed7afa928cde58b3f2aa3283ce61e46e18f1554 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 29 Jul 2022 18:40:06 -0700 Subject: chocula update notes --- extra/bulk_edits/2022-07-29_chocula.md | 47 ++++++++++++++++++++++++++++++++++ extra/bulk_edits/CHANGELOG.md | 3 +++ 2 files changed, 50 insertions(+) create mode 100644 extra/bulk_edits/2022-07-29_chocula.md diff --git a/extra/bulk_edits/2022-07-29_chocula.md b/extra/bulk_edits/2022-07-29_chocula.md new file mode 100644 index 00000000..1f6f36ca --- /dev/null +++ b/extra/bulk_edits/2022-07-29_chocula.md @@ -0,0 +1,47 @@ + +Periodic import of chocula metadata updates. + +In particular, expecting a bunch of `publisher_type` updates. + +Going to explicitly not do DOAJ-only updates this time around. That is, if +container would have been updated, then new DOAJ 'extra' metadata will pass +through. But don't only update entity for this reason. This is to reduce churn +based only on the `as-of` key. Should probably change the behavior next time +around. + +## Prod Import + + date + # Sat Jul 30 01:18:41 UTC 2022 + + git log -n1 + # 5ecf72cbb488a9a50eb869ea55b4c2bfc1440731 + + diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py + index 38802bcb..762c44dd 100644 + --- a/python/fatcat_tools/importers/chocula.py + +++ b/python/fatcat_tools/importers/chocula.py + @@ -139,7 +139,7 @@ class ChoculaImporter(EntityImporter): + if ce.extra.get("publisher_type") and not ce.extra.get("publisher_type"): + # many older containers were missing this metadata + do_update = True + - for k in ("kbart", "ia", "doaj"): + + for k in ("kbart", "ia"): + # always update these fields if not equal (chocula override) + if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k): + do_update = True + + export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...] + shuf -n100 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates - + # Counter({'total': 100, 'exists': 98, 'exists-skip-update': 98, 'update': 2, 'skip': 0, 'insert': 0}) + + shuf -n1000 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates - + # Counter({'total': 1000, 'exists': 986, 'exists-skip-update': 986, 'update': 12, 'insert': 2, 'skip': 0}) + +Huh, not seeing any `publisher_type` changes, which I was expecting more of. + + time cat /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates - + # Counter({'total': 188506, 'exists': 185808, 'exists-skip-update': 185806, 'update': 2495, 'insert': 203, 'exists-by-issnl': 2, 'skip': 0}) + +Looking through the changelog, some did through with `publisher_type` updates. +Whew! diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md index 3c7be454..716c95d6 100644 --- a/extra/bulk_edits/CHANGELOG.md +++ b/extra/bulk_edits/CHANGELOG.md @@ -31,6 +31,9 @@ Cleaned up about a thousand containers with incorrect `publisher_type`, based on current publisher name. Further updates will populate after the next chocula import. +Ran a second batch of journal-level metadata updates, from chocula, resulting +in a couple thousand updated entities. + ## 2022-04 -- cgit v1.2.3