aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-08-04 12:39:35 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2020-08-04 12:39:35 -0700
commit b675b60a183b57e276ba7044bd9ece0130f2c457 (patch)
tree 909a2590e748d07eb721142515763b142ec60859
parent 990af8b9b8dab3bcfde3f93e21d89b3a2f41dcd8 (diff)
download fatcat-b675b60a183b57e276ba7044bd9ece0130f2c457.tar.gz
fatcat-b675b60a183b57e276ba7044bd9ece0130f2c457.zip
chocula import update tweaks
-rw-r--r-- python/fatcat_tools/importers/chocula.py 24
1 files changed, 14 insertions, 10 deletions
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 356ffe42..d08a98e0 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -116,11 +116,11 @@ class ChoculaImporter(EntityImporter):
do_update = True
if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])):
do_update = True
- for k in ('ezb', 'szczepanski', 'doaj', 'publisher_type', 'platform'):
+ for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'):
if ce.extra.get(k) and not existing.extra.get(k):
do_update = True
for k in ('kbart', 'ia', 'doaj'):
- # always update with these fields
+ # always update these fields if not equal (chocula override)
if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k):
do_update = True
if ce.publisher and not existing.publisher:
@@ -133,18 +133,22 @@ class ChoculaImporter(EntityImporter):
existing.publisher = existing.publisher or ce.publisher
existing.container_type = existing.container_type or ce.container_type
for k in ('urls', 'webarchive_urls'):
- # update, which might clobber, but won't remove
+ # always update if available. should probably make this more
+ # careful/subtle in the future!
+ # note: in some cases we might *want* to remove existing (if
+ # all URLs found to be bad), but being
+ # conservative/inclusionist for now
if ce.extra.get(k):
existing.extra[k] = ce.extra.get(k, [])
- # note: in some cases we might *want* to clobber existing (if
- # all URLs found to be bad), but being conservative for now so
- # we don't clobber human edits
- for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb',
- 'szczepanski', 'doaj', 'ia', 'scielo', 'kbart',
- 'publisher_type', 'platform'):
- # update/overwrite, but don't remove any existing value
+ for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia',
+ 'scielo', 'kbart', 'publisher_type', 'platform'):
+ # always update (chocula over-rides)
if ce.extra.get(k):
existing.extra[k] = ce.extra[k]
+ for k in ('issne', 'issnp', 'country'):
+ # only include if not set (don't clobber human edits)
+ if ce.extra.get(k) and not existing.extra.get(k):
+ existing.extra[k] = ce.extra[k]
if ce.extra.get('languages'):
if not existing.extra.get('languages'):
existing.extra['languages'] = ce.extra['languages']