diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-09-03 15:06:24 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-09-03 15:06:24 -0700 |
commit | 3d340705c08c855d6eb89f2f4e491601613c6a0d (patch) | |
tree | 25502d9ff0bab0625779d61db5b0d65553e7df5a | |
parent | 466953355f3e2a18cbdb93ca39d077103703f049 (diff) | |
download | fatcat-3d340705c08c855d6eb89f2f4e491601613c6a0d.tar.gz fatcat-3d340705c08c855d6eb89f2f4e491601613c6a0d.zip |
last chocula import behavior tweaks
-rw-r--r-- | python/fatcat_tools/importers/chocula.py | 24 |
1 file changed, 21 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 25edccee..d7044ff4 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -46,9 +46,14 @@ class ChoculaImporter(EntityImporter): # Name is required (by schema) return None + name = name.strip() + if name.endswith(', Proceedings of the'): name = "Proceedings of the " + name.split(',')[0] + if name.endswith('.'): + name = name[:-1] + extra = dict() for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country', 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages'): @@ -106,8 +111,13 @@ class ChoculaImporter(EntityImporter): do_update = False if not existing.extra: existing.extra = dict() - if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): + if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): do_update = True + if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])): + do_update = True + for k in ('ezb', 'szczepanski', 'doaj'): + if ce.extra.get(k) and not existing.extra.get(k): + do_update = True if ce.publisher and not existing.publisher: do_update = True if ce.wikidata_qid and not existing.wikidata_qid: @@ -117,8 +127,16 @@ class ChoculaImporter(EntityImporter): existing.wikidata_qid = ce.wikidata_qid existing.publisher = ce.publisher existing.container_type = existing.container_type or ce.container_type - for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country', - 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj'): + for k in ('urls', 'webarchive_urls'): + # update, or clobber/remove any existing values. 
often + # want/need to remove dead URLs + if ce.extra.get(k): + existing.extra[k] = ce.extra.get(k, []) + elif k in existing.extra.keys(): + existing.extra.pop(k) + for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb', + 'szczepanski', 'doaj'): + # update, but don't remove any existing value if ce.extra.get(k): existing.extra[k] = ce.extra[k] if ce.extra.get('languages'): |