about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-09-03 15:06:24 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-09-03 15:06:24 -0700
commit: 3d340705c08c855d6eb89f2f4e491601613c6a0d (patch)
tree: 25502d9ff0bab0625779d61db5b0d65553e7df5a /python
parent: 466953355f3e2a18cbdb93ca39d077103703f049 (diff)
download: fatcat-3d340705c08c855d6eb89f2f4e491601613c6a0d.tar.gz
fatcat-3d340705c08c855d6eb89f2f4e491601613c6a0d.zip
last chocula import behavior tweaks
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/importers/chocula.py | 24
1 files changed, 21 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 25edccee..d7044ff4 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -46,9 +46,14 @@ class ChoculaImporter(EntityImporter):
# Name is required (by schema)
return None
+ name = name.strip()
+
if name.endswith(', Proceedings of the'):
name = "Proceedings of the " + name.split(',')[0]
+ if name.endswith('.'):
+ name = name[:-1]
+
extra = dict()
for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country',
'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages'):
@@ -106,8 +111,13 @@ class ChoculaImporter(EntityImporter):
do_update = False
if not existing.extra:
existing.extra = dict()
- if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
+ if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
do_update = True
+ if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])):
+ do_update = True
+ for k in ('ezb', 'szczepanski', 'doaj'):
+ if ce.extra.get(k) and not existing.extra.get(k):
+ do_update = True
if ce.publisher and not existing.publisher:
do_update = True
if ce.wikidata_qid and not existing.wikidata_qid:
@@ -117,8 +127,16 @@ class ChoculaImporter(EntityImporter):
existing.wikidata_qid = ce.wikidata_qid
existing.publisher = ce.publisher
existing.container_type = existing.container_type or ce.container_type
- for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country',
- 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj'):
+ for k in ('urls', 'webarchive_urls'):
+ # update, or clobber/remove any existing values. often
+ # want/need to remove dead URLs
+ if ce.extra.get(k):
+ existing.extra[k] = ce.extra.get(k, [])
+ elif k in existing.extra.keys():
+ existing.extra.pop(k)
+ for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb',
+ 'szczepanski', 'doaj'):
+ # update, but don't remove any existing value
if ce.extra.get(k):
existing.extra[k] = ce.extra[k]
if ce.extra.get('languages'):