From 3d340705c08c855d6eb89f2f4e491601613c6a0d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 3 Sep 2019 15:06:24 -0700
Subject: last chocula import behavior tweaks

---
 python/fatcat_tools/importers/chocula.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 25edccee..d7044ff4 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -46,9 +46,14 @@ class ChoculaImporter(EntityImporter):
             # Name is required (by schema)
             return None
 
+        name = name.strip()
+
         if name.endswith(',  Proceedings of the'):
             name = "Proceedings of the " + name.split(',')[0]
 
+        if name.endswith('.'):
+            name = name[:-1]
+
         extra = dict()
         for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country',
                   'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages'):
@@ -106,8 +111,13 @@ class ChoculaImporter(EntityImporter):
         do_update = False
         if not existing.extra:
             existing.extra = dict()
-        if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
+        if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
             do_update = True
+        if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])):
+            do_update = True
+        for k in ('ezb', 'szczepanski', 'doaj'):
+            if ce.extra.get(k) and not existing.extra.get(k):
+                do_update = True
         if ce.publisher and not existing.publisher:
             do_update = True
         if ce.wikidata_qid and not existing.wikidata_qid:
@@ -117,8 +127,16 @@ class ChoculaImporter(EntityImporter):
             existing.wikidata_qid = ce.wikidata_qid
             existing.publisher = ce.publisher
             existing.container_type = existing.container_type or ce.container_type
-            for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country',
-                      'sherpa_romeo', 'ezb', 'szczepanski', 'doaj'):
+            for k in ('urls', 'webarchive_urls'):
+                # replace the existing value, or remove it when the new record
+                # has none; dead URLs often need to be dropped
+                if ce.extra.get(k):
+                    existing.extra[k] = ce.extra.get(k, [])
+                elif k in existing.extra.keys():
+                    existing.extra.pop(k)
+            for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb',
+                      'szczepanski', 'doaj'):
+                # update, but don't remove any existing value
                 if ce.extra.get(k):
                     existing.extra[k] = ce.extra[k]
             if ce.extra.get('languages'):
-- 
cgit v1.2.3