diff options
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/chocula.py | 34 | 
1 files changed, 22 insertions, 12 deletions
| diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index d5d1cce8..d08a98e0 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -24,7 +24,7 @@ class ChoculaImporter(EntityImporter):              **kwargs)      def want(self, raw_record): -        if not raw_record.get('fatcat_ident') and not raw_record.get('_known_issnl'): +        if not raw_record.get('ident') and not raw_record.get('_known_issnl'):              self.counts['skip-unknown-new-issnl'] += 1              return False          if raw_record.get('issnl') and raw_record.get('name'): @@ -53,7 +53,8 @@ class ChoculaImporter(EntityImporter):          extra = dict()          for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country', -                  'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages'): +                  'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages', +                  'ia', 'scielo', 'kbart', 'publisher_type', 'platform'):              if row['extra'].get(k):                  extra[k] = row['extra'][k] @@ -115,30 +116,39 @@ class ChoculaImporter(EntityImporter):              do_update = True          if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])):              do_update = True -        for k in ('ezb', 'szczepanski', 'doaj'): +        for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'):              if ce.extra.get(k) and not existing.extra.get(k):                  do_update = True +        for k in ('kbart', 'ia', 'doaj'): +            # always update these fields if not equal (chocula override) +            if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k): +                do_update = True          if ce.publisher and not existing.publisher:              do_update = True          if ce.wikidata_qid and not existing.wikidata_qid:              do_update = True          if do_update: -            existing.wikidata_qid = ce.wikidata_qid -            existing.publisher = ce.publisher +            existing.wikidata_qid = existing.wikidata_qid or ce.wikidata_qid +            existing.publisher = existing.publisher or ce.publisher              existing.container_type = existing.container_type or ce.container_type              for k in ('urls', 'webarchive_urls'): -                # update, which might clobber, but won't remove +                # always update if available. should probably make this more +                # careful/subtle in the future! +                # note: in some cases we might *want* to remove existing (if +                # all URLs found to be bad), but being +                # conservative/inclusionist for now                  if ce.extra.get(k):                      existing.extra[k] = ce.extra.get(k, []) -                # note: in some cases we might *want* to clobber existing (if -                # all URLs found to be bad), but being conservative for now so -                # we don't clobber human edits -            for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb', -                      'szczepanski', 'doaj'): -                # update/overwrite, but don't remove any existing value +            for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia', +                      'scielo', 'kbart', 'publisher_type', 'platform'): +                # always update (chocula over-rides)                  if ce.extra.get(k):                      existing.extra[k] = ce.extra[k] +            for k in ('issne', 'issnp', 'country'): +                # only include if not set (don't clobber human edits) +                if ce.extra.get(k) and not existing.extra.get(k): +                    existing.extra[k] = ce.extra[k]              if ce.extra.get('languages'):                  if not existing.extra.get('languages'):                      existing.extra['languages'] = ce.extra['languages'] | 
