From f81069f4cb126af65f2e2fe08cde44077eea75e7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 4 Aug 2020 10:36:58 -0700 Subject: fix key name mismatch in chocula importer chocula 'export-fatcat' uses 'ident', not 'fatcat_ident' --- python/fatcat_tools/importers/chocula.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index d5d1cce8..1683c500 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -24,7 +24,7 @@ class ChoculaImporter(EntityImporter): **kwargs) def want(self, raw_record): - if not raw_record.get('fatcat_ident') and not raw_record.get('_known_issnl'): + if not raw_record.get('ident') and not raw_record.get('_known_issnl'): self.counts['skip-unknown-new-issnl'] += 1 return False if raw_record.get('issnl') and raw_record.get('name'): -- cgit v1.2.3 From 990af8b9b8dab3bcfde3f93e21d89b3a2f41dcd8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 4 Aug 2020 10:38:10 -0700 Subject: more update keys and cases for chocula importer --- python/fatcat_tools/importers/chocula.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 1683c500..356ffe42 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -53,7 +53,8 @@ class ChoculaImporter(EntityImporter): extra = dict() for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country', - 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages'): + 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages', + 'ia', 'scielo', 'kbart', 'publisher_type', 'platform'): if row['extra'].get(k): extra[k] = row['extra'][k] @@ -115,17 +116,21 @@ class ChoculaImporter(EntityImporter): do_update = True if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])): do_update = True - for k in ('ezb', 'szczepanski', 'doaj'): + for k in ('ezb', 'szczepanski', 'doaj', 'publisher_type', 'platform'): if ce.extra.get(k) and not existing.extra.get(k): do_update = True + for k in ('kbart', 'ia', 'doaj'): + # always update with these fields + if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k): + do_update = True if ce.publisher and not existing.publisher: do_update = True if ce.wikidata_qid and not existing.wikidata_qid: do_update = True if do_update: - existing.wikidata_qid = ce.wikidata_qid - existing.publisher = ce.publisher + existing.wikidata_qid = existing.wikidata_qid or ce.wikidata_qid + existing.publisher = existing.publisher or ce.publisher existing.container_type = existing.container_type or ce.container_type for k in ('urls', 'webarchive_urls'): # update, which might clobber, but won't remove @@ -135,7 +140,8 @@ class ChoculaImporter(EntityImporter): # all URLs found to be bad), but being conservative for now so # we don't clobber human edits for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb', - 'szczepanski', 'doaj'): + 'szczepanski', 'doaj', 'ia', 'scielo', 'kbart', + 'publisher_type', 'platform'): # update/overwrite, but don't remove any existing value if ce.extra.get(k): existing.extra[k] = ce.extra[k] -- cgit v1.2.3 From b675b60a183b57e276ba7044bd9ece0130f2c457 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 4 Aug 2020 12:39:35 -0700 Subject: chocula import update tweaks --- python/fatcat_tools/importers/chocula.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 356ffe42..d08a98e0 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -116,11 +116,11 @@ class ChoculaImporter(EntityImporter): do_update = True if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])): do_update = True - for k in ('ezb', 'szczepanski', 'doaj', 'publisher_type', 'platform'): + for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'): if ce.extra.get(k) and not existing.extra.get(k): do_update = True for k in ('kbart', 'ia', 'doaj'): - # always update with these fields + # always update these fields if not equal (chocula override) if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k): do_update = True if ce.publisher and not existing.publisher: @@ -133,18 +133,22 @@ class ChoculaImporter(EntityImporter): existing.publisher = existing.publisher or ce.publisher existing.container_type = existing.container_type or ce.container_type for k in ('urls', 'webarchive_urls'): - # update, which might clobber, but won't remove + # always update if available. should probably make this more + # careful/subtle in the future! + # note: in some cases we might *want* to remove existing (if + # all URLs found to be bad), but being + # conservative/inclusionist for now if ce.extra.get(k): existing.extra[k] = ce.extra.get(k, []) - # note: in some cases we might *want* to clobber existing (if - # all URLs found to be bad), but being conservative for now so - # we don't clobber human edits - for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb', - 'szczepanski', 'doaj', 'ia', 'scielo', 'kbart', - 'publisher_type', 'platform'): - # update/overwrite, but don't remove any existing value + for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia', + 'scielo', 'kbart', 'publisher_type', 'platform'): + # always update (chocula over-rides) if ce.extra.get(k): existing.extra[k] = ce.extra[k] + for k in ('issne', 'issnp', 'country'): + # only include if not set (don't clobber human edits) + if ce.extra.get(k) and not existing.extra.get(k): + existing.extra[k] = ce.extra[k] if ce.extra.get('languages'): if not existing.extra.get('languages'): existing.extra['languages'] = ce.extra['languages'] -- cgit v1.2.3