From 1177dafb9b185c7b749ff95ded1a0720792fbb5e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 15:15:44 -0800 Subject: chocula importer: tweak counting, conditions for doing updates --- python/fatcat_tools/importers/chocula.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 1ddefa5e..c2f2199d 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -31,9 +31,12 @@ class ChoculaImporter(EntityImporter): if not raw_record.get("ident") and not raw_record.get("_known_issnl"): self.counts["skip-unknown-new-issnl"] += 1 return False + if raw_record.get("issnl") and raw_record.get("name"): return True - return False + else: + self.counts["skip-partial-metadata"] += 1 + return False def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]: """ @@ -125,23 +128,14 @@ class ChoculaImporter(EntityImporter): return False if not existing.extra: existing.extra = dict() - if ce.extra.get("urls") and set(ce.extra.get("urls", [])) != set( - existing.extra.get("urls", []) - ): + if ce.extra.get("urls") and not ce.extra.get("urls", []): do_update = True - if ce.extra.get("webarchive_urls") and set(ce.extra.get("webarchive_urls", [])) != set( - existing.extra.get("webarchive_urls", []) - ): + if ce.extra.get("webarchive_urls") and not ce.extra.get("webarchive_urls", []): do_update = True - for k in ("ezb", "szczepanski", "publisher_type", "platform"): - if ce.extra.get(k) and not existing.extra.get(k): - do_update = True for k in ("kbart", "ia", "doaj"): # always update these fields if not equal (chocula override) if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k): do_update = True - if ce.publisher and not existing.publisher: - do_update = True if ce.wikidata_qid and not existing.wikidata_qid: do_update = True @@ -171,13 +165,11 @@ class ChoculaImporter(EntityImporter): "ia", "scielo", "kbart", - "publisher_type", - "platform", ): # always update (chocula over-rides) if ce.extra.get(k): existing.extra[k] = ce.extra[k] - for k in ("country",): + for k in ("country", "publisher_type", "platform"): # only include if not set (don't clobber human edits) if ce.extra.get(k) and not existing.extra.get(k): existing.extra[k] = ce.extra[k] -- cgit v1.2.3