From 22a277c80ecfe28ce21b7ce215ee38f25a702658 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 29 Jan 2019 15:56:34 -0800 Subject: fix bug in clean() resulting in many consistency check fails --- python/fatcat_tools/importers/common.py | 5 +++-- python/fatcat_tools/importers/crossref.py | 19 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 7c2ce400..32f7b4d5 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -28,7 +28,7 @@ def clean(thing, force_xml=False): Also strips extra whitespace. """ if not thing: - return thing + return None fix_entities = 'auto' if force_xml: fix_entities = True @@ -41,7 +41,8 @@ def clean(thing, force_xml=False): def test_clean(): assert clean(None) == None - assert clean('') == '' + assert clean('') == None + assert clean('1') == None assert clean('123') == '123' assert clean('a&b') == 'a&b' assert clean('a&b') == 'a&b' diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 0512c963..c4e55962 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -173,8 +173,6 @@ class CrossrefImporter(EntityImporter): extra = None assert ctype in ("author", "editor", "translator") raw_name = clean(raw_name) - if not raw_name or len(raw_name) <= 1: - raw_name = None contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, @@ -193,13 +191,13 @@ class CrossrefImporter(EntityImporter): container_id = None if issnl: container_id = self.lookup_issnl(issnl) - publisher = obj.get('publisher') + publisher = clean(obj.get('publisher')) if (container_id is None and self.create_containers and (issnl is not None) and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, - publisher=clean(publisher), + publisher=publisher, container_type=self.map_container_type(release_type), name=clean(obj['container-title'][0], force_xml=True)) ce_edit = self.create_container(ce) @@ -247,7 +245,7 @@ class CrossrefImporter(EntityImporter): 'accessed_date', 'issued', 'page', 'medium', 'collection_title', 'chapter_number'): if clean(rm.get(k)): - extra[k] = clean(rm[k]) + ref_extra[k] = clean(rm[k]) if not ref_extra: ref_extra = None refs.append(fatcat_client.ReleaseRef( @@ -296,7 +294,10 @@ class CrossrefImporter(EntityImporter): extra_crossref['license'] = license_extra if len(obj['title']) > 1: - extra['aliases'] = [clean(t) for t in obj['title'][1:]] + aliases = [clean(t) for t in obj['title'][1:]] + aliases = [t for t in aliases if t] + if aliases: + extra['aliases'] = aliases # ISBN isbn13 = None @@ -343,13 +344,11 @@ class CrossrefImporter(EntityImporter): original_title = None if obj.get('original-title'): original_title = clean(obj.get('original-title')[0], force_xml=True) - if not original_title or len(original_title) < 2: - original_title = None title = None if obj.get('title'): title = clean(obj.get('title')[0], force_xml=True) - if not title or len(title) < 2: + if not title or len(title) <= 1: # title can't be just a single character return None @@ -367,7 +366,7 @@ class CrossrefImporter(EntityImporter): release_status=release_status, release_date=release_date, release_year=release_year, - publisher=clean(publisher), + publisher=publisher, doi=obj['DOI'].lower(), pmid=extids['pmid'], pmcid=extids['pmcid'], -- cgit v1.2.3