summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-01-29 15:56:34 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-01-29 15:56:34 -0800
commit: 22a277c80ecfe28ce21b7ce215ee38f25a702658 (patch)
tree: 078018a3c20fa9254e33ac14ccd40ce3ef5dbc5e /python/fatcat_tools/importers
parent: 0720b0c77088e8402a0519da9de655576c74641b (diff)
download: fatcat-22a277c80ecfe28ce21b7ce215ee38f25a702658.tar.gz
fatcat-22a277c80ecfe28ce21b7ce215ee38f25a702658.zip
fix bug in clean() resulting in many consistency check fails
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/common.py | 5
-rw-r--r-- python/fatcat_tools/importers/crossref.py | 19
2 files changed, 12 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 7c2ce400..32f7b4d5 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -28,7 +28,7 @@ def clean(thing, force_xml=False):
Also strips extra whitespace.
"""
if not thing:
- return thing
+ return None
fix_entities = 'auto'
if force_xml:
fix_entities = True
@@ -41,7 +41,8 @@ def clean(thing, force_xml=False):
def test_clean():
assert clean(None) == None
- assert clean('') == ''
+ assert clean('') == None
+ assert clean('1') == None
assert clean('123') == '123'
assert clean('a&amp;b') == 'a&b'
assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 0512c963..c4e55962 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -173,8 +173,6 @@ class CrossrefImporter(EntityImporter):
extra = None
assert ctype in ("author", "editor", "translator")
raw_name = clean(raw_name)
- if not raw_name or len(raw_name) <= 1:
- raw_name = None
contribs.append(fatcat_client.ReleaseContrib(
creator_id=creator_id,
index=index,
@@ -193,13 +191,13 @@ class CrossrefImporter(EntityImporter):
container_id = None
if issnl:
container_id = self.lookup_issnl(issnl)
- publisher = obj.get('publisher')
+ publisher = clean(obj.get('publisher'))
if (container_id is None and self.create_containers and (issnl is not None)
and obj.get('container-title') and len(obj['container-title']) > 0):
ce = fatcat_client.ContainerEntity(
issnl=issnl,
- publisher=clean(publisher),
+ publisher=publisher,
container_type=self.map_container_type(release_type),
name=clean(obj['container-title'][0], force_xml=True))
ce_edit = self.create_container(ce)
@@ -247,7 +245,7 @@ class CrossrefImporter(EntityImporter):
'accessed_date', 'issued', 'page', 'medium',
'collection_title', 'chapter_number'):
if clean(rm.get(k)):
- extra[k] = clean(rm[k])
+ ref_extra[k] = clean(rm[k])
if not ref_extra:
ref_extra = None
refs.append(fatcat_client.ReleaseRef(
@@ -296,7 +294,10 @@ class CrossrefImporter(EntityImporter):
extra_crossref['license'] = license_extra
if len(obj['title']) > 1:
- extra['aliases'] = [clean(t) for t in obj['title'][1:]]
+ aliases = [clean(t) for t in obj['title'][1:]]
+ aliases = [t for t in aliases if t]
+ if aliases:
+ extra['aliases'] = aliases
# ISBN
isbn13 = None
@@ -343,13 +344,11 @@ class CrossrefImporter(EntityImporter):
original_title = None
if obj.get('original-title'):
original_title = clean(obj.get('original-title')[0], force_xml=True)
- if not original_title or len(original_title) < 2:
- original_title = None
title = None
if obj.get('title'):
title = clean(obj.get('title')[0], force_xml=True)
- if not title or len(title) < 2:
+ if not title or len(title) <= 1:
# title can't be just a single character
return None
@@ -367,7 +366,7 @@ class CrossrefImporter(EntityImporter):
release_status=release_status,
release_date=release_date,
release_year=release_year,
- publisher=clean(publisher),
+ publisher=publisher,
doi=obj['DOI'].lower(),
pmid=extids['pmid'],
pmcid=extids['pmcid'],