summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/__init__.py2
-rw-r--r--python/fatcat_tools/importers/common.py31
-rw-r--r--python/fatcat_tools/importers/crossref.py41
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py23
-rw-r--r--python/fatcat_tools/importers/journal_metadata.py6
-rw-r--r--python/fatcat_tools/importers/matched.py2
-rw-r--r--python/fatcat_tools/importers/orcid.py8
7 files changed, 74 insertions, 39 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index b709f714..70f38f5b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
"""
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2d5c89b3..1c99c7d7 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,6 +3,7 @@ import re
import sys
import csv
import json
+import ftfy
import itertools
import subprocess
from collections import Counter
@@ -12,6 +13,36 @@ import fatcat_client
from fatcat_client.rest import ApiException
+def clean(thing, force_xml=False):
+ """
+ This function is appropriate to be called on any random, non-markup string,
+ such as author names, titles, etc.
+
+    It will try to clean up common unicode mangles, HTML characters, etc.
+
+ This will detect XML/HTML and "do the right thing" (aka, not remove
+    entities like '&amp;' if there are tags in the string), unless you pass the
+ 'force_xml' parameter, which might be appropriate for, eg, names and
+ titles, which generally should be projected down to plain text.
+
+ Also strips extra whitespace.
+ """
+ if not thing:
+ return thing
+ fix_entities = 'auto'
+ if force_xml:
+ fix_entities = True
+ return ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+
+def test_clean():
+
+ assert clean(None) == None
+ assert clean('') == ''
+ assert clean('123') == '123'
+ assert clean('a&b') == 'a&b'
+ assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+ assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
class EntityImporter:
"""
Base class for fatcat entity importers.
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 22abd08d..cbb6deb5 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -6,7 +6,7 @@ import datetime
import itertools
import subprocess
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
# The docs/guide should be the cannonical home for these mappings; update there
@@ -169,7 +169,7 @@ class CrossrefImporter(EntityImporter):
raw_affiliation = am.get('affiliation')[0]['name']
if len(am.get('affiliation')) > 1:
# note: affiliation => affiliations
- extra['affiliations'] = [a['name'] for a in am.get('affiliation')[1:]]
+ extra['affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
if am.get('sequence') and am.get('sequence') != "additional":
extra['sequence'] = am.get('sequence')
if not extra:
@@ -178,8 +178,8 @@ class CrossrefImporter(EntityImporter):
contribs.append(fatcat_client.ReleaseContrib(
creator_id=creator_id,
index=index,
- raw_name=raw_name,
- raw_affiliation=raw_affiliation,
+ raw_name=clean(raw_name),
+ raw_affiliation=clean(raw_affiliation),
role=ctype,
extra=extra))
return contribs
@@ -199,9 +199,9 @@ class CrossrefImporter(EntityImporter):
and obj.get('container-title') and len(obj['container-title']) > 0):
ce = fatcat_client.ContainerEntity(
issnl=issnl,
- publisher=publisher,
+ publisher=clean(publisher),
container_type=self.map_container_type(release_type),
- name=obj['container-title'][0])
+ name=clean(obj['container-title'][0], force_xml=True))
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
@@ -257,10 +257,10 @@ class CrossrefImporter(EntityImporter):
# doing lookups would be a second import pass
target_release_id=None,
key=key,
- year=year,
- container_name=container_name,
- title=ref_title,
- locator=ref_locator,
+ year=clean(year),
+ container_name=clean(container_name),
+ title=clean(ref_title),
+ locator=clean(ref_locator),
# TODO: just dump JSON somewhere here?
extra=extra))
@@ -269,7 +269,7 @@ class CrossrefImporter(EntityImporter):
if obj.get('abstract') != None:
abstracts.append(fatcat_client.ReleaseEntityAbstracts(
mimetype="application/xml+jats",
- content=obj.get('abstract')))
+ content=clean(obj.get('abstract'))))
# extra fields
extra = dict()
@@ -279,13 +279,16 @@ class CrossrefImporter(EntityImporter):
# TODO: unpack "container-title" array
val = obj.get(key)
if val:
- extra[key] = val
+ if type(val) == str:
+ extra[key] = clean(val)
+ else:
+ extra[key] = val
if 'license' in extra and extra['license']:
for i in range(len(extra['license'])):
if 'start' in extra['license'][i]:
extra['license'][i]['start'] = extra['license'][i]['start']['date-time']
if len(obj['title']) > 1:
- extra['other-titles'] = obj['title'][1:]
+ extra['other-titles'] = [clean(t) for t in obj['title'][1:]]
# TODO: this should be top-level
extra['is_kept'] = len(obj.get('archive', [])) > 0
@@ -329,13 +332,13 @@ class CrossrefImporter(EntityImporter):
re = fatcat_client.ReleaseEntity(
work_id=None,
container_id=container_id,
- title=obj.get('title', [None])[0],
- original_title=obj.get('original-title', [None])[0],
+ title=clean(obj.get('title', [None])[0], force_xml=True),
+ original_title=clean(obj.get('original-title', [None])[0]),
release_type=release_type,
release_status=release_status,
release_date=release_date,
release_year=release_year,
- publisher=publisher,
+ publisher=clean(publisher),
doi=obj['DOI'].lower(),
pmid=extids['pmid'],
pmcid=extids['pmcid'],
@@ -344,9 +347,9 @@ class CrossrefImporter(EntityImporter):
core_id=extids['core_id'],
arxiv_id=extids['arxiv_id'],
jstor_id=extids['jstor_id'],
- volume=obj.get('volume'),
- issue=obj.get('issue'),
- pages=obj.get('page'),
+ volume=clean(obj.get('volume')),
+ issue=clean(obj.get('issue')),
+ pages=clean(obj.get('page')),
language=None, # crossref doesn't supply language info
license_slug=license_slug,
extra=dict(crossref=extra),
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 4d3b41bc..468b0ede 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,7 +5,7 @@ import json
import base64
import datetime
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
MAX_ABSTRACT_BYTES=4096
@@ -82,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter):
abobj = dict(
mimetype="text/plain",
language=None,
- content=obj.get('abstract').strip())
+ content=clean(obj.get('abstract')))
abstracts = [abobj]
else:
abstracts = None
@@ -91,17 +91,18 @@ class GrobidMetadataImporter(EntityImporter):
for i, a in enumerate(obj.get('authors', [])):
contribs.append(fatcat_client.ReleaseContrib(
index=i,
- raw_name=a['name'],
+ raw_name=clean(a['name']),
role="author",
extra=None))
+ # XXX: why is this a dict()? not covered by tests?
refs = []
for raw in obj.get('citations', []):
cite_extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
+ ref['key'] = clean(raw.get('id'))
if raw.get('title'):
- ref['title'] = raw['title'].strip()
+ ref['title'] = clean(raw['title'])
if raw.get('date'):
try:
year = int(raw['date'].strip()[:4])
@@ -110,9 +111,9 @@ class GrobidMetadataImporter(EntityImporter):
pass
for key in ('volume', 'url', 'issue', 'publisher'):
if raw.get(key):
- cite_extra[key] = raw[key].strip()
+ cite_extra[key] = clean(raw[key])
if raw.get('authors'):
- cite_extra['authors'] = [a['name'] for a in raw['authors']]
+ cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
if cite_extra:
cite_extra = dict(grobid=cite_extra)
else:
@@ -141,15 +142,15 @@ class GrobidMetadataImporter(EntityImporter):
extra = None
re = fatcat_client.ReleaseEntity(
- title=obj['title'].strip(),
+ title=clean(obj['title'], force_xml=True),
release_type="article-journal",
release_date=release_date,
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=clean(obj['journal'].get('publisher')),
+ volume=clean(obj['journal'].get('volume')),
+ issue=clean(obj['journal'].get('issue')),
abstracts=abstracts,
extra=extra)
return re
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index ff38cd77..ccdb7ec6 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
def or_none(s):
@@ -72,8 +72,8 @@ class JournalMetadataImporter(EntityImporter):
)
ce = fatcat_client.ContainerEntity(
issnl=issnl,
- name=title,
- publisher=or_none(row['publisher']),
+ name=clean(title),
+ publisher=or_none(clean(row['publisher'])),
extra=extra)
return ce
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 2be15860..055f9c6a 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
import sqlite3
import itertools
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
class MatchedImporter(EntityImporter):
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 2c39db18..02c9bf00 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
def value_or_none(e):
if type(e) == dict:
@@ -63,9 +63,9 @@ class OrcidImporter(EntityImporter):
return None
ce = fatcat_client.CreatorEntity(
orcid=orcid,
- given_name=given,
- surname=sur,
- display_name=display,
+ given_name=clean(given),
+ surname=clean(sur),
+ display_name=clean(display),
extra=extra)
return ce