diff options
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 31 |
1 files changed, 31 insertions, 0 deletions
# diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
# index 2d5c89b3..1c99c7d7 100644
# --- a/python/fatcat_tools/importers/common.py
# +++ b/python/fatcat_tools/importers/common.py

# @@ -3,6 +3,7 @@
import re
import sys
import csv
import json
import ftfy  # added by this diff
import itertools
import subprocess
from collections import Counter

# @@ -12,6 +13,36 @@
import fatcat_client
from fatcat_client.rest import ApiException


def clean(thing, force_xml=False):
    """
    Clean up a random, non-markup string, such as an author name or title.

    The value is passed through ``ftfy.fix_text`` to try to clean up common
    unicode mangles, HTML characters, etc., and then extra leading/trailing
    whitespace is stripped.

    This will detect XML/HTML and "do the right thing" (aka, not remove
    entities like '&amp;' if there are tags in the string), unless you pass
    the 'force_xml' parameter, which might be appropriate for, eg, names and
    titles, which generally should be projected down to plain text.

    Args:
        thing: the string to clean. Any falsy value (``None``, ``''``) is
            returned unchanged without being passed to ftfy.
        force_xml: when true, call ftfy with ``fix_entities=True`` instead of
            ``fix_entities='auto'``.

    Returns:
        The fixed, whitespace-stripped string, or ``thing`` itself if falsy.
    """
    if not thing:
        # Nothing to clean; also keeps None from reaching ftfy.
        return thing
    fix_entities = True if force_xml else 'auto'
    return ftfy.fix_text(thing, fix_entities=fix_entities).strip()


def test_clean():
    """Check falsy pass-through and entity handling of clean()."""
    assert clean(None) is None
    assert clean('') == ''
    assert clean('123') == '123'
    # NOTE(review): the scraped diff showed a bare '&' in the inputs below,
    # which made these assertions trivial and the last two identical. The
    # '&amp;' entities are restored to match the behaviour described in
    # clean()'s docstring -- confirm against the upstream commit.
    assert clean('a&amp;b') == 'a&b'
    assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
    assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'


# Trailing diff context (truncated in this view):
# class EntityImporter:
#     """
#     Base class for fatcat entity importers.