diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-23 15:33:44 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-23 15:33:44 -0800 |
commit | 5aeb5f79d83a2559671fed6d9afed2b0987139b4 (patch) | |
tree | 3a4cbec2e5307f8b84b15fb703dbc62547a31154 /python/fatcat_tools/importers/common.py | |
parent | 1cc4f517390d6cb09155746778a0ae566c9725c7 (diff) | |
download | fatcat-5aeb5f79d83a2559671fed6d9afed2b0987139b4.tar.gz fatcat-5aeb5f79d83a2559671fed6d9afed2b0987139b4.zip |
ftfy all over (needs Pipfile.lock)
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 31 |
1 files changed, 31 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2d5c89b3..1c99c7d7 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,6 +3,7 @@ import re
 import sys
 import csv
 import json
+import ftfy
 import itertools
 import subprocess
 from collections import Counter
@@ -12,6 +13,36 @@ import fatcat_client
 from fatcat_client.rest import ApiException
 
 
+def clean(thing, force_xml=False):
+    """
+    This function is appropriate to be called on any random, non-markup string,
+    such as author names, titles, etc.
+
+    It will try to clean up common unicode mangles, HTML characters, etc.
+
+    This will detect XML/HTML and "do the right thing" (aka, not remove
+    entities like '&amp;' if there are tags in the string), unless you pass the
+    'force_xml' parameter, which might be appropriate for, eg, names and
+    titles, which generally should be projected down to plain text.
+
+    Also strips extra whitespace.
+    """
+    if not thing:
+        return thing
+    fix_entities = 'auto'
+    if force_xml:
+        fix_entities = True
+    return ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+
+def test_clean():
+
+    assert clean(None) == None
+    assert clean('') == ''
+    assert clean('123') == '123'
+    assert clean('a&amp;b') == 'a&b'
+    assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+    assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
 class EntityImporter:
     """
     Base class for fatcat entity importers.