From 5aeb5f79d83a2559671fed6d9afed2b0987139b4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 23 Jan 2019 15:33:44 -0800
Subject: ftfy all over (needs Pipfile.lock)

---
 python/fatcat_tools/importers/common.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'python/fatcat_tools/importers/common.py')

diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2d5c89b3..1c99c7d7 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,6 +3,7 @@ import re
 import sys
 import csv
 import json
+import ftfy
 import itertools
 import subprocess
 from collections import Counter
@@ -12,6 +13,36 @@ import fatcat_client
 from fatcat_client.rest import ApiException
 
 
+def clean(thing, force_xml=False):
+    """
+    This function is appropriate to be called on any random, non-markup string,
+    such as author names, titles, etc.
+
+    It will try to clean up common unicode mangles, HTML characters, etc.
+
+    This will detect XML/HTML and "do the right thing" (aka, not remove
+    entities like '&amp;' if there are tags in the string), unless you pass the
+    'force_xml' parameter, which might be appropriate for, eg, names and
+    titles, which generally should be projected down to plain text.
+
+    Also strips extra whitespace.
+    """
+    if not thing:
+        return thing
+    fix_entities = 'auto'
+    if force_xml:
+        fix_entities = True
+    return ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+
+def test_clean():
+
+    assert clean(None) == None
+    assert clean('') == ''
+    assert clean('123') == '123'
+    assert clean('a&amp;b') == 'a&b'
+    assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+    assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
 class EntityImporter:
     """
     Base class for fatcat entity importers.
-- 
cgit v1.2.3