summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/common.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-01-23 15:33:44 -0800
committerBryan Newbold <bnewbold@robocracy.org>2019-01-23 15:33:44 -0800
commit5aeb5f79d83a2559671fed6d9afed2b0987139b4 (patch)
tree3a4cbec2e5307f8b84b15fb703dbc62547a31154 /python/fatcat_tools/importers/common.py
parent1cc4f517390d6cb09155746778a0ae566c9725c7 (diff)
downloadfatcat-5aeb5f79d83a2559671fed6d9afed2b0987139b4.tar.gz
fatcat-5aeb5f79d83a2559671fed6d9afed2b0987139b4.zip
ftfy all over (needs Pipfile.lock)
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r--python/fatcat_tools/importers/common.py31
1 files changed, 31 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2d5c89b3..1c99c7d7 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,6 +3,7 @@ import re
import sys
import csv
import json
+import ftfy
import itertools
import subprocess
from collections import Counter
@@ -12,6 +13,36 @@ import fatcat_client
from fatcat_client.rest import ApiException
+def clean(thing, force_xml=False):
+ """
+ This function is appropriate to be called on any random, non-markup string,
+ such as author names, titles, etc.
+
+ It will try to clean up common unicode mangles, HTML characters, etc.
+
+ This will detect XML/HTML and "do the right thing" (aka, not remove
+ entities like '&amp;' if there are tags in the string), unless you pass the
+ 'force_xml' parameter, which might be appropriate for, eg, names and
+ titles, which generally should be projected down to plain text.
+
+ Also strips leading/trailing whitespace. Falsy input (eg, None or '') is
+ returned unchanged.
+ """
+ if not thing:
+ return thing
+ fix_entities = 'auto'  # default: leave the entity decision to ftfy
+ if force_xml:
+ fix_entities = True  # caller asked for entities to be fixed regardless
+ return ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+
+def test_clean():
+
+ assert clean(None) == None
+ assert clean('') == ''
+ assert clean('123') == '123'
+ assert clean('a&amp;b') == 'a&b'
+ assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+ assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
class EntityImporter:
"""
Base class for fatcat entity importers.