summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-07-21 17:05:05 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-07-21 17:05:05 -0700
commit f9cc2c2d40c2083549c333064582183c96162e05 (patch)
tree 59348c62f7a2c4dec3d7b5864812959f6083b2b3
parent 40f18e6d26cef6a53138a7a5f0c810d8bf58333d (diff)
download fatcat-scholar-f9cc2c2d40c2083549c333064582183c96162e05.tar.gz
fatcat-scholar-f9cc2c2d40c2083549c333064582183c96162e05.zip
strip <em> tags explicitly
-rw-r--r-- fatcat_scholar/schema.py 1
1 files changed, 1 insertions, 0 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 19f148b..64e9268 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -227,6 +227,7 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text)
text = re.sub(r"[„“]|(\'\')|(,,)", '"', text)
text = re.sub(r"\s+", " ", text).strip()
+ text = text.replace("<em>", "").replace("</em>", "")
# hack to remove abstract prefixes
for prefix in UNWANTED_ABSTRACT_PREFIXES: