diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-07-21 17:05:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-07-21 17:05:05 -0700 |
commit | f9cc2c2d40c2083549c333064582183c96162e05 (patch) | |
tree | 59348c62f7a2c4dec3d7b5864812959f6083b2b3 /fatcat_scholar | |
parent | 40f18e6d26cef6a53138a7a5f0c810d8bf58333d (diff) | |
download | fatcat-scholar-f9cc2c2d40c2083549c333064582183c96162e05.tar.gz fatcat-scholar-f9cc2c2d40c2083549c333064582183c96162e05.zip |
strip <em> tags explicitly
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/schema.py | 1 |
1 file changed, 1 insertion, 0 deletions
```diff
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 19f148b..64e9268 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -227,6 +227,7 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
     text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text)
     text = re.sub(r"[„“]|(\'\')|(,,)", '"', text)
     text = re.sub(r"\s+", " ", text).strip()
+    text = text.replace("<em>", "").replace("</em>", "")
 
     # hack to remove abstract prefixes
     for prefix in UNWANTED_ABSTRACT_PREFIXES:
```