diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-07-21 17:05:05 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-07-21 17:05:05 -0700 | 
| commit | f9cc2c2d40c2083549c333064582183c96162e05 (patch) | |
| tree | 59348c62f7a2c4dec3d7b5864812959f6083b2b3 | |
| parent | 40f18e6d26cef6a53138a7a5f0c810d8bf58333d (diff) | |
| download | fatcat-scholar-f9cc2c2d40c2083549c333064582183c96162e05.tar.gz fatcat-scholar-f9cc2c2d40c2083549c333064582183c96162e05.zip | |
strip <em> tags explicitly
| -rw-r--r-- | fatcat_scholar/schema.py | 1 | 
1 file changed, 1 insertion, 0 deletions
| diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 19f148b..64e9268 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -227,6 +227,7 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:      text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text)      text = re.sub(r"[„“]|(\'\')|(,,)", '"', text)      text = re.sub(r"\s+", " ", text).strip() +    text = text.replace("<em>", "").replace("</em>", "")      # hack to remove abstract prefixes      for prefix in UNWANTED_ABSTRACT_PREFIXES: | 
