improve text scrubbing

Was going to use textpipe, but the dependency was too large and failed to install with a halfway-modern GCC (due to a CLD2 issue: https://github.com/GregBowyer/cld2-cffi/issues/12). So instead I basically pulled out its clean_text function, which is quite short.
author: Bryan Newbold <bnewbold@archive.org> 2020-06-03 19:30:15 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-03 19:32:50 -0700
commit: f9035c7ca9637668911afa7e9345138563aad33e (patch)
tree: f6bd0f817190e315d9e8b0016ab1a7e0d5c73c7f
parent: 9722f39e38a45d3201c836f0c2805ae9f6c1f581 (diff)
download: fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.tar.gz
fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.zip
2 files changed, 36 insertions, 13 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 55d61ca..10742fb 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -5,12 +5,14 @@ get serialization for free with those. This is useful for things like
 auto-conversion of datetime objects.
 """
 
-import ftfy
+import re
 import datetime
 from enum import Enum
+from typing import Optional, List, Any
+
+import ftfy
 from pydantic import BaseModel
 from bs4 import BeautifulSoup
-from typing import Optional, List, Any
 
 from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
 from fatcat_scholar.api_entities import entity_to_dict
@@ -194,23 +196,29 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
     The output should be clean and "HTML safe" (though should still be escaped
     in HTML to get entity encoding correct).
 
-    TODO: barely implemented yet
+    TODO: not using mimetype hint for latex yet
     """
-    if "<jats" in raw or "/>" in raw or (mimetype and "application/xml" in mimetype):
-        try:
-            raw = BeautifulSoup(raw, "lxml").get_text()
-        except Exception as e:
-            raise e
-    raw = ftfy.fix_text(raw)
+    text = ftfy.fix_text(raw)
+
+    # remove HTML
+    text = BeautifulSoup(text, 'html.parser').get_text()
+
+    # TODO: for performance, compile these as globals?
+    # Three regexes below adapted from Blendle cleaner.py
+    # https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29
+    text = re.sub(r'…', '...', text)
+    text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text)
+    text = re.sub(r'[„“]|(\'\')|(,,)', '"', text)
+    text = re.sub(r'\s+', ' ', text).strip()
 
     # hack to remove abstract prefixes
     for prefix in UNWANTED_ABSTRACT_PREFIXES:
-        if raw.startswith(prefix):
-            raw = raw[len(prefix):]
+        if text.startswith(prefix):
+            text = text[len(prefix):]
             break
 
-    assert raw, "Empty abstract"
-    return raw
+    assert text, "Empty abstract"
+    return text
 
 def contrib_name(contrib: ReleaseContrib) -> str:
     # TODO: support more cultural normals for name presentation
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
new file mode 100644
index 0000000..6c357ae
--- /dev/null
+++ b/tests/test_scrub.py
@@ -0,0 +1,15 @@
+
+import pytest
+
+from fatcat_scholar.schema import *
+
+
+def test_scrub():
+    vectors = [
+        ('“Please clean this piece… of text</b>„', '"Please clean this piece... of text"'),
+        ("<jats:p>blah", "blah"),
+    ]
+
+    for raw, fixed in vectors:
+        assert fixed == scrub_text(raw)
+
author	Bryan Newbold <bnewbold@archive.org>	2020-06-03 19:30:15 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-06-03 19:32:50 -0700
commit	f9035c7ca9637668911afa7e9345138563aad33e (patch)
tree	f6bd0f817190e315d9e8b0016ab1a7e0d5c73c7f
parent	9722f39e38a45d3201c836f0c2805ae9f6c1f581 (diff)
download	fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.tar.gz fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.zip