aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-03 19:30:15 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-03 19:32:50 -0700
commitf9035c7ca9637668911afa7e9345138563aad33e (patch)
treef6bd0f817190e315d9e8b0016ab1a7e0d5c73c7f
parent9722f39e38a45d3201c836f0c2805ae9f6c1f581 (diff)
downloadfatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.tar.gz
fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.zip
improve text scrubbing
Was going to use textpipe, but the dependency was too large and failed to install with a halfway-modern GCC (due to a CLD2 issue: https://github.com/GregBowyer/cld2-cffi/issues/12). So instead, basically pulled out the clean_text function, which is quite short.
-rw-r--r-- fatcat_scholar/schema.py 34
-rw-r--r-- tests/test_scrub.py 15
2 files changed, 36 insertions, 13 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 55d61ca..10742fb 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -5,12 +5,14 @@ get serialization for free with those. This is useful for things like
auto-conversion of datetime objects.
"""
-import ftfy
+import re
import datetime
from enum import Enum
+from typing import Optional, List, Any
+
+import ftfy
from pydantic import BaseModel
from bs4 import BeautifulSoup
-from typing import Optional, List, Any
from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
from fatcat_scholar.api_entities import entity_to_dict
@@ -194,23 +196,29 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
The output should be clean and "HTML safe" (though should still be escaped
in HTML to get entity encoding correct).
- TODO: barely implemented yet
+ TODO: not using mimetype hint for latex yet
"""
- if "<jats" in raw or "/>" in raw or (mimetype and "application/xml" in mimetype):
- try:
- raw = BeautifulSoup(raw, "lxml").get_text()
- except Exception as e:
- raise e
- raw = ftfy.fix_text(raw)
+ text = ftfy.fix_text(raw)
+
+ # remove HTML
+ text = BeautifulSoup(text, 'html.parser').get_text()
+
+ # TODO: for performance, compile these as globals?
+ # Three regexes below adapted from Blendle cleaner.py
+ # https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29
+ text = re.sub(r'…', '...', text)
+ text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text)
+ text = re.sub(r'[„“]|(\'\')|(,,)', '"', text)
+ text = re.sub(r'\s+', ' ', text).strip()
# hack to remove abstract prefixes
for prefix in UNWANTED_ABSTRACT_PREFIXES:
- if raw.startswith(prefix):
- raw = raw[len(prefix):]
+ if text.startswith(prefix):
+ text = text[len(prefix):]
break
- assert raw, "Empty abstract"
- return raw
+ assert text, "Empty abstract"
+ return text
def contrib_name(contrib: ReleaseContrib) -> str:
# TODO: support more cultural normals for name presentation
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
new file mode 100644
index 0000000..6c357ae
--- /dev/null
+++ b/tests/test_scrub.py
@@ -0,0 +1,15 @@
+
+import pytest
+
+from fatcat_scholar.schema import *
+
+
+def test_scrub():
+ vectors = [
+ ('“Please clean this piece… of text</b>„', '"Please clean this piece... of text"'),
+ ("<jats:p>blah", "blah"),
+ ]
+
+ for raw, fixed in vectors:
+ assert fixed == scrub_text(raw)
+