diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 19:30:15 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 19:32:50 -0700 |
commit | f9035c7ca9637668911afa7e9345138563aad33e (patch) | |
tree | f6bd0f817190e315d9e8b0016ab1a7e0d5c73c7f | |
parent | 9722f39e38a45d3201c836f0c2805ae9f6c1f581 (diff) | |
download | fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.tar.gz fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.zip |
improve text scrubbing
Was going to use textpipe, but the dependency was too large and failed to
install with a halfway-modern GCC (due to a CLD2 issue):
https://github.com/GregBowyer/cld2-cffi/issues/12
So instead I basically pulled out just the clean_text function, which is
quite short.
-rw-r--r-- | fatcat_scholar/schema.py | 34 | ||||
-rw-r--r-- | tests/test_scrub.py | 15 |
2 files changed, 36 insertions, 13 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 55d61ca..10742fb 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -5,12 +5,14 @@ get serialization for free with those. This is useful for things like auto-conversion of datetime objects. """ -import ftfy +import re import datetime from enum import Enum +from typing import Optional, List, Any + +import ftfy from pydantic import BaseModel from bs4 import BeautifulSoup -from typing import Optional, List, Any from fatcat_openapi_client import ReleaseEntity, ReleaseContrib from fatcat_scholar.api_entities import entity_to_dict @@ -194,23 +196,29 @@ def scrub_text(raw: str, mimetype: str = None) -> str: The output should be clean and "HTML safe" (though should still be escaped in HTML to get entity encoding correct). - TODO: barely implemented yet + TODO: not using mimetype hint for latex yet """ - if "<jats" in raw or "/>" in raw or (mimetype and "application/xml" in mimetype): - try: - raw = BeautifulSoup(raw, "lxml").get_text() - except Exception as e: - raise e - raw = ftfy.fix_text(raw) + text = ftfy.fix_text(raw) + + # remove HTML + text = BeautifulSoup(text, 'html.parser').get_text() + + # TODO: for performance, compile these as globals? 
+ # Three regexes below adapted from Blendle cleaner.py + # https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29 + text = re.sub(r'…', '...', text) + text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text) + text = re.sub(r'[„“]|(\'\')|(,,)', '"', text) + text = re.sub(r'\s+', ' ', text).strip() # hack to remove abstract prefixes for prefix in UNWANTED_ABSTRACT_PREFIXES: - if raw.startswith(prefix): - raw = raw[len(prefix):] + if text.startswith(prefix): + text = text[len(prefix):] break - assert raw, "Empty abstract" - return raw + assert text, "Empty abstract" + return text def contrib_name(contrib: ReleaseContrib) -> str: # TODO: support more cultural normals for name presentation diff --git a/tests/test_scrub.py b/tests/test_scrub.py new file mode 100644 index 0000000..6c357ae --- /dev/null +++ b/tests/test_scrub.py @@ -0,0 +1,15 @@ + +import pytest + +from fatcat_scholar.schema import * + + +def test_scrub(): + vectors = [ + ('“Please clean this piece… of text</b>„', '"Please clean this piece... of text"'), + ("<jats:p>blah", "blah"), + ] + + for raw, fixed in vectors: + assert fixed == scrub_text(raw) + |