diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 19:30:15 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 19:32:50 -0700 | 
| commit | f9035c7ca9637668911afa7e9345138563aad33e (patch) | |
| tree | f6bd0f817190e315d9e8b0016ab1a7e0d5c73c7f | |
| parent | 9722f39e38a45d3201c836f0c2805ae9f6c1f581 (diff) | |
| download | fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.tar.gz fatcat-scholar-f9035c7ca9637668911afa7e9345138563aad33e.zip | |
improve text scrubbing
Was going to use textpipe, but the dependency was too large and failed to
install with a halfway-modern GCC (due to a CLD2 issue):
  https://github.com/GregBowyer/cld2-cffi/issues/12
So instead, basically pulled out the clean_text function, which is quite
short.
| -rw-r--r-- | fatcat_scholar/schema.py | 34 | ||||
| -rw-r--r-- | tests/test_scrub.py | 15 | 
2 files changed, 36 insertions, 13 deletions
| diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 55d61ca..10742fb 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -5,12 +5,14 @@ get serialization for free with those. This is useful for things like  auto-conversion of datetime objects.  """ -import ftfy +import re  import datetime  from enum import Enum +from typing import Optional, List, Any + +import ftfy  from pydantic import BaseModel  from bs4 import BeautifulSoup -from typing import Optional, List, Any  from fatcat_openapi_client import ReleaseEntity, ReleaseContrib  from fatcat_scholar.api_entities import entity_to_dict @@ -194,23 +196,29 @@ def scrub_text(raw: str, mimetype: str = None) -> str:      The output should be clean and "HTML safe" (though should still be escaped      in HTML to get entity encoding correct). -    TODO: barely implemented yet +    TODO: not using mimetype hint for latex yet      """ -    if "<jats" in raw or "/>" in raw or (mimetype and "application/xml" in mimetype): -        try: -            raw = BeautifulSoup(raw, "lxml").get_text() -        except Exception as e: -            raise e -    raw = ftfy.fix_text(raw) +    text = ftfy.fix_text(raw) + +    # remove HTML +    text = BeautifulSoup(text, 'html.parser').get_text() + +    # TODO: for performance, compile these as globals? 
+    # Three regexes below adapted from Blendle cleaner.py +    # https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29 +    text = re.sub(r'…', '...', text) +    text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text) +    text = re.sub(r'[„“]|(\'\')|(,,)', '"', text) +    text = re.sub(r'\s+', ' ', text).strip()      # hack to remove abstract prefixes      for prefix in UNWANTED_ABSTRACT_PREFIXES: -        if raw.startswith(prefix): -            raw = raw[len(prefix):] +        if text.startswith(prefix): +            text = text[len(prefix):]              break -    assert raw, "Empty abstract" -    return raw +    assert text, "Empty abstract" +    return text  def contrib_name(contrib: ReleaseContrib) -> str:      # TODO: support more cultural normals for name presentation diff --git a/tests/test_scrub.py b/tests/test_scrub.py new file mode 100644 index 0000000..6c357ae --- /dev/null +++ b/tests/test_scrub.py @@ -0,0 +1,15 @@ + +import pytest + +from fatcat_scholar.schema import * + + +def test_scrub(): +    vectors = [ +        ('“Please clean this piece… of text</b>„', '"Please clean this piece... of text"'), +        ("<jats:p>blah", "blah"), +    ] + +    for raw, fixed in vectors: +        assert fixed == scrub_text(raw) + | 
