diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-08-06 15:35:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-06 19:01:12 -0700 |
commit | 8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e (patch) | |
tree | 24330104b1a85ffe6c3b7e8100bd3224d0f09855 | |
parent | d3e4ef4fdf83703d9eda1e7ea7eed9d213f836be (diff) | |
download | fatcat-scholar-8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e.tar.gz fatcat-scholar-8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e.zip |
scrub_text: single-token strings skipped
-rw-r--r-- | fatcat_scholar/schema.py | 4 | ||||
-rw-r--r-- | tests/test_scrub.py | 2 |
2 files changed, 5 insertions, 1 deletion
```diff
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 0494ed7..8b09ab3 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -239,6 +239,10 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
             text = text[len(prefix) :]
             break
 
+    # single word? not "text". eg, random URLs
+    if len(text.split()) <= 1:
+        return None
+
     if not text:
         return None
     return text
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index fc6ef22..6541e13 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -7,7 +7,7 @@ def test_scrub() -> None:
             "“Please clean this piece… of text</b>„",
             '"Please clean this piece... of text"',
         ),
-        ("<jats:p>blah", "blah"),
+        ("<jats:p>blah thing", "blah thing"),
     ]
 
     for raw, fixed in vectors:
```