diff options
-rw-r--r-- | fatcat_scholar/schema.py | 4 | ||||
-rw-r--r-- | tests/test_scrub.py | 2 |
2 files changed, 5 insertions, 1 deletion
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 0494ed7..8b09ab3 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -239,6 +239,10 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]: text = text[len(prefix) :] break + # single word? not "text". eg, random URLs + if len(text.split()) <= 1: + return None + if not text: return None return text diff --git a/tests/test_scrub.py b/tests/test_scrub.py index fc6ef22..6541e13 100644 --- a/tests/test_scrub.py +++ b/tests/test_scrub.py @@ -7,7 +7,7 @@ def test_scrub() -> None: "“Please clean this piece… of text</b>„", '"Please clean this piece... of text"', ), - ("<jats:p>blah", "blah"), + ("<jats:p>blah thing", "blah thing"), ] for raw, fixed in vectors: |