summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- fatcat_scholar/schema.py 4
-rw-r--r-- tests/test_scrub.py 2
2 files changed, 5 insertions, 1 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 0494ed7..8b09ab3 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -239,6 +239,10 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
text = text[len(prefix) :]
break
+ # single word? not "text". eg, random URLs
+ if len(text.split()) <= 1:
+ return None
+
if not text:
return None
return text
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index fc6ef22..6541e13 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -7,7 +7,7 @@ def test_scrub() -> None:
"“Please clean this piece… of text</b>„",
'"Please clean this piece... of text"',
),
- ("<jats:p>blah", "blah"),
+ ("<jats:p>blah thing", "blah thing"),
]
for raw, fixed in vectors: