From 8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 6 Aug 2020 15:35:09 -0700 Subject: scrub_text: single-token strings skipped --- fatcat_scholar/schema.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fatcat_scholar/schema.py') diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 0494ed7..8b09ab3 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -239,6 +239,10 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]: text = text[len(prefix) :] break + # single word? not "text". eg, random URLs + if len(text.split()) <= 1: + return None + if not text: return None return text -- cgit v1.2.3