aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/schema.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-08-06 15:35:09 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-08-06 19:01:12 -0700
commit 8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e (patch)
tree 24330104b1a85ffe6c3b7e8100bd3224d0f09855 /fatcat_scholar/schema.py
parent d3e4ef4fdf83703d9eda1e7ea7eed9d213f836be (diff)
download fatcat-scholar-8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e.tar.gz
fatcat-scholar-8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e.zip
scrub_text: single-token strings skipped
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- fatcat_scholar/schema.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 0494ed7..8b09ab3 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -239,6 +239,10 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
text = text[len(prefix) :]
break
+ # single word? not "text". eg, random URLs
+ if len(text.split()) <= 1:
+ return None
+
if not text:
return None
return text