diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-08-06 15:35:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-06 19:01:12 -0700 |
commit | 8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e (patch) | |
tree | 24330104b1a85ffe6c3b7e8100bd3224d0f09855 /fatcat_scholar/schema.py | |
parent | d3e4ef4fdf83703d9eda1e7ea7eed9d213f836be (diff) | |
download | fatcat-scholar-8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e.tar.gz fatcat-scholar-8b51716fa6bb66f1ac3cf75ce5b64b4138e5935e.zip |
scrub_text: single-token strings skipped
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- | fatcat_scholar/schema.py | 4 |
1 files changed, 4 insertions, 0 deletions
```diff
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 0494ed7..8b09ab3 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -239,6 +239,10 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
             text = text[len(prefix) :]
             break
 
+    # single word? not "text". eg, random URLs
+    if len(text.split()) <= 1:
+        return None
+
     if not text:
         return None
     return text
```