summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-02-01 10:03:15 -0800
committerBryan Newbold <bnewbold@archive.org>2021-02-01 10:03:15 -0800
commit07ff730c6288b9f9548a317027cd3d1f0bf9b22f (patch)
treeb7a0df1dc3c74d051cfe438b4d34584b18ca6d2c
parent1cc32fcc46f2c7ff7a05ca81c6925ce4ef92c03f (diff)
downloadfatcat-scholar-07ff730c6288b9f9548a317027cd3d1f0bf9b22f.tar.gz
fatcat-scholar-07ff730c6288b9f9548a317027cd3d1f0bf9b22f.zip
catch HTML parsing error from within html (via bs4)
-rw-r--r--fatcat_scholar/schema.py11
1 files changed, 9 insertions, 2 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 9b2d2fb..b93962c 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -380,8 +380,15 @@ def clean_str(raw: Optional[str], strip_trailing_period: bool = False) -> Option
text = ftfy.fix_text(raw)
- # remove HTML
- text = BeautifulSoup(text, "html.parser").get_text()
+ # remove HTML tags
+ try:
+ # TODO: work_h4ufpvlh3rcefacajni7sdndwa as a regression test
+ # TODO: consider w3lib "remove tags" as an alternative
+ clean_text = BeautifulSoup(text, "html.parser").get_text()
+ text = clean_text
+ except UnboundLocalError:
+ # TODO: passing through raw string; what should behavior actually be?
+ pass
# TODO: for performance, compile these as globals?
# replaces whitespace with single space