From 07ff730c6288b9f9548a317027cd3d1f0bf9b22f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 1 Feb 2021 10:03:15 -0800 Subject: catch HTML parsing error from within html (via bs4) --- fatcat_scholar/schema.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 9b2d2fb..b93962c 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -380,8 +380,15 @@ def clean_str(raw: Optional[str], strip_trailing_period: bool = False) -> Option text = ftfy.fix_text(raw) - # remove HTML - text = BeautifulSoup(text, "html.parser").get_text() + # remove HTML tags + try: + # TODO: work_h4ufpvlh3rcefacajni7sdndwa as a regression test + # TODO: consider w3clib "remove tags" as an alternative + clean_text = BeautifulSoup(text, "html.parser").get_text() + text = clean_text + except UnboundLocalError: + # TODO: passing through raw string; what should behavior actually be? + pass # TODO: for performance, compile these as globals? # replaces whitespace with single space -- cgit v1.2.3