From d5693718e45eb3a635b6cd47559d6e807ae78c21 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 May 2020 20:00:10 -0700 Subject: use beautiful soup for XML scrubbing --- fatcat_scholar/schema.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fatcat_scholar/schema.py') diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index c987a17..a11da8d 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -8,9 +8,9 @@ auto-conversion of datetime objects. import ftfy import datetime from enum import Enum -from typing import Optional, List, Any -from xml.etree import cElementTree as ET from pydantic import BaseModel +from bs4 import BeautifulSoup +from typing import Optional, List, Any from fatcat_openapi_client import ReleaseEntity, ReleaseContrib from fatcat_scholar.api_entities import entity_to_dict @@ -185,12 +185,11 @@ def scrub_text(raw: str, mimetype: str = None) -> str: TODO: barely implemented yet """ - if "" in raw or (mimetype and "application/xml" in mimetype): try: - root = ET.fromstring(raw) - raw = " ".join(list(root.itertext())) or "" - except: - pass + raw = BeautifulSoup(raw, "lxml").get_text() + except Exception as e: + raise e raw = ftfy.fix_text(raw) assert raw, "Empty abstract" return raw -- cgit v1.2.3