diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 20:00:10 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 20:00:10 -0700 |
commit | d5693718e45eb3a635b6cd47559d6e807ae78c21 (patch) | |
tree | 95989ca3fee27382628fd34d2b1c9b6de8961b35 | |
parent | 6330b8184b2c6cf7dd08144873759a56e705928c (diff) | |
download | fatcat-scholar-d5693718e45eb3a635b6cd47559d6e807ae78c21.tar.gz fatcat-scholar-d5693718e45eb3a635b6cd47559d6e807ae78c21.zip |
use beautiful soup for XML scrubbing
-rw-r--r-- | fatcat_scholar/schema.py | 13 |
1 files changed, 6 insertions, 7 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index c987a17..a11da8d 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -8,9 +8,9 @@ auto-conversion of datetime objects. import ftfy import datetime from enum import Enum -from typing import Optional, List, Any -from xml.etree import cElementTree as ET from pydantic import BaseModel +from bs4 import BeautifulSoup +from typing import Optional, List, Any from fatcat_openapi_client import ReleaseEntity, ReleaseContrib from fatcat_scholar.api_entities import entity_to_dict @@ -185,12 +185,11 @@ def scrub_text(raw: str, mimetype: str = None) -> str: TODO: barely implemented yet """ - if "<jats" in raw or (mimetype and "application/xml" in mimetype): + if "<jats" in raw or "/>" in raw or (mimetype and "application/xml" in mimetype): try: - root = ET.fromstring(raw) - raw = " ".join(list(root.itertext())) or "" - except: - pass + raw = BeautifulSoup(raw, "lxml").get_text() + except Exception as e: + raise e raw = ftfy.fix_text(raw) assert raw, "Empty abstract" return raw |