aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-21 20:00:10 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-21 20:00:10 -0700
commitd5693718e45eb3a635b6cd47559d6e807ae78c21 (patch)
tree95989ca3fee27382628fd34d2b1c9b6de8961b35
parent6330b8184b2c6cf7dd08144873759a56e705928c (diff)
downloadfatcat-scholar-d5693718e45eb3a635b6cd47559d6e807ae78c21.tar.gz
fatcat-scholar-d5693718e45eb3a635b6cd47559d6e807ae78c21.zip
use Beautiful Soup for XML scrubbing
-rw-r--r--fatcat_scholar/schema.py13
1 files changed, 6 insertions, 7 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index c987a17..a11da8d 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -8,9 +8,9 @@ auto-conversion of datetime objects.
import ftfy
import datetime
from enum import Enum
-from typing import Optional, List, Any
-from xml.etree import cElementTree as ET
from pydantic import BaseModel
+from bs4 import BeautifulSoup
+from typing import Optional, List, Any
from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
from fatcat_scholar.api_entities import entity_to_dict
@@ -185,12 +185,11 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
TODO: barely implemented yet
"""
- if "<jats" in raw or (mimetype and "application/xml" in mimetype):
+ if "<jats" in raw or "/>" in raw or (mimetype and "application/xml" in mimetype):
try:
- root = ET.fromstring(raw)
- raw = " ".join(list(root.itertext())) or ""
- except:
- pass
+ raw = BeautifulSoup(raw, "lxml").get_text()
+ except Exception as e:
+ raise e
raw = ftfy.fix_text(raw)
assert raw, "Empty abstract"
return raw