aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-08-12 12:27:34 -0700
committerBryan Newbold <bnewbold@archive.org>2020-08-12 12:27:34 -0700
commit3260b880c0003b7d55bd026b9f1d259256648b0e (patch)
tree5faed3e7a12c7f065dc86cdeb71232f7a12a02dc
parent02c93dd1b58d9ec26a6835ebd5e0ebbffb4a28cb (diff)
downloadfatcat-scholar-3260b880c0003b7d55bd026b9f1d259256648b0e.tar.gz
fatcat-scholar-3260b880c0003b7d55bd026b9f1d259256648b0e.zip
transform: more string cleaning
-rw-r--r--fatcat_scholar/schema.py71
-rw-r--r--tests/test_scrub.py20
2 files changed, 78 insertions, 13 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index e369d11..5a5b339 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -224,8 +224,48 @@ UNWANTED_ABSTRACT_PREFIXES = [
"No abstract.",
"Introduction: ",
"ACKNOWLEDGEMENTS ",
+ "a b s t r a c t ",
]
+UNWANTED_SHORT_STRINGS = [
+ "&na",
+ "n/a",
+]
+
+
+def clean_str(raw: Optional[str], strip_trailing_period: bool = False) -> Optional[str]:
+ """
+ Takes a str and "cleans" it. Intended to be usable with short strings
+ (names, titles) in any language. See scrub_text(), which extends this
+ function for paragraph length and longer text fields.
+ """
+ if not raw:
+ return None
+
+ text = ftfy.fix_text(raw)
+
+ # remove HTML
+ text = BeautifulSoup(text, "html.parser").get_text()
+
+ # TODO: for performance, compile these as globals?
+ # replaces whitespace with single space
+ text = re.sub(r"\s+", " ", text).strip()
+
+ # TODO: shouldn't HTML be parsing these out?
+ text = text.replace("<em>", "").replace("</em>", "")
+
+ text = text.strip()
+
+ if strip_trailing_period and text.endswith("."):
+ text = text[:-1]
+
+ if text.lower() in UNWANTED_SHORT_STRINGS:
+ return None
+
+ if not text:
+ return None
+ return text
+
def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
"""
@@ -233,15 +273,17 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
simple token-and-punctuation scheme with any and all markup removed. Eg,
HTML tags, JATS XML tags, LaTeX, whatever.
+ Like clean_str(), but more aggressive about some punctuation, and intended
+ for text fields (like abstracts), not just short strings.
+
The output should be clean and "HTML safe" (though should still be escaped
in HTML to get entity encoding correct).
TODO: not using mimetype hint for latex yet
"""
- text = ftfy.fix_text(raw)
-
- # remove HTML
- text = BeautifulSoup(text, "html.parser").get_text()
+ text = clean_str(raw)
+ if not text:
+ return None
# TODO: for performance, compile these as globals?
# Three regexes below adapted from Blendle cleaner.py
@@ -249,8 +291,6 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
text = re.sub(r"…", "...", text)
text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text)
text = re.sub(r"[„“]|(\'\')|(,,)", '"', text)
- text = re.sub(r"\s+", " ", text).strip()
- text = text.replace("<em>", "").replace("</em>", "")
# hack to remove abstract prefixes
for prefix in UNWANTED_ABSTRACT_PREFIXES:
@@ -342,9 +382,9 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
ret = ScholarBiblio(
release_ident=release.ident,
- title=release.title,
- subtitle=release.subtitle,
- original_title=release.original_title,
+ title=clean_str(release.title, strip_trailing_period=True),
+ subtitle=clean_str(release.subtitle, strip_trailing_period=True),
+ original_title=clean_str(release.original_title, strip_trailing_period=True),
release_date=release.release_date,
release_year=release.release_year,
release_type=release.release_type,
@@ -372,7 +412,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
mag_id=release.ext_ids.mag,
license_slug=release.license_slug,
publisher=publisher,
- container_name=container_name,
+ container_name=clean_str(container_name),
container_original_name=container_original_name,
container_ident=container_ident,
container_type=container_type,
@@ -380,13 +420,20 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
issns=issns,
# TODO; these filters sort of meh. refactor to be above?
contrib_names=list(
- filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs])
+ filter(
+ lambda x: bool(x),
+ [clean_str(contrib_name(c)) for c in release.contribs],
+ )
),
contrib_count=len([c for c in release.contribs if c.index]),
affiliations=list(
filter(
lambda x: bool(x),
- [contrib_affiliation(c) for c in release.contribs if c.index],
+ [
+ clean_str(contrib_affiliation(c))
+ for c in release.contribs
+ if c.index
+ ],
)
),
)
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index 6541e13..b142c10 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -1,4 +1,4 @@
-from fatcat_scholar.schema import scrub_text
+from fatcat_scholar.schema import scrub_text, clean_str
def test_scrub() -> None:
@@ -12,3 +12,21 @@ def test_scrub() -> None:
for raw, fixed in vectors:
assert fixed == scrub_text(raw)
+
+
+def test_clean_str() -> None:
+ vectors = [
+ (
+ "Di� Hekimli�i Fak�ltesi ��rencilerinde Temporomandibular Eklem Rahats�zl�klar�n�n ve A��z Sa�l��� Al��kanl�klar�n�n De�erlendirilmesi",
+ "Di� Hekimli�i Fak�ltesi ��rencilerinde Temporomandibular Eklem Rahats�zl�klar�n�n ve A��z Sa�l��� Al��kanl�klar�n�n De�erlendirilmesi",
+ ),
+ ("<jats:p>blah thing", "blah thing"),
+ ("title with <i>italics</i>", "title with italics"),
+ ("title with <sup>partial super", "title with partial super"),
+ ("", None),
+ ("&NA", None),
+ (None, None),
+ ]
+
+ for raw, fixed in vectors:
+ assert fixed == clean_str(raw)