diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-08-12 12:27:34 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-12 12:27:34 -0700 |
commit | 3260b880c0003b7d55bd026b9f1d259256648b0e (patch) | |
tree | 5faed3e7a12c7f065dc86cdeb71232f7a12a02dc | |
parent | 02c93dd1b58d9ec26a6835ebd5e0ebbffb4a28cb (diff) | |
download | fatcat-scholar-3260b880c0003b7d55bd026b9f1d259256648b0e.tar.gz fatcat-scholar-3260b880c0003b7d55bd026b9f1d259256648b0e.zip |
transform: more string cleaning
-rw-r--r-- | fatcat_scholar/schema.py | 71 | ||||
-rw-r--r-- | tests/test_scrub.py | 20 |
2 files changed, 78 insertions, 13 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index e369d11..5a5b339 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -224,8 +224,48 @@ UNWANTED_ABSTRACT_PREFIXES = [ "No abstract.", "Introduction: ", "ACKNOWLEDGEMENTS ", + "a b s t r a c t ", ] +UNWANTED_SHORT_STRINGS = [ + "&na", + "n/a", +] + + +def clean_str(raw: Optional[str], strip_trailing_period: bool = False) -> Optional[str]: + """ + Takes a str and "cleans" it. Intended to be usable with short strings + (names, titles) in any language. See scrub_text(), which extends this + function for paragraph length and longer text fields. + """ + if not raw: + return None + + text = ftfy.fix_text(raw) + + # remove HTML + text = BeautifulSoup(text, "html.parser").get_text() + + # TODO: for performance, compile these as globals? + # replaces whitespace with single space + text = re.sub(r"\s+", " ", text).strip() + + # TODO: shouldn't HTML be parsing these out? + text = text.replace("<em>", "").replace("</em>", "") + + text = text.strip() + + if strip_trailing_period and text.endswith("."): + text = text[:-1] + + if text.lower() in UNWANTED_SHORT_STRINGS: + return None + + if not text: + return None + return text + def scrub_text(raw: str, mimetype: str = None) -> Optional[str]: """ @@ -233,15 +273,17 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]: simple token-and-punctuation scheme with any and all markup removed. Eg, HTML tags, JATS XML tags, LaTeX, whatever. + Like clean_str(), but more aggressive about some punctuation, and intended + for text fields (like abstracts), not just short strings. + The output should be clean and "HTML safe" (though should still be escaped in HTML to get entity encoding correct). TODO: not using mimetype hint for latex yet """ - text = ftfy.fix_text(raw) - - # remove HTML - text = BeautifulSoup(text, "html.parser").get_text() + text = clean_str(raw) + if not text: + return None # TODO: for performance, compile these as globals? # Three regexes below adapted from Blendle cleaner.py @@ -249,8 +291,6 @@ def scrub_text(raw: str, mimetype: str = None) -> Optional[str]: text = re.sub(r"…", "...", text) text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text) text = re.sub(r"[„“]|(\'\')|(,,)", '"', text) - text = re.sub(r"\s+", " ", text).strip() - text = text.replace("<em>", "").replace("</em>", "") # hack to remove abstract prefixes for prefix in UNWANTED_ABSTRACT_PREFIXES: @@ -342,9 +382,9 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: ret = ScholarBiblio( release_ident=release.ident, - title=release.title, - subtitle=release.subtitle, - original_title=release.original_title, + title=clean_str(release.title, strip_trailing_period=True), + subtitle=clean_str(release.subtitle, strip_trailing_period=True), + original_title=clean_str(release.original_title, strip_trailing_period=True), release_date=release.release_date, release_year=release.release_year, release_type=release.release_type, @@ -372,7 +412,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: mag_id=release.ext_ids.mag, license_slug=release.license_slug, publisher=publisher, - container_name=container_name, + container_name=clean_str(container_name), container_original_name=container_original_name, container_ident=container_ident, container_type=container_type, @@ -380,13 +420,20 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: issns=issns, # TODO; these filters sort of meh. refactor to be above? contrib_names=list( - filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs]) + filter( + lambda x: bool(x), + [clean_str(contrib_name(c)) for c in release.contribs], + ) ), contrib_count=len([c for c in release.contribs if c.index]), affiliations=list( filter( lambda x: bool(x), - [contrib_affiliation(c) for c in release.contribs if c.index], + [ + clean_str(contrib_affiliation(c)) + for c in release.contribs + if c.index + ], ) ), ) diff --git a/tests/test_scrub.py b/tests/test_scrub.py index 6541e13..b142c10 100644 --- a/tests/test_scrub.py +++ b/tests/test_scrub.py @@ -1,4 +1,4 @@ -from fatcat_scholar.schema import scrub_text +from fatcat_scholar.schema import scrub_text, clean_str def test_scrub() -> None: @@ -12,3 +12,21 @@ def test_scrub() -> None: for raw, fixed in vectors: assert fixed == scrub_text(raw) + + +def test_clean_str() -> None: + vectors = [ + ( + "Di� Hekimli�i Fak�ltesi ��rencilerinde Temporomandibular Eklem Rahats�zl�klar�n�n ve A��z Sa�l��� Al��kanl�klar�n�n De�erlendirilmesi", + "Di� Hekimli�i Fak�ltesi ��rencilerinde Temporomandibular Eklem Rahats�zl�klar�n�n ve A��z Sa�l��� Al��kanl�klar�n�n De�erlendirilmesi", + ), + ("<jats:p>blah thing", "blah thing"), + ("title with <i>italics</i>", "title with italics"), + ("title with <sup>partial super", "title with partial super"), + ("", None), + ("&NA", None), + (None, None), + ] + + for raw, fixed in vectors: + assert fixed == clean_str(raw) |