From 9eee6f9aef0469d81e57543f0488254c39ac2b66 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 May 2020 20:02:41 -0700 Subject: add prefix scrubbing (esp. for abstracts) --- fatcat_scholar/schema.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index a11da8d..55d61ca 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -174,6 +174,17 @@ def release_doi_registrar(release: ReleaseEntity) -> Optional[str]: # TODO: should we default to Crossref? return None +UNWANTED_ABSTRACT_PREFIXES = [ + # roughly sort this long to short + 'Abstract No Abstract ', + 'Publisher Summary ', + 'Abstract ', + 'ABSTRACT ', + 'Summary ', + 'Background: ', + 'Background ', +] + def scrub_text(raw: str, mimetype: str = None) -> str: """ This function takes a mimetype-hinted string and tries to reduce it to a @@ -191,6 +202,13 @@ def scrub_text(raw: str, mimetype: str = None) -> str: except Exception as e: raise e raw = ftfy.fix_text(raw) + + # hack to remove abstract prefixes + for prefix in UNWANTED_ABSTRACT_PREFIXES: + if raw.startswith(prefix): + raw = raw[len(prefix):] + break + assert raw, "Empty abstract" return raw -- cgit v1.2.3