diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 20:02:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 20:02:41 -0700 |
commit | 9eee6f9aef0469d81e57543f0488254c39ac2b66 (patch) | |
tree | 6271ca400e78cef6e854246de095bd57741434ef /fatcat_scholar | |
parent | d5693718e45eb3a635b6cd47559d6e807ae78c21 (diff) | |
download | fatcat-scholar-9eee6f9aef0469d81e57543f0488254c39ac2b66.tar.gz fatcat-scholar-9eee6f9aef0469d81e57543f0488254c39ac2b66.zip |
add prefix scrubbing (esp. for abstracts)
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/schema.py | 18 |
1 files changed, 18 insertions, 0 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index a11da8d..55d61ca 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -174,6 +174,17 @@ def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
     # TODO: should we default to Crossref?
     return None
 
+UNWANTED_ABSTRACT_PREFIXES = [
+    # roughly sort this long to short
+    'Abstract No Abstract ',
+    'Publisher Summary ',
+    'Abstract ',
+    'ABSTRACT ',
+    'Summary ',
+    'Background: ',
+    'Background ',
+]
+
 def scrub_text(raw: str, mimetype: str = None) -> str:
     """
     This function takes a mimetype-hinted string and tries to reduce it to a
@@ -191,6 +202,13 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
         except Exception as e:
             raise e
     raw = ftfy.fix_text(raw)
+
+    # hack to remove abstract prefixes
+    for prefix in UNWANTED_ABSTRACT_PREFIXES:
+        if raw.startswith(prefix):
+            raw = raw[len(prefix):]
+            break
+
     assert raw, "Empty abstract"
     return raw