summary refs log tree commit diff stats
path: root/fatcat_scholar/schema.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- fatcat_scholar/schema.py 18
1 files changed, 18 insertions, 0 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index a11da8d..55d61ca 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -174,6 +174,17 @@ def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
# TODO: should we default to Crossref?
return None
+UNWANTED_ABSTRACT_PREFIXES = [
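+ # keep roughly sorted longest-first: matching stops at the first hit, so a longer prefix must come before any shorter prefix it begins with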
+ 'Abstract No Abstract ',
+ 'Publisher Summary ',
+ 'Abstract ',
+ 'ABSTRACT ',
+ 'Summary ',
+ 'Background: ',
+ 'Background ',
+]
+
def scrub_text(raw: str, mimetype: str = None) -> str:
"""
This function takes a mimetype-hinted string and tries to reduce it to a
@@ -191,6 +202,13 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
except Exception as e:
raise e
raw = ftfy.fix_text(raw)
+
+ # hack: strip at most one known boilerplate prefix from the start of the text (first match in list order wins)
+ for prefix in UNWANTED_ABSTRACT_PREFIXES:
+ if raw.startswith(prefix):
+ raw = raw[len(prefix):]
+ break
+
assert raw, "Empty abstract"
return raw