summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-05-21 20:02:41 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-05-21 20:02:41 -0700
commit 9eee6f9aef0469d81e57543f0488254c39ac2b66 (patch)
tree 6271ca400e78cef6e854246de095bd57741434ef
parent d5693718e45eb3a635b6cd47559d6e807ae78c21 (diff)
download fatcat-scholar-9eee6f9aef0469d81e57543f0488254c39ac2b66.tar.gz
fatcat-scholar-9eee6f9aef0469d81e57543f0488254c39ac2b66.zip
add prefix scrubbing (esp. for abstracts)
-rw-r--r-- fatcat_scholar/schema.py 18
1 files changed, 18 insertions, 0 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index a11da8d..55d61ca 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -174,6 +174,17 @@ def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
# TODO: should we default to Crossref?
return None
+UNWANTED_ABSTRACT_PREFIXES = [
+ # roughly sort this long to short
+ 'Abstract No Abstract ',
+ 'Publisher Summary ',
+ 'Abstract ',
+ 'ABSTRACT ',
+ 'Summary ',
+ 'Background: ',
+ 'Background ',
+]
+
def scrub_text(raw: str, mimetype: str = None) -> str:
"""
This function takes a mimetype-hinted string and tries to reduce it to a
@@ -191,6 +202,13 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
except Exception as e:
raise e
raw = ftfy.fix_text(raw)
+
+ # hack to remove abstract prefixes
+ for prefix in UNWANTED_ABSTRACT_PREFIXES:
+ if raw.startswith(prefix):
+ raw = raw[len(prefix):]
+ break
+
assert raw, "Empty abstract"
return raw