about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/grobid.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- python/sandcrawler/grobid.py 11
1 file changed, 10 insertions, 1 deletion
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index e28c488..7d7f6b5 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -27,6 +27,7 @@ def clean_crossref_unstructured(raw: str) -> str:
raw = html.unescape(raw)
raw.replace(" ", " ")
+ raw = raw.strip()
return raw
@@ -56,6 +57,14 @@ def test_clean_ref_str() -> None:
== """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
)
+ # all non-breaking whitespace
+ assert (
+ clean_crossref_unstructured(
+ "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
+ )
+ == ""
+ )
+
class GrobidClient(object):
def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
@@ -173,7 +182,7 @@ class GrobidClient(object):
"""
if ref.get("DOI"):
return False
- if len(ref.get("unstructured", "")) <= 6:
+ if len(ref.get("unstructured", "").strip()) <= 6:
return False
# TODO: what other combinations are enough to skip parsing?