From 1f078fe94a5cf5322527b97dcdf0cb054e0c7540 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 3 Nov 2021 18:41:30 -0700 Subject: grobid: handle weird whitespace unstructured from crossref See also: https://github.com/kermitt2/grobid/issues/849 --- python/sandcrawler/grobid.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index e28c488..7d7f6b5 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -27,6 +27,7 @@ def clean_crossref_unstructured(raw: str) -> str: raw = html.unescape(raw) raw.replace(" ", " ") + raw = raw.strip() return raw @@ -56,6 +57,14 @@ def test_clean_ref_str() -> None: == """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg.""" ) + # all non-breaking whitespace + assert ( + clean_crossref_unstructured( + "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0" + ) + == "" + ) + class GrobidClient(object): def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs): @@ -173,7 +182,7 @@ class GrobidClient(object): """ if ref.get("DOI"): return False - if len(ref.get("unstructured", "")) <= 6: + if len(ref.get("unstructured", "").strip()) <= 6: return False # TODO: what other combinations are enough to skip parsing? -- cgit v1.2.3