From 1f078fe94a5cf5322527b97dcdf0cb054e0c7540 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 3 Nov 2021 18:41:30 -0700
Subject: grobid: handle whitespace-only 'unstructured' strings from crossref

See also: https://github.com/kermitt2/grobid/issues/849
---
 python/sandcrawler/grobid.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index e28c488..7d7f6b5 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -27,6 +27,7 @@ def clean_crossref_unstructured(raw: str) -> str:
         raw = html.unescape(raw)
 
     raw.replace("  ", " ")
+    raw = raw.strip()
     return raw
 
 
@@ -56,6 +57,14 @@ def test_clean_ref_str() -> None:
         == """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
     )
 
+    # input consisting entirely of non-breaking spaces (U+00A0)
+    assert (
+        clean_crossref_unstructured(
+            "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
+        )
+        == ""
+    )
+
 
 class GrobidClient(object):
     def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
@@ -173,7 +182,7 @@ class GrobidClient(object):
         """
         if ref.get("DOI"):
             return False
-        if len(ref.get("unstructured", "")) <= 6:
+        if len(ref.get("unstructured", "").strip()) <= 6:
             return False
 
         # TODO: what other combinations are enough to skip parsing?
-- 
cgit v1.2.3