-rw-r--r--  fatcat_scholar/schema.py                     | 18
-rw-r--r--  fatcat_scholar/search.py                     |  4
-rw-r--r--  fatcat_scholar/templates/search_macros.html  |  2
-rw-r--r--  fatcat_scholar/transform.py                  |  9
-rw-r--r--  fatcat_scholar/work_pipeline.py              | 21
-rw-r--r--  schema/scholar_fulltext.v01.json             |  4
6 files changed, 33 insertions, 25 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index aa4ed52..74c80c8 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -186,20 +186,23 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
     TODO: barely implemented yet
     """
     if "<jats" in raw or (mimetype and "application/xml" in mimetype):
-        root = ET.fromstring(raw)
-        raw = " ".join(list(root.itertext())) or ""
+        try:
+            root = ET.fromstring(raw)
+            raw = " ".join(list(root.itertext())) or ""
+        except:
+            pass
     raw = ftfy.fix_text(raw)
     assert raw, "Empty abstract"
     return raw
 
 def contrib_name(contrib: ReleaseContrib) -> str:
     # TODO: support more cultural normals for name presentation
-    if contrib.given_name and contrib.family_name:
-        return f"{contrib.given_name} {contrib.family_name}"
+    if contrib.given_name and contrib.surname:
+        return f"{contrib.given_name} {contrib.surname}"
     elif contrib.raw_name:
         return contrib.raw_name
-    elif contrib.family_name:
-        return contrib.family_name
+    elif contrib.surname:
+        return contrib.surname
     else:
         return contrib.given_name
 
@@ -287,7 +290,8 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
         container_issnl=container_issnl,
         issns=issns,
 
-        contrib_names=[contrib_name(c) for c in release.contribs if c.index],
+        # TODO; these filters sort of meh. refactor to be above?
+        contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs if c.index])),
         contrib_count = len([c for c in release.contribs if c.index]),
         affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
     )
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 08eadae..b816c5b 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -114,13 +114,13 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None
         lenient=True,
         fields=[
             "everything",
-            "abstract",
+            "abstracts_all",
             "fulltext.body",
             "fulltext.annex",
         ],
     )
     search = search.highlight(
-        "abstract",
+        "abstracts_all",
         "fulltext.body",
         "fulltext.annex",
         number_of_fragments=3,
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index f2c452f..07cac15 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -105,7 +105,7 @@
       {% if paper.abstracts[0].body|length > 500 %}
         {{ paper.abstracts[0].body[:500] }}...
       {% else %}
-        {{ paper.abstracts[0].body[:500] }}...
+        {{ paper.abstracts[0].body }}
       {% endif %}
       </div>
     {% else %}
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index d858a4c..ab63aa6 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -159,6 +159,8 @@ def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: Fil
 
 def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
     obj = teixml2json(tei_xml)
+    if not obj.get('body'):
+        return None
     ret = ScholarFulltext(
         lang_code=obj.get('lang'),
         body=obj.get('body'),
@@ -209,10 +211,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
         raise NotImplementedError(f"doc_type: {heavy.doc_type}")
 
     if heavy.grobid_fulltext:
-
         fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
         fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
-
         fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
 
         # hack to pull through thumbnail from local pdftotext
@@ -221,9 +221,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png"
 
     if not fulltext and heavy.pdftotext_fulltext:
-
-        fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
-        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
+        fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0]
+        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0]
         fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)
 
     # TODO: additional access list
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index c93cb29..9ce72b1 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -2,6 +2,7 @@
 import os
 import io
 import sys
+import minio
 import argparse
 from pydantic import BaseModel, validator
 from typing import List, Dict, Tuple, Optional, Any, Sequence
@@ -88,14 +89,17 @@ class WorkPipeline():
         if not grobid_meta or grobid_meta['status'] != 'success':
             return None
         #print(grobid_meta)
-        grobid_xml = self.sandcrawler_s3_client.get_blob(
-            folder="grobid",
-            sha1hex=fe.sha1,
-            extension=".tei.xml",
-            prefix="",
-            bucket="sandcrawler",
-        )
-        #print(grobid_xml)
+        try:
+            grobid_xml = self.sandcrawler_s3_client.get_blob(
+                folder="grobid",
+                sha1hex=fe.sha1,
+                extension=".tei.xml",
+                prefix="",
+                bucket="sandcrawler",
+            )
+            #print(grobid_xml)
+        except minio.error.NoSuchKey:
+            return None
         return dict(
             tei_xml=grobid_xml,
             release_ident=release_ident,
@@ -338,6 +342,7 @@ def main():
            access_key=os.environ.get('MINIO_ACCESS_KEY'),
            secret_key=os.environ.get('MINIO_SECRET_KEY'),
         ),
+        fulltext_cache_dir=args.fulltext_cache_dir,
     )
 
     if args.func == 'run_releases':
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
index 8024702..d5d7852 100644
--- a/schema/scholar_fulltext.v01.json
+++ b/schema/scholar_fulltext.v01.json
@@ -41,7 +41,7 @@
     "dynamic": false,
     "_source": {
         "excludes": [
-            "abstracts.body",
+            "abstracts_all",
             "fulltext.body",
             "fulltext.acknowledgment",
             "fulltext.annex",
@@ -140,7 +140,7 @@
             "type": "nested",
             "dynamic": false,
             "properties": {
-                "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["everything", "abstracts_all"], "store": true },
+                "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["everything", "abstracts_all"] },
                 "lang_code": { "type": "keyword", "normalizer": "default" }
             }
         },