about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- fatcat_scholar/schema.py 18
-rw-r--r-- fatcat_scholar/search.py 4
-rw-r--r-- fatcat_scholar/templates/search_macros.html 2
-rw-r--r-- fatcat_scholar/transform.py 9
-rw-r--r-- fatcat_scholar/work_pipeline.py 21
-rw-r--r-- schema/scholar_fulltext.v01.json 4
6 files changed, 33 insertions, 25 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index aa4ed52..74c80c8 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -186,20 +186,23 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
TODO: barely implemented yet
"""
if "<jats" in raw or (mimetype and "application/xml" in mimetype):
- root = ET.fromstring(raw)
- raw = " ".join(list(root.itertext())) or ""
+ try:
+ root = ET.fromstring(raw)
+ raw = " ".join(list(root.itertext())) or ""
+ except:
+ pass
raw = ftfy.fix_text(raw)
assert raw, "Empty abstract"
return raw
def contrib_name(contrib: ReleaseContrib) -> str:
# TODO: support more cultural normals for name presentation
- if contrib.given_name and contrib.family_name:
- return f"{contrib.given_name} {contrib.family_name}"
+ if contrib.given_name and contrib.surname:
+ return f"{contrib.given_name} {contrib.surname}"
elif contrib.raw_name:
return contrib.raw_name
- elif contrib.family_name:
- return contrib.family_name
+ elif contrib.surname:
+ return contrib.surname
else:
return contrib.given_name
@@ -287,7 +290,8 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
container_issnl=container_issnl,
issns=issns,
- contrib_names=[contrib_name(c) for c in release.contribs if c.index],
+ # TODO; these filters sort of meh. refactor to be above?
+ contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs if c.index])),
contrib_count = len([c for c in release.contribs if c.index]),
affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
)
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 08eadae..b816c5b 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -114,13 +114,13 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None
lenient=True,
fields=[
"everything",
- "abstract",
+ "abstracts_all",
"fulltext.body",
"fulltext.annex",
],
)
search = search.highlight(
- "abstract",
+ "abstracts_all",
"fulltext.body",
"fulltext.annex",
number_of_fragments=3,
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index f2c452f..07cac15 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -105,7 +105,7 @@
{% if paper.abstracts[0].body|length > 500 %}
{{ paper.abstracts[0].body[:500] }}...
{% else %}
- {{ paper.abstracts[0].body[:500] }}...
+ {{ paper.abstracts[0].body }}
{% endif %}
</div>
{% else %}
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index d858a4c..ab63aa6 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -159,6 +159,8 @@ def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: Fil
def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
obj = teixml2json(tei_xml)
+ if not obj.get('body'):
+ return None
ret = ScholarFulltext(
lang_code=obj.get('lang'),
body=obj.get('body'),
@@ -209,10 +211,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
raise NotImplementedError(f"doc_type: {heavy.doc_type}")
if heavy.grobid_fulltext:
-
fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
-
fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
# hack to pull through thumbnail from local pdftotext
@@ -221,9 +221,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png"
if not fulltext and heavy.pdftotext_fulltext:
-
- fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
- fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
+ fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0]
+ fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0]
fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)
# TODO: additional access list
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index c93cb29..9ce72b1 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -2,6 +2,7 @@
import os
import io
import sys
+import minio
import argparse
from pydantic import BaseModel, validator
from typing import List, Dict, Tuple, Optional, Any, Sequence
@@ -88,14 +89,17 @@ class WorkPipeline():
if not grobid_meta or grobid_meta['status'] != 'success':
return None
#print(grobid_meta)
- grobid_xml = self.sandcrawler_s3_client.get_blob(
- folder="grobid",
- sha1hex=fe.sha1,
- extension=".tei.xml",
- prefix="",
- bucket="sandcrawler",
- )
- #print(grobid_xml)
+ try:
+ grobid_xml = self.sandcrawler_s3_client.get_blob(
+ folder="grobid",
+ sha1hex=fe.sha1,
+ extension=".tei.xml",
+ prefix="",
+ bucket="sandcrawler",
+ )
+ #print(grobid_xml)
+ except minio.error.NoSuchKey:
+ return None
return dict(
tei_xml=grobid_xml,
release_ident=release_ident,
@@ -338,6 +342,7 @@ def main():
access_key=os.environ.get('MINIO_ACCESS_KEY'),
secret_key=os.environ.get('MINIO_SECRET_KEY'),
),
+ fulltext_cache_dir=args.fulltext_cache_dir,
)
if args.func == 'run_releases':
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
index 8024702..d5d7852 100644
--- a/schema/scholar_fulltext.v01.json
+++ b/schema/scholar_fulltext.v01.json
@@ -41,7 +41,7 @@
"dynamic": false,
"_source": {
"excludes": [
- "abstracts.body",
+ "abstracts_all",
"fulltext.body",
"fulltext.acknowledgment",
"fulltext.annex",
@@ -140,7 +140,7 @@
"type": "nested",
"dynamic": false,
"properties": {
- "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["everything", "abstracts_all"], "store": true },
+ "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["everything", "abstracts_all"] },
"lang_code": { "type": "keyword", "normalizer": "default" }
}
},