-rw-r--r--  fatcat_scholar/schema.py                     | 18
-rw-r--r--  fatcat_scholar/search.py                     |  4
-rw-r--r--  fatcat_scholar/templates/search_macros.html  |  2
-rw-r--r--  fatcat_scholar/transform.py                  |  9
-rw-r--r--  fatcat_scholar/work_pipeline.py              | 21
-rw-r--r--  schema/scholar_fulltext.v01.json             |  4
6 files changed, 33 insertions, 25 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index aa4ed52..74c80c8 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -186,20 +186,23 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
     TODO: barely implemented yet
     """
     if "<jats" in raw or (mimetype and "application/xml" in mimetype):
-        root = ET.fromstring(raw)
-        raw = " ".join(list(root.itertext())) or ""
+        try:
+            root = ET.fromstring(raw)
+            raw = " ".join(list(root.itertext())) or ""
+        except:
+            pass
     raw = ftfy.fix_text(raw)
     assert raw, "Empty abstract"
     return raw
 
 def contrib_name(contrib: ReleaseContrib) -> str:
     # TODO: support more cultural normals for name presentation
-    if contrib.given_name and contrib.family_name:
-        return f"{contrib.given_name} {contrib.family_name}"
+    if contrib.given_name and contrib.surname:
+        return f"{contrib.given_name} {contrib.surname}"
     elif contrib.raw_name:
         return contrib.raw_name
-    elif contrib.family_name:
-        return contrib.family_name
+    elif contrib.surname:
+        return contrib.surname
     else:
         return contrib.given_name
 
@@ -287,7 +290,8 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
         container_issnl=container_issnl,
         issns=issns,
 
-        contrib_names=[contrib_name(c) for c in release.contribs if c.index],
+        # TODO; these filters sort of meh. refactor to be above?
+        contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs if c.index])),
         contrib_count = len([c for c in release.contribs if c.index]),
         affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
     )
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 08eadae..b816c5b 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -114,13 +114,13 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None
         lenient=True,
         fields=[
             "everything",
-            "abstract",
+            "abstracts_all",
             "fulltext.body",
             "fulltext.annex",
         ],
     )
     search = search.highlight(
-        "abstract",
+        "abstracts_all",
         "fulltext.body",
         "fulltext.annex",
         number_of_fragments=3,
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index f2c452f..07cac15 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -105,7 +105,7 @@
       {% if paper.abstracts[0].body|length > 500 %}
         {{ paper.abstracts[0].body[:500] }}...
       {% else %}
-        {{ paper.abstracts[0].body[:500] }}...
+        {{ paper.abstracts[0].body }}
       {% endif %}
       </div>
     {% else %}
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index d858a4c..ab63aa6 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -159,6 +159,8 @@ def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: Fil
 
 def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
     obj = teixml2json(tei_xml)
+    if not obj.get('body'):
+        return None
     ret = ScholarFulltext(
         lang_code=obj.get('lang'),
         body=obj.get('body'),
@@ -209,10 +211,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
         raise NotImplementedError(f"doc_type: {heavy.doc_type}")
 
     if heavy.grobid_fulltext:
-
         fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
         fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
-
         fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
 
         # hack to pull through thumbnail from local pdftotext
@@ -221,9 +221,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png"
 
     if not fulltext and heavy.pdftotext_fulltext:
-
-        fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
-        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
+        fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0]
+        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0]
         fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)
 
     # TODO: additional access list
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index c93cb29..9ce72b1 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -2,6 +2,7 @@
 import os
 import io
 import sys
+import minio
 import argparse
 from pydantic import BaseModel, validator
 from typing import List, Dict, Tuple, Optional, Any, Sequence
@@ -88,14 +89,17 @@ class WorkPipeline():
         if not grobid_meta or grobid_meta['status'] != 'success':
             return None
         #print(grobid_meta)
-        grobid_xml = self.sandcrawler_s3_client.get_blob(
-            folder="grobid",
-            sha1hex=fe.sha1,
-            extension=".tei.xml",
-            prefix="",
-            bucket="sandcrawler",
-        )
-        #print(grobid_xml)
+        try:
+            grobid_xml = self.sandcrawler_s3_client.get_blob(
+                folder="grobid",
+                sha1hex=fe.sha1,
+                extension=".tei.xml",
+                prefix="",
+                bucket="sandcrawler",
+            )
+            #print(grobid_xml)
+        except minio.error.NoSuchKey:
+            return None
         return dict(
             tei_xml=grobid_xml,
             release_ident=release_ident,
@@ -338,6 +342,7 @@ def main():
            access_key=os.environ.get('MINIO_ACCESS_KEY'),
            secret_key=os.environ.get('MINIO_SECRET_KEY'),
         ),
+        fulltext_cache_dir=args.fulltext_cache_dir,
     )
 
     if args.func == 'run_releases':
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
index 8024702..d5d7852 100644
--- a/schema/scholar_fulltext.v01.json
+++ b/schema/scholar_fulltext.v01.json
@@ -41,7 +41,7 @@
     "dynamic": false,
     "_source": {
         "excludes": [
-            "abstracts.body",
+            "abstracts_all",
             "fulltext.body",
             "fulltext.acknowledgment",
             "fulltext.annex",
@@ -140,7 +140,7 @@
             "type": "nested",
             "dynamic": false,
             "properties": {
-                "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["everything", "abstracts_all"], "store": true },
+                "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["everything", "abstracts_all"] },
                 "lang_code": { "type": "keyword", "normalizer": "default" }
             }
         },