summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-29 20:35:32 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-29 20:35:32 -0700
commit 06adf39b0ab38e08d1977127606b784d1a9aca4f (patch)
tree a9d3fd4bcdbff89c3849a4cc0a7467d121df293e
parent a0babf64426da4b5c19da224f17c9c8f683ed57d (diff)
download fatcat-scholar-06adf39b0ab38e08d1977127606b784d1a9aca4f.tar.gz
fatcat-scholar-06adf39b0ab38e08d1977127606b784d1a9aca4f.zip
include GROBID-extracted abstracts in search documents
-rw-r--r-- fatcat_scholar/schema.py 8
-rw-r--r-- fatcat_scholar/transform.py 25
2 files changed, 23 insertions, 10 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index cf88011..35cf9a1 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -255,6 +255,14 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
return None
+def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]:
+
+ if tei_dict.get("abstract"):
+ return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=scrub_text(tei_dict["abstract"]))]
+ else:
+ return []
+
+
def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
d = dict()
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7cc0ea5..c219528 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -163,16 +163,15 @@ def _add_file_release_meta(
def es_fulltext_from_grobid(
- tei_xml: str, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
+ tei_dict: dict, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
) -> Optional[ScholarFulltext]:
- obj = teixml2json(tei_xml)
- if not obj.get("body"):
+ if not tei_dict.get("body"):
return None
ret = ScholarFulltext(
- lang_code=obj.get("lang"),
- body=obj.get("body"),
- acknowledgement=obj.get("acknowledgement"),
- annex=obj.get("annex"),
+ lang_code=tei_dict.get("lang"),
+ body=tei_dict.get("body"),
+ acknowledgement=tei_dict.get("acknowledgement"),
+ annex=tei_dict.get("annex"),
)
return _add_file_release_meta(ret, pdf_meta, re, fe)
@@ -219,9 +218,12 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
r for r in heavy.releases if r.ident == heavy.biblio_release_ident
][0]
biblio = es_biblio_from_release(primary_release)
-
- # TODO: abstracts from releases also; abstracts_dict; abstracts from GROBID parse
abstracts = es_abstracts_from_release(primary_release)
+
+ # if no abstract from primary_release, try all the other releases
+ for release in heavy.releases:
+ if not abstracts:
+ abstracts = es_abstracts_from_release(release)
else:
raise NotImplementedError(f"doc_type: {heavy.doc_type}")
@@ -236,9 +238,12 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
for f in fulltext_release.files
if f.ident == heavy.grobid_fulltext["file_ident"]
][0]
+ tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
fulltext = es_fulltext_from_grobid(
- heavy.grobid_fulltext["tei_xml"], heavy.pdf_meta, fulltext_release, fulltext_file
+ tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
)
+ if not abstracts:
+ abstracts = es_abstracts_from_grobid(tei_dict)
if not fulltext and heavy.pdftotext_fulltext:
fulltext_release = [