diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 20:35:32 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 20:35:32 -0700 |
commit | 06adf39b0ab38e08d1977127606b784d1a9aca4f (patch) | |
tree | a9d3fd4bcdbff89c3849a4cc0a7467d121df293e /fatcat_scholar | |
parent | a0babf64426da4b5c19da224f17c9c8f683ed57d (diff) | |
download | fatcat-scholar-06adf39b0ab38e08d1977127606b784d1a9aca4f.tar.gz fatcat-scholar-06adf39b0ab38e08d1977127606b784d1a9aca4f.zip |
include GROBID-extracted abstracts in search documents
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/schema.py | 8 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 25 |
2 files changed, 23 insertions, 10 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index cf88011..35cf9a1 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -255,6 +255,14 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
     return None
 
 
+def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]:
+
+    if tei_dict.get("abstract"):
+        return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=scrub_text(tei_dict["abstract"]))]
+    else:
+        return []
+
+
 def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
 
     d = dict()
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7cc0ea5..c219528 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -163,16 +163,15 @@ def _add_file_release_meta(
 
 
 def es_fulltext_from_grobid(
-    tei_xml: str, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
+    tei_dict: dict, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
 ) -> Optional[ScholarFulltext]:
-    obj = teixml2json(tei_xml)
-    if not obj.get("body"):
+    if not tei_dict.get("body"):
         return None
     ret = ScholarFulltext(
-        lang_code=obj.get("lang"),
-        body=obj.get("body"),
-        acknowledgement=obj.get("acknowledgement"),
-        annex=obj.get("annex"),
+        lang_code=tei_dict.get("lang"),
+        body=tei_dict.get("body"),
+        acknowledgement=tei_dict.get("acknowledgement"),
+        annex=tei_dict.get("annex"),
     )
     return _add_file_release_meta(ret, pdf_meta, re, fe)
 
@@ -219,9 +218,12 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             r for r in heavy.releases if r.ident == heavy.biblio_release_ident
         ][0]
         biblio = es_biblio_from_release(primary_release)
-
-        # TODO: abstracts from releases also; abstracts_dict; abstracts from GROBID parse
         abstracts = es_abstracts_from_release(primary_release)
+
+        # if no abstract from primary_release, try all the other releases
+        for release in heavy.releases:
+            if not abstracts:
+                abstracts = es_abstracts_from_release(release)
     else:
         raise NotImplementedError(f"doc_type: {heavy.doc_type}")
 
@@ -236,9 +238,12 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             for f in fulltext_release.files
             if f.ident == heavy.grobid_fulltext["file_ident"]
         ][0]
+        tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
         fulltext = es_fulltext_from_grobid(
-            heavy.grobid_fulltext["tei_xml"], heavy.pdf_meta, fulltext_release, fulltext_file
+            tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
         )
+        if not abstracts:
+            abstracts = es_abstracts_from_grobid(tei_dict)
 
     if not fulltext and heavy.pdftotext_fulltext:
         fulltext_release = [