diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 20:39:02 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 20:39:02 -0700 |
commit | 5a508d61daf23a4bfa337c4229bbb6795b69fbd2 (patch) | |
tree | 3a8e744411c2db215d666cd600d4679c5a16e9a9 /fatcat_scholar/transform.py | |
parent | c71314e46dcf18905d1957579a211bb47c520d57 (diff) | |
download | fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.tar.gz fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.zip |
fixes from manual testing
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- | fatcat_scholar/transform.py | 9 |
1 files changed, 4 insertions, 5 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index d858a4c..ab63aa6 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -159,6 +159,8 @@ def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: Fil
 
 def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
     obj = teixml2json(tei_xml)
+    if not obj.get('body'):
+        return None
     ret = ScholarFulltext(
         lang_code=obj.get('lang'),
         body=obj.get('body'),
@@ -209,10 +211,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
         raise NotImplementedError(f"doc_type: {heavy.doc_type}")
 
     if heavy.grobid_fulltext:
-
         fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
         fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
-
         fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
 
         # hack to pull through thumbnail from local pdftotext
@@ -221,9 +221,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png"
 
     if not fulltext and heavy.pdftotext_fulltext:
-
-        fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
-        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
+        fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0]
+        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0]
         fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)
 
     # TODO: additional access list