diff options
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- | fatcat_scholar/transform.py | 13 |
1 file changed, 10 insertions, 3 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7264540..db631cf 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -246,13 +246,18 @@ def es_fulltext_from_pdftotext(
     if raw_text and len(raw_text) > MAX_BODY_CHARS:
         raw_text = raw_text[:MAX_BODY_CHARS]
     ret = ScholarFulltext(
-        lang_code=re.language, body=raw_text, acknowledgement=None, annex=None,
+        lang_code=re.language,
+        body=raw_text,
+        acknowledgement=None,
+        annex=None,
     )
     return _add_file_release_meta(ret, pdf_meta, re, fe)
 
 
 def es_fulltext_from_html(
-    html_fulltext: Dict[str, Any], re: ReleaseEntity, wc: WebcaptureEntity,
+    html_fulltext: Dict[str, Any],
+    re: ReleaseEntity,
+    wc: WebcaptureEntity,
 ) -> Optional[ScholarFulltext]:
 
     if not wc.archive_urls or not html_fulltext.get("tei_xml"):
@@ -546,7 +551,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             if f.ident == heavy.html_fulltext["webcapture_ident"]
         ][0]
         html_fulltext = es_fulltext_from_html(
-            heavy.html_fulltext, fulltext_release, fulltext_webcapture,
+            heavy.html_fulltext,
+            fulltext_release,
+            fulltext_webcapture,
         )
         if exclude_web_fulltext and html_fulltext:
             fulltext = html_fulltext.remove_access()