about summary refs log tree commit diff stats
path: root/fatcat_scholar/transform.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- fatcat_scholar/transform.py 13
1 files changed, 10 insertions, 3 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7264540..db631cf 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -246,13 +246,18 @@ def es_fulltext_from_pdftotext(
if raw_text and len(raw_text) > MAX_BODY_CHARS:
raw_text = raw_text[:MAX_BODY_CHARS]
ret = ScholarFulltext(
- lang_code=re.language, body=raw_text, acknowledgement=None, annex=None,
+ lang_code=re.language,
+ body=raw_text,
+ acknowledgement=None,
+ annex=None,
)
return _add_file_release_meta(ret, pdf_meta, re, fe)
def es_fulltext_from_html(
- html_fulltext: Dict[str, Any], re: ReleaseEntity, wc: WebcaptureEntity,
+ html_fulltext: Dict[str, Any],
+ re: ReleaseEntity,
+ wc: WebcaptureEntity,
) -> Optional[ScholarFulltext]:
if not wc.archive_urls or not html_fulltext.get("tei_xml"):
@@ -546,7 +551,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
if f.ident == heavy.html_fulltext["webcapture_ident"]
][0]
html_fulltext = es_fulltext_from_html(
- heavy.html_fulltext, fulltext_release, fulltext_webcapture,
+ heavy.html_fulltext,
+ fulltext_release,
+ fulltext_webcapture,
)
if exclude_web_fulltext and html_fulltext:
fulltext = html_fulltext.remove_access()