about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-02-24 15:23:59 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-02-24 15:23:59 -0800
commit: 0186b53f819687427a8367d6c5729d10ff25a4be (patch)
tree: 273a3896f7af3b33f4f61794e46860deeffbf90a
parent: 4b8a31b4db4329e55800485d474bd56ca55c78d0 (diff)
download: fatcat-scholar-0186b53f819687427a8367d6c5729d10ff25a4be.tar.gz
fatcat-scholar-0186b53f819687427a8367d6c5729d10ff25a4be.zip
fix body size limit
-rw-r--r-- fatcat_scholar/transform.py 8
1 files changed, 4 insertions, 4 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index bf24da9..ca648b2 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -23,7 +23,7 @@ def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]:
issue_item = sim["issue_item"]
body = "\n".join([p["raw_text"] for p in sim["page_texts"]])
if body and len(body) > MAX_BODY_CHARS:
- body = body[MAX_BODY_CHARS:]
+ body = body[:MAX_BODY_CHARS]
return ScholarFulltext(
lang_code=None, # TODO: pub/issue metadata? or langdetect?
body=body,
@@ -228,7 +228,7 @@ def es_fulltext_from_grobid(
return None
body = tei_dict.get("body")
if body and len(body) > MAX_BODY_CHARS:
- body = body[MAX_BODY_CHARS:]
+ body = body[:MAX_BODY_CHARS]
ret = ScholarFulltext(
lang_code=tei_dict.get("lang"),
body=body,
@@ -243,7 +243,7 @@ def es_fulltext_from_pdftotext(
) -> Optional[ScholarFulltext]:
if raw_text and len(raw_text) > MAX_BODY_CHARS:
- raw_text = raw_text[MAX_BODY_CHARS:]
+ raw_text = raw_text[:MAX_BODY_CHARS]
ret = ScholarFulltext(
lang_code=re.language, body=raw_text, acknowledgement=None, annex=None,
)
@@ -263,7 +263,7 @@ def es_fulltext_from_html(
if body:
raw_text = " ".join(body.itertext())
if raw_text and len(raw_text) > MAX_BODY_CHARS:
- raw_text = raw_text[MAX_BODY_CHARS:]
+ raw_text = raw_text[:MAX_BODY_CHARS]
else:
return None