diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-02-24 15:23:59 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-02-24 15:23:59 -0800 |
commit | 0186b53f819687427a8367d6c5729d10ff25a4be (patch) | |
tree | 273a3896f7af3b33f4f61794e46860deeffbf90a /fatcat_scholar | |
parent | 4b8a31b4db4329e55800485d474bd56ca55c78d0 (diff) | |
download | fatcat-scholar-0186b53f819687427a8367d6c5729d10ff25a4be.tar.gz fatcat-scholar-0186b53f819687427a8367d6c5729d10ff25a4be.zip |
fix body size limit
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/transform.py | 8 |
1 file changed, 4 insertions, 4 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index bf24da9..ca648b2 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -23,7 +23,7 @@ def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]:
     issue_item = sim["issue_item"]
     body = "\n".join([p["raw_text"] for p in sim["page_texts"]])
     if body and len(body) > MAX_BODY_CHARS:
-        body = body[MAX_BODY_CHARS:]
+        body = body[:MAX_BODY_CHARS]
     return ScholarFulltext(
         lang_code=None,  # TODO: pub/issue metadata? or langdetect?
         body=body,
@@ -228,7 +228,7 @@ def es_fulltext_from_grobid(
         return None
     body = tei_dict.get("body")
     if body and len(body) > MAX_BODY_CHARS:
-        body = body[MAX_BODY_CHARS:]
+        body = body[:MAX_BODY_CHARS]
     ret = ScholarFulltext(
         lang_code=tei_dict.get("lang"),
         body=body,
@@ -243,7 +243,7 @@ def es_fulltext_from_pdftotext(
 ) -> Optional[ScholarFulltext]:
 
     if raw_text and len(raw_text) > MAX_BODY_CHARS:
-        raw_text = raw_text[MAX_BODY_CHARS:]
+        raw_text = raw_text[:MAX_BODY_CHARS]
     ret = ScholarFulltext(
         lang_code=re.language, body=raw_text, acknowledgement=None, annex=None,
     )
@@ -263,7 +263,7 @@ def es_fulltext_from_html(
     if body:
         raw_text = " ".join(body.itertext())
         if raw_text and len(raw_text) > MAX_BODY_CHARS:
-            raw_text = raw_text[MAX_BODY_CHARS:]
+            raw_text = raw_text[:MAX_BODY_CHARS]
     else:
         return None
 