diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-02-15 21:53:38 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-02-15 21:53:40 -0800 |
commit | cd3e05ac8dc98d87d50c67c28968fa228ea2d016 (patch) | |
tree | f48df3ee35b9b4f1c5ced5517722c558ea54c67d /fatcat_scholar | |
parent | 55bd186a21f5e3703e8f3ba3b0a14f1387ed0ccc (diff) | |
download | fatcat-scholar-cd3e05ac8dc98d87d50c67c28968fa228ea2d016.tar.gz fatcat-scholar-cd3e05ac8dc98d87d50c67c28968fa228ea2d016.zip |
truncate indexed fulltext body at 1 MByte
There was a large ~4 MByte document getting indexed
(work_lumgqw4vqbgvha2ejbsbaepedq) with several megabytes of text, and
this was causing elasticsearch indexing timeouts.
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/transform.py | 15 |
1 files changed, 13 insertions, 2 deletions
```diff
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index cbf0a89..9088b16 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -13,15 +13,19 @@ from fatcat_scholar.schema import *
 from fatcat_scholar.config import settings, GIT_REVISION
 from fatcat_scholar.grobid2json import teixml2json
 
+MAX_BODY_CHARS = 1024*1024
 
 def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]:
     if not sim["page_texts"]:
         return None
     first_page = sim["page_texts"][0]["page_num"]
     issue_item = sim["issue_item"]
+    body="\n".join([p["raw_text"] for p in sim["page_texts"]])
+    if body and len(body) > MAX_BODY_CHARS:
+        body = body[MAX_BODY_CHARS:]
     return ScholarFulltext(
         lang_code=None,  # TODO: pub/issue metadata? or langdetect?
-        body="\n".join([p["raw_text"] for p in sim["page_texts"]]),
+        body=body,
         # acknowledgement=None,
         # annex=None,
         release_ident=sim.get("release_ident"),
@@ -221,9 +225,12 @@ def es_fulltext_from_grobid(
 ) -> Optional[ScholarFulltext]:
     if not tei_dict.get("body"):
         return None
+    body = tei_dict.get("body")
+    if body and len(body) > MAX_BODY_CHARS:
+        body = body[MAX_BODY_CHARS:]
     ret = ScholarFulltext(
         lang_code=tei_dict.get("lang"),
-        body=tei_dict.get("body"),
+        body=body,
         acknowledgement=tei_dict.get("acknowledgement"),
         annex=tei_dict.get("annex"),
     )
@@ -234,6 +241,8 @@ def es_fulltext_from_pdftotext(
     raw_text: str, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
 ) -> Optional[ScholarFulltext]:
 
+    if raw_text and len(raw_text) > MAX_BODY_CHARS:
+        raw_text = raw_text[MAX_BODY_CHARS:]
     ret = ScholarFulltext(
         lang_code=re.language, body=raw_text, acknowledgement=None, annex=None,
     )
@@ -252,6 +261,8 @@ def es_fulltext_from_html(
     body = tree.find(".//tei:body", ns)
     if body:
         raw_text = " ".join(body.itertext())
+        if raw_text and len(raw_text) > MAX_BODY_CHARS:
+            raw_text = raw_text[MAX_BODY_CHARS:]
     else:
         return None
 
```