From 5a1d53e8705d5ea59ea0c007f8a53940a353000b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 30 Jun 2021 18:58:20 -0700 Subject: HACK: don't parse TEI-XML for a specific paper/file GROBID v0.5.5 returns TEI-XML for this one PDF which is not valid XML, due to a text encoding issue. --- fatcat_scholar/transform.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fatcat_scholar') diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 6904770..388f2f5 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -481,7 +481,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: else: raise NotImplementedError(f"doc_type: {heavy.doc_type}") - if heavy.grobid_fulltext: + # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+ + if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq': fulltext_release = [ r for r in heavy.releases @@ -792,7 +793,8 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]: break fulltext_refs: List[RefStructured] = [] - if heavy.grobid_fulltext: + # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+ + if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq': fulltext_release = [ r for r in heavy.releases -- cgit v1.2.3