summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-06-30 18:58:20 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-06-30 18:58:22 -0700
commit 5a1d53e8705d5ea59ea0c007f8a53940a353000b (patch)
tree f2dd1f9997e8569ecda641b43466f73fad4aaedf
parent 8a34f3747a6881eab66775388a9b6d7878c00a6c (diff)
download fatcat-scholar-5a1d53e8705d5ea59ea0c007f8a53940a353000b.tar.gz
fatcat-scholar-5a1d53e8705d5ea59ea0c007f8a53940a353000b.zip
HACK: don't parse TEI-XML for a specific paper/file
For this one PDF, GROBID v0.5.5 returns TEI-XML output that is not valid XML, due to a text encoding issue.
-rw-r--r-- fatcat_scholar/transform.py 6
1 file changed, 4 insertions, 2 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 6904770..388f2f5 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -481,7 +481,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
else:
raise NotImplementedError(f"doc_type: {heavy.doc_type}")
- if heavy.grobid_fulltext:
+ # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
+ if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq':
fulltext_release = [
r
for r in heavy.releases
@@ -792,7 +793,8 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
break
fulltext_refs: List[RefStructured] = []
- if heavy.grobid_fulltext:
+ # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
+ if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq':
fulltext_release = [
r
for r in heavy.releases