From a0e275a4bad46ef41585f0207d6dfa1e3c38bc35 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 27 Oct 2021 19:10:35 -0700 Subject: remove grobid2json helper file, replace with grobid_tei_xml --- python/sandcrawler/grobid.py | 7 ++++--- python/sandcrawler/persist.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 26918f6..37c4ea1 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -1,8 +1,7 @@ from typing import Any, Dict, Optional import requests - -from grobid2json import teixml2json +from grobid_tei_xml import parse_document_xml from .ia import WaybackClient from .misc import gen_file_metadata @@ -71,7 +70,9 @@ class GrobidClient(object): def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]: if result["status"] != "success": return None - tei_json = teixml2json(result["tei_xml"], encumbered=False) + tei_doc = parse_document_xml(result["tei_xml"]) + tei_doc.remove_encumbered() + tei_json = tei_doc.to_legacy_dict() meta = dict() biblio = dict() for k in ( diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index c8c0c33..f50b9d1 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -395,7 +395,7 @@ class PersistGrobidWorker(SandcrawlerWorker): ) self.counts["s3-put"] += 1 - # enhance with teixml2json metadata, if available + # enhance with GROBID TEI-XML metadata, if available try: metadata = self.grobid.metadata(r) except xml.etree.ElementTree.ParseError as xml_e: -- cgit v1.2.3