about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-27 19:10:35 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-27 19:10:35 -0700
commit a0e275a4bad46ef41585f0207d6dfa1e3c38bc35 (patch)
tree 92dead8a85e6ff38808beefada8a42693261ceff /python/sandcrawler
parent 40adf5ed09d917b8a4b8f75680bbf90c147848b3 (diff)
download sandcrawler-a0e275a4bad46ef41585f0207d6dfa1e3c38bc35.tar.gz
sandcrawler-a0e275a4bad46ef41585f0207d6dfa1e3c38bc35.zip
remove grobid2json helper file, replace with grobid_tei_xml
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/grobid.py 7
-rw-r--r-- python/sandcrawler/persist.py 2
2 files changed, 5 insertions, 4 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 26918f6..37c4ea1 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,8 +1,7 @@
from typing import Any, Dict, Optional
import requests
-
-from grobid2json import teixml2json
+from grobid_tei_xml import parse_document_xml
from .ia import WaybackClient
from .misc import gen_file_metadata
@@ -71,7 +70,9 @@ class GrobidClient(object):
def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
if result["status"] != "success":
return None
- tei_json = teixml2json(result["tei_xml"], encumbered=False)
+ tei_doc = parse_document_xml(result["tei_xml"])
+ tei_doc.remove_encumbered()
+ tei_json = tei_doc.to_legacy_dict()
meta = dict()
biblio = dict()
for k in (
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index c8c0c33..f50b9d1 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -395,7 +395,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
)
self.counts["s3-put"] += 1
- # enhance with teixml2json metadata, if available
+ # enhance with GROBID TEI-XML metadata, if available
try:
metadata = self.grobid.metadata(r)
except xml.etree.ElementTree.ParseError as xml_e: