From 1c392ccaec8d6439f2691aeda12be4d2953a3f5e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 10 Nov 2021 14:41:37 -0800
Subject: grobid: extract more metadata in document TEI-XML

---
 python/sandcrawler/grobid.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f221830..791e0fe 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -95,9 +95,14 @@ class GrobidClient(object):
                 self.host_url + "/api/processFulltextDocument",
                 files={
                     "input": blob,
+                },
+                data={
                     "consolidateHeader": consolidate_mode,
                     "consolidateCitations": 0, # too expensive for now
                     "includeRawCitations": 1,
+                    "includeRawAffiliations": 1,
+                    "teiCoordinates": ["ref", "figure", "persName", "formula", "biblStruct"],
+                    "segmentSentences": 1,
                 },
                 timeout=180.0,
             )
-- 
cgit v1.2.3