about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-11-10 14:41:37 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-11-10 14:41:37 -0800
commit 1c392ccaec8d6439f2691aeda12be4d2953a3f5e (patch)
tree 9e6da5ddd6ac32e3c27d279d8b470fad214c3f4d
parent 78485352c493c1ccd952de2143e29ae28913ee50 (diff)
download sandcrawler-1c392ccaec8d6439f2691aeda12be4d2953a3f5e.tar.gz
sandcrawler-1c392ccaec8d6439f2691aeda12be4d2953a3f5e.zip
grobid: extract more metadata in document TEI-XML
-rw-r--r-- python/sandcrawler/grobid.py 5
1 files changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f221830..791e0fe 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -95,9 +95,14 @@ class GrobidClient(object):
self.host_url + "/api/processFulltextDocument",
files={
"input": blob,
+ },
+ data={
"consolidateHeader": consolidate_mode,
"consolidateCitations": 0, # too expensive for now
"includeRawCitations": 1,
+ "includeRawAffiliations": 1,
+ "teiCoordinates": ["ref", "figure", "persName", "formula", "biblStruct"],
+ "segmentSentences": 1,
},
timeout=180.0,
)