diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-10 14:41:37 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-10 14:41:37 -0800 |
commit | 1c392ccaec8d6439f2691aeda12be4d2953a3f5e (patch) | |
tree | 9e6da5ddd6ac32e3c27d279d8b470fad214c3f4d | |
parent | 78485352c493c1ccd952de2143e29ae28913ee50 (diff) | |
download | sandcrawler-1c392ccaec8d6439f2691aeda12be4d2953a3f5e.tar.gz sandcrawler-1c392ccaec8d6439f2691aeda12be4d2953a3f5e.zip |
grobid: extract more metadata in document TEI-XML
-rw-r--r-- | python/sandcrawler/grobid.py | 5 |
1 file changed, 5 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f221830..791e0fe 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -95,9 +95,14 @@ class GrobidClient(object):
                 self.host_url + "/api/processFulltextDocument",
                 files={
                     "input": blob,
+                },
+                data={
                     "consolidateHeader": consolidate_mode,
                     "consolidateCitations": 0,  # too expensive for now
                     "includeRawCitations": 1,
+                    "includeRawAffiliations": 1,
+                    "teiCoordinates": ["ref", "figure", "persName", "formula", "biblStruct"],
+                    "segmentSentences": 1,
                 },
                 timeout=180.0,
             )
```