From 055c51a6c4f3a920c4a7eecb5093ffa6e4b64f72 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 4 Oct 2019 15:23:24 -0700
Subject: disable citation consolidation by default

with this consolidation enabled, the glutton_fatcat elasticsearch server
was totally pegged over 90% CPU with only 10 PDF worker threads; the
glutton load seemed to be the bottleneck even for this low degree of
parallelism. Disabled for now, will debug with GROBID/glutton folks.
---
 python/sandcrawler/grobid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 78ffa0a..98ae6ae 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -32,7 +32,7 @@ class GrobidClient(object):
             files={
                 'input': blob,
                 'consolidateHeaders': self.consolidate_mode,
-                'consolidateCitations': self.consolidate_mode,
+                'consolidateCitations': 0, # too expensive for now
                 'includeRawCitations': 1,
             }
         )
-- 
cgit v1.2.3