aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-10-04 15:23:24 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-10-04 15:23:26 -0700
commit: 055c51a6c4f3a920c4a7eecb5093ffa6e4b64f72 (patch)
tree: a493781d308244836f24b1d085923c743f91fbf9
parent: ab1f3fe806e40122a93c975d2253f2c14035952e (diff)
download: sandcrawler-055c51a6c4f3a920c4a7eecb5093ffa6e4b64f72.tar.gz
sandcrawler-055c51a6c4f3a920c4a7eecb5093ffa6e4b64f72.zip
disable citation consolidation by default
With this consolidation enabled, the glutton_fatcat Elasticsearch server was totally pegged at over 90% CPU with only 10 PDF worker threads; the glutton load seemed to be the bottleneck even at this low degree of parallelism. Disabled for now; will debug with the GROBID/glutton folks.
-rw-r--r-- python/sandcrawler/grobid.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 78ffa0a..98ae6ae 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -32,7 +32,7 @@ class GrobidClient(object):
files={
'input': blob,
'consolidateHeaders': self.consolidate_mode,
- 'consolidateCitations': self.consolidate_mode,
+ 'consolidateCitations': 0, # too expensive for now
'includeRawCitations': 1,
}
)