From 04e1ae4f903af98ef174be9110aaae5e1ab81360 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 4 Oct 2019 17:59:42 -0700
Subject: we do actually want consolidateHeader=2, not 1

---
 python/ingest_file.py        | 2 +-
 python/sandcrawler/grobid.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/ingest_file.py b/python/ingest_file.py
index 0699a0c..4daa472 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -217,7 +217,7 @@ def file_metadata(blob):
 def do_grobid(sha1hex, blob):
     grobid_response = requests.post(
         GROBID_ENDPOINT + "/api/processFulltextDocument",
-        files={'input': blob, 'consolidateHeader': '1'},
+        files={'input': blob, 'consolidateHeader': '2'},
     )
 
     info = dict(
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 48ff6f8..f157241 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -10,7 +10,7 @@ class GrobidClient(object):
 
     def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
         self.host_url = host_url
-        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))
+        self.consolidate_mode = int(kwargs.get('consolidate_mode', 2))
 
     def process_fulltext(self, blob, consolidate_mode=None):
         """
@@ -56,7 +56,7 @@ class GrobidWorker(SandcrawlerWorker):
         self.grobid_client = grobid_client
         self.wayback_client = wayback_client
         self.sink = sink
-        self.consolidate_mode = 1
+        self.consolidate_mode = 2
 
     def process(self, record):
         if record.get('warc_path') and record.get('warc_offset'):
@@ -105,7 +105,7 @@ class GrobidBlobWorker(SandcrawlerWorker):
         super().__init__()
         self.grobid_client = grobid_client
         self.sink = sink
-        self.consolidate_mode = 1
+        self.consolidate_mode = 2
 
     def process(self, blob):
         assert blob
-- 
cgit v1.2.3