aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-10-04 17:59:42 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-10-04 17:59:42 -0700
commit 04e1ae4f903af98ef174be9110aaae5e1ab81360 (patch)
tree 63ec7652c08a48910d2ad8f5e04ef1ab8ad02671 /python
parent 84a501f86dc9a6a2e25e58c7380575ed946c9357 (diff)
download sandcrawler-04e1ae4f903af98ef174be9110aaae5e1ab81360.tar.gz
sandcrawler-04e1ae4f903af98ef174be9110aaae5e1ab81360.zip
we do actually want consolidateHeader=2, not 1
Diffstat (limited to 'python')
-rwxr-xr-x python/ingest_file.py 2
-rw-r--r-- python/sandcrawler/grobid.py 6
2 files changed, 4 insertions, 4 deletions
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 0699a0c..4daa472 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -217,7 +217,7 @@ def file_metadata(blob):
def do_grobid(sha1hex, blob):
grobid_response = requests.post(
GROBID_ENDPOINT + "/api/processFulltextDocument",
- files={'input': blob, 'consolidateHeader': '1'},
+ files={'input': blob, 'consolidateHeader': '2'},
)
info = dict(
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 48ff6f8..f157241 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -10,7 +10,7 @@ class GrobidClient(object):
def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
- self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))
+ self.consolidate_mode = int(kwargs.get('consolidate_mode', 2))
def process_fulltext(self, blob, consolidate_mode=None):
"""
@@ -56,7 +56,7 @@ class GrobidWorker(SandcrawlerWorker):
self.grobid_client = grobid_client
self.wayback_client = wayback_client
self.sink = sink
- self.consolidate_mode = 1
+ self.consolidate_mode = 2
def process(self, record):
if record.get('warc_path') and record.get('warc_offset'):
@@ -105,7 +105,7 @@ class GrobidBlobWorker(SandcrawlerWorker):
super().__init__()
self.grobid_client = grobid_client
self.sink = sink
- self.consolidate_mode = 1
+ self.consolidate_mode = 2
def process(self, blob):
assert blob