diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 17:59:42 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 17:59:42 -0700 | 
| commit | 04e1ae4f903af98ef174be9110aaae5e1ab81360 (patch) | |
| tree | 63ec7652c08a48910d2ad8f5e04ef1ab8ad02671 | |
| parent | 84a501f86dc9a6a2e25e58c7380575ed946c9357 (diff) | |
| download | sandcrawler-04e1ae4f903af98ef174be9110aaae5e1ab81360.tar.gz sandcrawler-04e1ae4f903af98ef174be9110aaae5e1ab81360.zip | |
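We do actually want GROBID's consolidateHeader=2, not 1 (changes the value sent by python/ingest_file.py and the default consolidate_mode in python/sandcrawler/grobid.py).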
| -rwxr-xr-x | python/ingest_file.py | 2 | ||||
| -rw-r--r-- | python/sandcrawler/grobid.py | 6 | 
2 files changed, 4 insertions, 4 deletions
| diff --git a/python/ingest_file.py b/python/ingest_file.py index 0699a0c..4daa472 100755 --- a/python/ingest_file.py +++ b/python/ingest_file.py @@ -217,7 +217,7 @@ def file_metadata(blob):  def do_grobid(sha1hex, blob):      grobid_response = requests.post(          GROBID_ENDPOINT + "/api/processFulltextDocument", -        files={'input': blob, 'consolidateHeader': '1'}, +        files={'input': blob, 'consolidateHeader': '2'},      )      info = dict( diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 48ff6f8..f157241 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -10,7 +10,7 @@ class GrobidClient(object):      def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):          self.host_url = host_url -        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1)) +        self.consolidate_mode = int(kwargs.get('consolidate_mode', 2))      def process_fulltext(self, blob, consolidate_mode=None):          """ @@ -56,7 +56,7 @@ class GrobidWorker(SandcrawlerWorker):          self.grobid_client = grobid_client          self.wayback_client = wayback_client          self.sink = sink -        self.consolidate_mode = 1 +        self.consolidate_mode = 2      def process(self, record):          if record.get('warc_path') and record.get('warc_offset'): @@ -105,7 +105,7 @@ class GrobidBlobWorker(SandcrawlerWorker):          super().__init__()          self.grobid_client = grobid_client          self.sink = sink -        self.consolidate_mode = 1 +        self.consolidate_mode = 2      def process(self, blob):          assert blob | 
