diff options
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
| -rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 28 | 
1 file changed, 12 insertions, 16 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index 0ba95e6..a4a13f8 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -53,15 +53,9 @@ class MRExtractCdxGrobid(MRJob):                                help='URI of GROBID API Server')      def __init__(self, *args, **kwargs): - -        # Allow passthrough for tests -        if 'hb_table' in kwargs: -            self.hb_table = kwargs.pop('hb_table') -        else: -            self.hb_table = None -          super(MRExtractCdxGrobid, self).__init__(*args, **kwargs)          self.mime_filter = ['application/pdf'] +        self.hb_table = None      def grobid_process_fulltext(self, content):          r = requests.post(self.options.grobid_uri + "/api/processFulltextDocument", @@ -73,15 +67,17 @@ class MRExtractCdxGrobid(MRJob):      def mapper_init(self): -        if self.hb_table is None: -            try: -                host = self.options.hbase_host -                # TODO: make these configs accessible from... mrconf.cfg? -                hb_conn = happybase.Connection(host=host, transport="framed", -                    protocol="compact") -            except Exception as err: -                raise Exception("Couldn't connect to HBase using host: {}".format(host)) -            self.hb_table = hb_conn.table(self.options.hbase_table) +        if self.hb_table: +            return + +        try: +            host = self.options.hbase_host +            # TODO: make these configs accessible from... mrconf.cfg? +            hb_conn = happybase.Connection(host=host, transport="framed", +                protocol="compact") +        except Exception as err: +            raise Exception("Couldn't connect to HBase using host: {}".format(host)) +        self.hb_table = hb_conn.table(self.options.hbase_table)      def parse_line(self, raw_cdx):  | 
