diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-03-30 22:53:03 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-03-30 22:53:03 -0700 |
commit | 31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5 (patch) | |
tree | 995bb427a75d0e62c6796dc026c92e63ed410389 /backfill/backfill_hbase_from_cdx.py | |
parent | bb38ea065758a719331803b4adf875f2d75a1c3d (diff) | |
download | sandcrawler-31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5.tar.gz sandcrawler-31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5.zip |
backfill: sha1 prefix, cluster example
Diffstat (limited to 'backfill/backfill_hbase_from_cdx.py')
-rwxr-xr-x | backfill/backfill_hbase_from_cdx.py | 8 |
1 file changed, 6 insertions, 2 deletions
diff --git a/backfill/backfill_hbase_from_cdx.py b/backfill/backfill_hbase_from_cdx.py
index d14dd92..8008761 100755
--- a/backfill/backfill_hbase_from_cdx.py
+++ b/backfill/backfill_hbase_from_cdx.py
@@ -74,6 +74,8 @@ def transform_line(raw_cdx):
     if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
         return None
 
+    key = "sha1:{}".format(key)
+
     info = dict(surt=surt, dt=dt, url=url, c_size=c_size,
         offset=offset, warc=warc)
 
@@ -92,7 +94,7 @@ def test_transform_line():
 
     raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
     correct = {
-        'key': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+        'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
         'file:mime': "application/pdf",
         'file:cdx': {
             'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
@@ -150,7 +152,9 @@ class MRCDXBackfillHBase(MRJob):
         if self.hb_table is None:
             try:
                 host = self.options.hbase_host
-                hb_conn = happybase.Connection(host=host)
+                # TODO: make these configs accessible from... mrconf.cfg?
+                hb_conn = happybase.Connection(host=host, transport="framed",
+                    protocol="compact")
             except Exception as err:
                 raise Exception("Couldn't connect to HBase using host: {}".format(host))
             self.hb_table = hb_conn.table(self.options.hbase_table)