diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-03-30 22:53:03 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-03-30 22:53:03 -0700 |
commit | 31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5 (patch) | |
tree | 995bb427a75d0e62c6796dc026c92e63ed410389 | |
parent | bb38ea065758a719331803b4adf875f2d75a1c3d (diff) | |
download | sandcrawler-31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5.tar.gz sandcrawler-31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5.zip |
backfill: sha1 prefix, cluster example
-rw-r--r-- | backfill/README.md | 8 | ||||
-rwxr-xr-x | backfill/backfill_hbase_from_cdx.py | 8 | ||||
-rw-r--r-- | backfill/tests/test_backfill_hbase_from_cdx.py | 11 |
3 files changed, 19 insertions, 8 deletions
```diff
diff --git a/backfill/README.md b/backfill/README.md
index 90b4ba7..6af8f33 100644
--- a/backfill/README.md
+++ b/backfill/README.md
@@ -3,6 +3,10 @@ Run tests:
 
     pipenv run python -m pytest
 
-Run locally on a file:
+An example actually connecting to HBase from a local machine, with thrift
+running on a devbox:
+
+    ./backfill_hbase_from_cdx.py tests/files/example.cdx \
+        --hbase-table wbgrp-journal-extract-0-qa \
+        --hbase-host bnewbold-dev.us.archive.org
 
-    ./backfill_hbase_from_cdx.py tests/files/example.cdx
diff --git a/backfill/backfill_hbase_from_cdx.py b/backfill/backfill_hbase_from_cdx.py
index d14dd92..8008761 100755
--- a/backfill/backfill_hbase_from_cdx.py
+++ b/backfill/backfill_hbase_from_cdx.py
@@ -74,6 +74,8 @@ def transform_line(raw_cdx):
     if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
         return None
 
+    key = "sha1:{}".format(key)
+
     info = dict(surt=surt, dt=dt, url=url, c_size=c_size, offset=offset,
         warc=warc)
 
@@ -92,7 +94,7 @@ def test_transform_line():
 
     raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
     correct = {
-        'key': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+        'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
         'file:mime': "application/pdf",
         'file:cdx': {
             'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
@@ -150,7 +152,9 @@ class MRCDXBackfillHBase(MRJob):
         if self.hb_table is None:
             try:
                 host = self.options.hbase_host
-                hb_conn = happybase.Connection(host=host)
+                # TODO: make these configs accessible from... mrconf.cfg?
+                hb_conn = happybase.Connection(host=host, transport="framed",
+                    protocol="compact")
             except Exception as err:
                 raise Exception("Couldn't connect to HBase using host: {}".format(host))
             self.hb_table = hb_conn.table(self.options.hbase_table)
diff --git a/backfill/tests/test_backfill_hbase_from_cdx.py b/backfill/tests/test_backfill_hbase_from_cdx.py
index d8277be..9af5b05 100644
--- a/backfill/tests/test_backfill_hbase_from_cdx.py
+++ b/backfill/tests/test_backfill_hbase_from_cdx.py
@@ -1,3 +1,6 @@
+"""
+TODO: could probably refactor to use unittest.mock.patch('happybase')
+"""
 
 import io
 import json
@@ -33,13 +36,13 @@ com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robo
 
     assert job.hb_table.row(b'1') == {}
     # HTTP 301
-    assert job.hb_table.row(b'3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
+    assert job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
     # valid
-    assert job.hb_table.row(b'MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J') != {}
+    assert job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J') != {}
     # text/plain
-    assert job.hb_table.row(b'6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD') == {}
+    assert job.hb_table.row(b'sha1:6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD') == {}
 
-    row = job.hb_table.row(b'MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J')
+    row = job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J')
     assert row[b'file:mime'] == b"application/pdf"
 
     file_cdx = json.loads(row[b'file:cdx'].decode('utf-8'))
```