aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-03-30 22:53:03 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-03-30 22:53:03 -0700
commit 31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5 (patch)
tree 995bb427a75d0e62c6796dc026c92e63ed410389
parent bb38ea065758a719331803b4adf875f2d75a1c3d (diff)
download sandcrawler-31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5.tar.gz
download sandcrawler-31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5.zip
backfill: sha1 prefix, cluster example
-rw-r--r-- backfill/README.md 8
-rwxr-xr-x backfill/backfill_hbase_from_cdx.py 8
-rw-r--r-- backfill/tests/test_backfill_hbase_from_cdx.py 11
3 files changed, 19 insertions, 8 deletions
diff --git a/backfill/README.md b/backfill/README.md
index 90b4ba7..6af8f33 100644
--- a/backfill/README.md
+++ b/backfill/README.md
@@ -3,6 +3,10 @@ Run tests:
pipenv run python -m pytest
-Run locally on a file:
+An example actually connecting to HBase from a local machine, with thrift
+running on a devbox:
+
+ ./backfill_hbase_from_cdx.py tests/files/example.cdx \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ --hbase-host bnewbold-dev.us.archive.org
- ./backfill_hbase_from_cdx.py tests/files/example.cdx
diff --git a/backfill/backfill_hbase_from_cdx.py b/backfill/backfill_hbase_from_cdx.py
index d14dd92..8008761 100755
--- a/backfill/backfill_hbase_from_cdx.py
+++ b/backfill/backfill_hbase_from_cdx.py
@@ -74,6 +74,8 @@ def transform_line(raw_cdx):
if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
return None
+ key = "sha1:{}".format(key)
+
info = dict(surt=surt, dt=dt, url=url, c_size=c_size, offset=offset,
warc=warc)
@@ -92,7 +94,7 @@ def test_transform_line():
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
correct = {
- 'key': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ 'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
'file:mime': "application/pdf",
'file:cdx': {
'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
@@ -150,7 +152,9 @@ class MRCDXBackfillHBase(MRJob):
if self.hb_table is None:
try:
host = self.options.hbase_host
- hb_conn = happybase.Connection(host=host)
+ # TODO: make these configs accessible from... mrconf.cfg?
+ hb_conn = happybase.Connection(host=host, transport="framed",
+ protocol="compact")
except Exception as err:
raise Exception("Couldn't connect to HBase using host: {}".format(host))
self.hb_table = hb_conn.table(self.options.hbase_table)
diff --git a/backfill/tests/test_backfill_hbase_from_cdx.py b/backfill/tests/test_backfill_hbase_from_cdx.py
index d8277be..9af5b05 100644
--- a/backfill/tests/test_backfill_hbase_from_cdx.py
+++ b/backfill/tests/test_backfill_hbase_from_cdx.py
@@ -1,3 +1,6 @@
+"""
+TODO: could probably refactor to use unittest.mock.patch('happybase')
+"""
import io
import json
@@ -33,13 +36,13 @@ com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robo
assert job.hb_table.row(b'1') == {}
# HTTP 301
- assert job.hb_table.row(b'3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
+ assert job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
# valid
- assert job.hb_table.row(b'MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J') != {}
+ assert job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J') != {}
# text/plain
- assert job.hb_table.row(b'6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD') == {}
+ assert job.hb_table.row(b'sha1:6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD') == {}
- row = job.hb_table.row(b'MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J')
+ row = job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J')
assert row[b'file:mime'] == b"application/pdf"
file_cdx = json.loads(row[b'file:cdx'].decode('utf-8'))