about summary refs log tree commit diff stats
path: root/please
diff options
context:
space:
mode:
author: bnewbold <bnewbold@archive.org> 2021-10-04 20:05:21 +0000
committer: bnewbold <bnewbold@archive.org> 2021-10-04 20:05:21 +0000
commit: 57f879c00b00c6cd4051f54662fea3f96f80ad35 (patch)
tree: 8dc306d29ea8778fc4553d0ea8ff6e0d6b1b6fbb /please
parent: 96033132be8976f0c9483a18dfe4a58bf94b0011 (diff)
parent: d71cc4e6cd7381f5f0596af1ce33c1bc744c8644 (diff)
download: sandcrawler-57f879c00b00c6cd4051f54662fea3f96f80ad35.tar.gz
sandcrawler-57f879c00b00c6cd4051f54662fea3f96f80ad35.zip
Merge branch 'bnewbold-backfill' into 'master'
CDX Backfill (scalding version) See merge request webgroup/sandcrawler!12
Diffstat (limited to 'please')
-rwxr-xr-x please 22
1 files changed, 22 insertions, 0 deletions
diff --git a/please b/please
index 4800112..298a1c5 100755
--- a/please
+++ b/please
@@ -487,6 +487,23 @@ def run_dumpungrobided(args):
env=args.env)
subprocess.call(cmd, shell=True)
+def run_sbackfill(args):  # Launch the scalding CdxBackfillJob via "hadoop jar"; reads args.rebuild, args.input_cdx and args.env
+ if args.rebuild:  # optional rebuild step before the job is launched
+ rebuild_scalding()  # helper defined outside this hunk
+ print("Starting scalding backfill job...")
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.CdxBackfillJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --cdx-input-path {input_cdx}""".format(  # fills the {env}, {zookeeper_hosts} and {input_cdx} placeholders
+ input_cdx=args.input_cdx,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,  # module-level name, not shown in this hunk
+ env=args.env)
+ subprocess.call(cmd, shell=True)  # runs through the shell; interpolated values are not quoted and the exit status is not checked
+
def main():
parser = argparse.ArgumentParser()
@@ -506,6 +523,11 @@ def main():
sub_backfill.add_argument('input_cdx',
help="full HDFS path of CDX file to backfill")
+ sub_sbackfill = subparsers.add_parser('sbackfill')
+ sub_sbackfill.set_defaults(func=run_sbackfill)
+ sub_sbackfill.add_argument('input_cdx',
+ help="full HDFS path of CDX file to backfill")
+
sub_extract = subparsers.add_parser('extract')
sub_extract.set_defaults(func=run_extract)
sub_extract.add_argument('input_cdx',