diff options
author | bnewbold <bnewbold@archive.org> | 2021-10-04 20:05:21 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2021-10-04 20:05:21 +0000 |
commit | 57f879c00b00c6cd4051f54662fea3f96f80ad35 (patch) | |
tree | 8dc306d29ea8778fc4553d0ea8ff6e0d6b1b6fbb /please | |
parent | 96033132be8976f0c9483a18dfe4a58bf94b0011 (diff) | |
parent | d71cc4e6cd7381f5f0596af1ce33c1bc744c8644 (diff) | |
download | sandcrawler-57f879c00b00c6cd4051f54662fea3f96f80ad35.tar.gz sandcrawler-57f879c00b00c6cd4051f54662fea3f96f80ad35.zip |
Merge branch 'bnewbold-backfill' into 'master'
CDX Backfill (scalding version)
See merge request webgroup/sandcrawler!12
Diffstat (limited to 'please')
-rwxr-xr-x | please | 22 |
1 files changed, 22 insertions, 0 deletions
@@ -487,6 +487,23 @@ def run_dumpungrobided(args):
         env=args.env)
     subprocess.call(cmd, shell=True)
 
+def run_sbackfill(args):
+    if args.rebuild:
+        rebuild_scalding()
+    print("Starting scalding backfill job...")
+    cmd = """hadoop jar \
+        scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+        com.twitter.scalding.Tool sandcrawler.CdxBackfillJob \
+        --hdfs \
+        --app.conf.path scalding/ia_cluster.conf \
+        --hbase-table wbgrp-journal-extract-0-{env} \
+        --zookeeper-hosts {zookeeper_hosts} \
+        --cdx-input-path {input_cdx}""".format(
+            input_cdx=args.input_cdx,
+            zookeeper_hosts=ZOOKEEPER_HOSTS,
+            env=args.env)
+    subprocess.call(cmd, shell=True)
+
 def main():
     parser = argparse.ArgumentParser()
 
@@ -506,6 +523,11 @@ def main():
     sub_backfill.add_argument('input_cdx',
         help="full HDFS path of CDX file to backfill")
 
+    sub_sbackfill = subparsers.add_parser('sbackfill')
+    sub_sbackfill.set_defaults(func=run_sbackfill)
+    sub_sbackfill.add_argument('input_cdx',
+        help="full HDFS path of CDX file to backfill")
+
     sub_extract = subparsers.add_parser('extract')
     sub_extract.set_defaults(func=run_extract)
     sub_extract.add_argument('input_cdx',