author    | Bryan Newbold <bnewbold@archive.org> | 2018-09-12 15:32:17 -0700
committer | Bryan Newbold <bnewbold@archive.org> | 2018-09-12 15:32:17 -0700
commit    | 31537f21333cda37458cfc88331feaecbd1d72c8 (patch)
tree      | d73eab7bb144f8c98e8a5924ef5d89f9405a7035
parent    | e5db9bc783c1157984c160d155c44d38b84b57ce (diff)
download  | sandcrawler-31537f21333cda37458cfc88331feaecbd1d72c8.tar.gz
          | sandcrawler-31537f21333cda37458cfc88331feaecbd1d72c8.zip
insertable flag for match-crossref
-rwxr-xr-x | please | 10
1 file changed, 9 insertions, 1 deletion
@@ -176,6 +176,10 @@ def run_matchcrossref(args):
         HDFS_DIR,
         args.env,
         datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+    if args.fatcat_insertable:
+        jobclass = "ScoreInsertableJob"
+    else:
+        jobclass = "ScoreJob"
     # Notes: -D options must come after Tool but before class name
     # https://github.com/twitter/scalding/wiki/Frequently-asked-questions#how-do-i-pass-parameters-to-my-hadoop-job-number-of-reducers--memory-options--etc-
     # Compression: changed due to errors in production
@@ -187,7 +191,7 @@ def run_matchcrossref(args):
         -Dcascading.spill.list.threshold=500000 \
         -D mapred.output.compress=false \
         -Dmapred.compress.map.output=true\
-        sandcrawler.ScoreJob \
+        sandcrawler.{jobclass} \
         --hdfs \
         --app.conf.path scalding/ia_cluster.conf \
         --hbase-table wbgrp-journal-extract-0-{env} \
@@ -195,6 +199,7 @@ def run_matchcrossref(args):
         --crossref-input {crossref_input} \
         --output {output}""".format(
             output=output,
+            jobclass=jobclass,
             zookeeper_hosts=ZOOKEEPER_HOSTS,
             env=args.env,
             reducers=args.reducers,
@@ -354,6 +359,9 @@ def main():
     sub_matchcrossref.add_argument('--reducers',
         help="number of reducers to run",
         type=int, default=200)
+    sub_matchcrossref.add_argument('--fatcat-insertable',
+        help="whether to include CDX and other metadata in output",
+        action='store_true')
 
     sub_grobidscorabledump = subparsers.add_parser('grobid-scorable-dump')
     sub_grobidscorabledump.set_defaults(func=run_grobidscorabledump)
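For context, a rough sketch of how the new flag might be invoked through the `please` wrapper (assumptions: the subcommand is spelled `match-crossref` per the commit subject, and `--env` and `--crossref-input` are the wrapper's argument names behind `args.env` and `args.crossref_input`; the values shown are illustrative, not from this commit):

    # illustrative invocation; subcommand and argument spellings are assumptions
    ./please match-crossref \
        --env prod \
        --fatcat-insertable \
        --reducers 200 \
        --crossref-input /path/to/crossref-works.json

With `--fatcat-insertable` set, the wrapper submits `sandcrawler.ScoreInsertableJob` in place of `sandcrawler.ScoreJob`, which per the new argparse help text includes CDX and other metadata in the output.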