about | summary | refs | log | tree | commit | diff | stats
path: root/please
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-08-26 05:15:21 +0000
committer: Bryan Newbold <bnewbold@archive.org> 2018-08-26 05:15:21 +0000
commit: a71d556763b4031bfa0e56abc72348d7f1d3d966 (patch)
tree: cb7750cfb29197bcb9c6b67afe1fc04ed7d6a12b /please
parent: 45cc6f57f8c487f53a2946922acbc3519c0e25ee (diff)
download: sandcrawler-a71d556763b4031bfa0e56abc72348d7f1d3d966.tar.gz
sandcrawler-a71d556763b4031bfa0e56abc72348d7f1d3d966.zip
please: save extraction output
Diffstat (limited to 'please')
-rwxr-xr-x please 6
1 files changed, 6 insertions, 0 deletions
diff --git a/please b/please
index 81aad4d..a2658ab 100755
--- a/please
+++ b/please
@@ -64,12 +64,15 @@ def run_extract(args):
--grobid-uri {grobid_uri} \
-r hadoop \
-c mrjob.conf \
+ --output-dir {output} \
+ --no-output \
--archive venv-current.tar.gz#venv \
--jobconf mapred.line.input.format.linespermap=8000 \
--jobconf mapreduce.job.queuename=extraction \
--jobconf mapred.task.timeout=3600000 \
{input_cdx}
""".format(hbase_host=HBASE_HOST, env=args.env,
+ output=output,
input_cdx=args.input_cdx,
grobid_uri=GROBID_URI)
subprocess.call(cmd, shell=True)
@@ -89,6 +92,8 @@ def run_extract_ungrobided(args):
--grobid-uri {grobid_uri} \
-r hadoop \
-c mrjob.conf \
+ --output-dir {output} \
+ --no-output \
--archive venv-current.tar.gz#venv \
--jobconf mapred.line.input.format.linespermap=8000 \
--jobconf mapreduce.job.queuename=extraction \
@@ -96,6 +101,7 @@ def run_extract_ungrobided(args):
{input_ungrobided}
""".format(hbase_host=HBASE_HOST, env=args.env,
input_ungrobided=args.input_ungrobided,
+ output=output,
grobid_uri=GROBID_URI)
subprocess.call(cmd, shell=True)