From 7c81b7bea3d670876faff1eb290c40656697dddb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 29 Mar 2018 20:16:05 -0700 Subject: move to top level --- cdx-record-pipeline/README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 cdx-record-pipeline/README.md (limited to 'cdx-record-pipeline/README.md') diff --git a/cdx-record-pipeline/README.md b/cdx-record-pipeline/README.md new file mode 100644 index 0000000..797b8eb --- /dev/null +++ b/cdx-record-pipeline/README.md @@ -0,0 +1,33 @@ +CDX Record Pipeline (GROBID Edition) +===================================== + +Hadoop-based pipeline to process PDFs from a specified IA CDX dataset + +## Local mode example ## + +``` +cat -n /home/bnewbold/100k_random_gwb_pdf.cdx | ./cdx-record-pipeline.py + +``` + +## Cluster mode example ## + +``` +input=100k_random_gwb_pdf.cdx +output=100k_random_gwb_pdf.out +lines_per_map=1000 + +hadoop jar /home/webcrawl/hadoop-2/hadoop-mapreduce/hadoop-streaming.jar + -archives "hdfs://ia802400.us.archive.org:6000/lib/cdx-record-pipeline-venv.zip#cdx-record-pipeline-venv" + -D mapred.reduce.tasks=0 + -D mapred.job.name=Cdx-Record-Pipeline + -D mapreduce.job.queuename=extraction + -D mapred.line.input.format.linespermap=${lines_per_map} + -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat + -input ${input} + -output ${output} + -mapper cdx-record-pipeline.py + -file cdx-record-pipeline.py + +``` + -- cgit v1.2.3