From 86fd041f106ee8efc2c68ee8792389ebb05ae9ef Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 3 Jan 2019 13:59:43 -0800 Subject: remove old/redundant python CDX directory This was code from Vinay; it lives on in git history. --- cdx-record-pipeline/README.md | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 cdx-record-pipeline/README.md (limited to 'cdx-record-pipeline/README.md') diff --git a/cdx-record-pipeline/README.md b/cdx-record-pipeline/README.md deleted file mode 100644 index 797b8eb..0000000 --- a/cdx-record-pipeline/README.md +++ /dev/null @@ -1,33 +0,0 @@ -CDX Record Pipeline (GrobId Edition) -===================================== - -Hadoop based pipeline to process PDFs from a specified IA CDX dataset - -## Local mode example ## - -``` -cat -n /home/bnewbold/100k_random_gwb_pdf.cdx | ./cdx-record-pipeline.py - -``` - -## Cluster mode example ## - -``` -input=100k_random_gwb_pdf.cdx -output=100k_random_gwb_pdf.out -lines_per_map=1000 - -hadoop jar /home/webcrawl/hadoop-2/hadoop-mapreduce/hadoop-streaming.jar - -archives "hdfs://ia802400.us.archive.org:6000/lib/cdx-record-pipeline-venv.zip#cdx-record-pipeline-venv" - -D mapred.reduce.tasks=0 - -D mapred.job.name=Cdx-Record-Pipeline - -D mapreduce.job.queuename=extraction - -D mapred.line.input.format.linespermap=${lines_per_map} - -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat - -input ${input} - -output ${output} - -mapper cdx-record-pipeline.py - -file cdx-record-pipeline.py - -``` - -- cgit v1.2.3