Sample code for testing
1. compression
# Produce one compressed output file per map task (map-only job, no reducers).
hadoop jar hadoop-streaming-2.2.0.2.1.0.0-92.jar \
-D mapreduce.output.fileoutputformat.compress=TRUE \
-D mapreduce.output.fileoutputformat.compress.type=RECORD \
-D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \
-D mapreduce.job.reduces=0 \
-mapper /bin/cat \
-input <input files> \
-output <output directory> \
# -outputformat org.apache.hadoop.mapred.SequenceFileOutputFormat
# Produce a single compressed output file by funneling all data through one reducer.
hadoop jar hadoop-streaming-2.2.0.2.1.0.0-92.jar \
-D mapreduce.output.fileoutputformat.compress=TRUE \
-D mapreduce.output.fileoutputformat.compress.type=RECORD \
-D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \
-D mapreduce.job.reduces=1 \
-mapper /bin/cat \
-reducer /bin/cat \
-input <input files> \
-output <output directory> \
# Produce compressed files per block with a specific HDFS block size (Pig example below).
- Pig
set output.compression.enabled true;
set output.compression.codec org.apache.hadoop.io.compress.BZip2Codec;
A = LOAD '/lab/efdc/EFDC_2010020100.nc' USING PigStorage();
SET dfs.block.size 36700160;
STORE A INTO '/lab/Dave/output/pig.nc' USING PigStorage();
# Copy a file to a new file with a specific block size.
hadoop fs -D dfs.block.size=36700160 -cp <source directory> <target directory>
2. decompression & analysis
hadoop jar hadoop-streaming-2.2.0.2.1.0.0-92.jar \
-D mapred.input.compress=true \
-D mapred.input.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
-files ./mapper.py,./reducer.py -mapper ./mapper.py -reducer ./reducer.py \
-input <bzip2 compressed files> -output <output directory>
3. ETC
# The properties below were deprecated in favor of the ones that follow.
-D mapred.input.compress=TRUE \
-D mapred.input.compress.type=BLOCK \
-D mapred.input.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \
# Current (replacement) properties:
-D mapreduce.output.fileoutputformat.compress=TRUE \
-D mapreduce.output.fileoutputformat.compress.type=BLOCK \
-D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \
Full list of deprecated properties:
http://archive.cloudera.com/cdh4/cdh/4/hadoop-2.0.0-cdh4.1.2/hadoop-project-dist/hadoop-common/DeprecatedProperties.html
# Default block size (cluster-wide configuration value).
hdfs getconf -confKey dfs.blocksize
# Actual block size of an existing file.
hadoop fs -stat %o <hdfs directory>
0 comments:
Write a comment