Monday, April 28, 2014

How to use bzip2, which supports splittable compression, with the hadoop-streaming package.

Sample commands for testing

1. Compression

# Produce one compressed file per input split (map-only job: one .bz2 per map task).
hadoop jar hadoop-streaming-2.2.0.2.1.0.0-92.jar \
    -D mapreduce.output.fileoutputformat.compress=true \
    -D mapreduce.output.fileoutputformat.compress.type=RECORD \
    -D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \
    -D mapreduce.job.reduces=0 \
    -mapper /bin/cat \
    -input <input files> \
    -output <output directory>
# Optionally add: -outputformat org.apache.hadoop.mapred.SequenceFileOutputFormat
# (compress.type RECORD/BLOCK only takes effect for SequenceFile output.)
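To check the result, list the output directory and stream one part back out; the paths and part names below are illustrative (streaming's map-only jobs typically write part-00000.bz2, part-00001.bz2, ...):

hadoop fs -ls <output directory>                          # expect one .bz2 file per map task
hadoop fs -text <output directory>/part-00000.bz2 | head  # -text decompresses via the codec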

# Produce a single compressed output file (funnel everything through one reducer).
hadoop jar hadoop-streaming-2.2.0.2.1.0.0-92.jar \
    -D mapreduce.output.fileoutputformat.compress=true \
    -D mapreduce.output.fileoutputformat.compress.type=RECORD \
    -D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \
    -D mapreduce.job.reduces=1 \
    -mapper /bin/cat \
    -reducer /bin/cat \
    -input <input files> \
    -output <output directory>
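The reason to prefer bzip2 over gzip here is that even this single large .bz2 file stays splittable, so a follow-up job reading it can still fan out across several map tasks (one per split). A quick way to observe this, with illustrative paths:

hadoop jar hadoop-streaming-2.2.0.2.1.0.0-92.jar \
    -D mapreduce.job.reduces=0 \
    -mapper /bin/cat \
    -input <output directory>/part-00000.bz2 \
    -output <scratch directory>
# In the job counters, "Launched map tasks" should exceed 1 once the
# input file spans more than one block.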

# Produce compressed files written with a specific block size, using Pig.
-- Pig script; SET statements apply to the whole script, so they are grouped up front.
SET output.compression.enabled true;
SET output.compression.codec org.apache.hadoop.io.compress.BZip2Codec;
SET dfs.block.size 36700160;
A = LOAD '/lab/efdc/EFDC_2010020100.nc' USING PigStorage();
STORE A INTO '/lab/Dave/output/pig.nc' USING PigStorage();
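If Pig is not at hand, the same effect can be sketched directly with hadoop-streaming by passing the block size to the job itself; this assumes the non-deprecated dfs.blocksize spelling (the older dfs.block.size also still works in Hadoop 2.x):

hadoop jar hadoop-streaming-2.2.0.2.1.0.0-92.jar \
    -D mapreduce.output.fileoutputformat.compress=true \
    -D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \
    -D dfs.blocksize=36700160 \
    -D mapreduce.job.reduces=0 \
    -mapper /bin/cat \
    -input <input files> \
    -output <output directory>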

# Copy a file so that the copy is written with a specific block size
# (dfs.block.size is the deprecated spelling of dfs.blocksize).
hadoop fs -D dfs.block.size=36700160 -cp <source directory> <target directory>


2. Decompression & analysis

# No input-side -D flags are needed: TextInputFormat detects the .bz2
# extension through the codec factory and decompresses (and splits) the
# input automatically.
hadoop jar hadoop-streaming-2.2.0.2.1.0.0-92.jar \
        -files ./mapper.py,./reducer.py -mapper ./mapper.py -reducer ./reducer.py \
        -input <bzip2 compressed files> -output <output directory>
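mapper.py and reducer.py are not shown in this post; a minimal word-count-style pair, assuming streaming's default tab-separated key/value convention, could look like the sketch below (both scripts need chmod +x before submitting):

#!/usr/bin/env python
# mapper.py: emit "<word>\t1" for each whitespace-separated token on stdin.
# Streaming hands the decompressed lines of the .bz2 input to stdin.
import sys

for line in sys.stdin:
    for word in line.split():
        sys.stdout.write(word + "\t1\n")

#!/usr/bin/env python
# reducer.py: sum the counts per key; the framework delivers keys sorted,
# so one pass with a running total per key is enough.
import sys

current_key, current_sum = None, 0
for line in sys.stdin:
    key, _, value = line.rstrip("\n").partition("\t")
    if key != current_key:
        if current_key is not None:
            sys.stdout.write("%s\t%d\n" % (current_key, current_sum))
        current_key, current_sum = key, 0
    current_sum += int(value)
if current_key is not None:
    sys.stdout.write("%s\t%d\n" % (current_key, current_sum))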

3. Etc.
# The deprecated output-compression properties:
    -D mapred.output.compress=true \
    -D mapred.output.compression.type=BLOCK \
    -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
# were replaced by:
    -D mapreduce.output.fileoutputformat.compress=true \
    -D mapreduce.output.fileoutputformat.compress.type=BLOCK \
    -D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \

The full list of deprecated properties:
http://archive.cloudera.com/cdh4/cdh/4/hadoop-2.0.0-cdh4.1.2/hadoop-project-dist/hadoop-common/DeprecatedProperties.html

# Default block size configured for the cluster
hdfs getconf -confKey dfs.blocksize

# Actual block size of an existing file
hadoop fs -stat %o <hdfs path>
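For the 36700160-byte examples above, the two commands would be expected to report along these lines (134217728 is the usual Hadoop 2.x default of 128 MB; the cluster default and the output file name are illustrative):

hdfs getconf -confKey dfs.blocksize
# 134217728
hadoop fs -stat %o /lab/Dave/output/pig.nc/part-m-00000.bz2
# 36700160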