2014년 8월 8일 금요일

Sample code for ingesting and processing data in the Hadoop framework (Flume → HCatalog → Pig → Hive).

1. flume

-- fox.conf
# Name the components on this agent
# Data flow: fox (source) -> zoo (channel) -> koala (sink)
agent.sinks = koala
agent.sources = fox
agent.channels = zoo

# Describe/configure the source:
# watch a spooling directory and ingest each file dropped into it
agent.sources.fox.type = spooldir
agent.sources.fox.spoolDir = /home/flume/dump

# Describe the sink: write events to HDFS as plain text
agent.sinks.koala.type = hdfs
agent.sinks.koala.hdfs.path = /flume/events
agent.sinks.koala.hdfs.fileType = DataStream
agent.sinks.koala.hdfs.writeFormat = Text
# rollSize = 0 disables size-based file rolling; roll every 10000 events instead
agent.sinks.koala.hdfs.rollSize = 0
agent.sinks.koala.hdfs.rollCount = 10000

# Use a file-backed channel (note: type is "file", not a memory channel)
agent.channels.zoo.type = file

# Bind the source and sink to the channel
agent.sources.fox.channels = zoo
agent.sinks.koala.channel = zoo

-- start the Flume agent with the configuration above (fox.conf)
# Launch the agent named "agent" (must match the "agent." prefix in fox.conf)
shell$ flume-ng agent --conf conf --conf-file fox.conf --name agent


2. hcatalog

# Create the HCatalog table that will receive the Pig word-count output below
hcat -e "create table koala (cnt bigint, wd string)"


3. pig

-- Word count over the Flume-ingested files, stored into the 'koala' HCatalog table.
-- Load every file under /flume/events; $0 is the first field of each record
-- (assumes each line is treated as one field under the default loader — TODO confirm).
a = load '/flume/events/*';
-- Split each record into words; FLATTEN emits one output record per word.
b = foreach a generate flatten(TOKENIZE((chararray)$0)) as word;
-- Group identical words together.
c = group b by word;
-- Count occurrences per word; (cnt, wd) matches the koala schema (cnt bigint, wd string).
d = foreach c generate COUNT(b) as cnt, group as wd;
-- Write the result into the HCatalog-managed table 'koala'.
store d into 'koala' using org.apache.hcatalog.pig.HCatStorer();


4. hive

-- Show the 10 most frequent words from the word-count table
select wd, cnt from koala order by cnt desc limit 10;

0 개의 댓글:

댓글 쓰기