Tuesday, July 15, 2014

Useful MongoDB scripts

# Get the average of a field over documents matching a condition.
db.POINT_TOTAL_OBS_STATION_DATA.group(
   { key: {}
   , cond: { obs_item_id : "OBSCD00074" }
   , initial: { count: 0, total: 0 }
   , reduce: function(doc, out) { out.count++ ; out.total += doc.v1 }
   , finalize: function(out) { out.avg = out.total / out.count }
} )

# The aggregation pipeline performs noticeably better: the $match stage can use the index on obs_item_id (created below), while group() runs JavaScript over every matching document.
db.POINT_TOTAL_OBS_STATION_DATA.aggregate( [ 
     { $match: { obs_item_id : "OBSCD00074" } }, 
     { $group: { _id : 0 , v1_avg : { $avg: "$v1"} } } ] )
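
# to verify the index is actually used: in a pre-3.0 shell, explain() should report a BtreeCursor on obs_item_id_1
db.POINT_TOTAL_OBS_STATION_DATA.find( { obs_item_id : "OBSCD00074" } ).explain()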


# group by each value of obs_item_id
db.POINT_TOTAL_OBS_STATION_DATA.aggregate( [ 
     { $group: { _id : { key : "$obs_item_id" },  v1_avg : { $avg: "$v1"} } } ] )


# Emulate a join for a special case: write the distinct keys to a collection, then look up each one in the code table.
db.POINT_TOTAL_OBS_STATION_DATA.aggregate( [{ $group: { _id : "$obs_item_id"} } , { $out : "fox_out" } ] )
fox = db.fox_out.find().toArray()
for ( var i = 0 ; i < fox.length ; i++ ) {
    db.fox_result.insert( db.OBS_ITEM_CODE.find( { obs_item_id : fox[i]._id }, { obs_item_id : 1, item_name_kor : 1 } ).toArray() )
}
db.fox_result.find().sort( { item_name_kor : 1 } )


# Average over a time range (all of 2011).
db.POINT_TOTAL_OBS_STATION_DATA.aggregate( [
     { $match: { tm : { $gte : '2011-01-01 00:00:00', $lt : '2012-01-01 00:00:00' } }},
     { $group: { _id : 0 , v1_avg : { $avg: "$v1"} } } ] )


# Create indexes
db.POINT_TOTAL_OBS_STATION_DATA.ensureIndex( { obs_item_id : 1 } )
db.POINT_TOTAL_OBS_STATION_DATA.ensureIndex( { obs_time : 1 } )
# ... more as needed
# list existing indexes
db.system.indexes.find()

# How to check elapsed time with the profiler
db.setProfilingLevel(0)   // disable profiling
db.setProfilingLevel(1)   // profile slow operations only (level 2 profiles everything)
db.system.profile.find().limit(10).sort( { ts : -1 } ).pretty()
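
# level 1 uses the slowms threshold; a second argument sets it explicitly, e.g. log anything slower than 100 ms
db.setProfilingLevel(1, 100)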


Monday, July 7, 2014

R sample code

# R environment
# to change the Java heap size, set this in R_HOME/etc/Rprofile.site:
options(java.parameters = c("-Xmx16g","-Dfile.encoding=UTF-8"))
# to read a UTF-8 encoded file:
f <- file("d:/parser.txt", blocking=F, encoding="UTF-8")
txtLines <- readLines(f)
close(f)


# 1. How to collect stock info

install.packages("fImport")
library(fImport)
s_e <- yahooSeries("005935.KS")
plot(s_e)


# 2. Gathering Twitter posts

install.packages("twitteR");install.packages("tm");
install.packages("wordcloud")
library(twitteR)
library(ROAuth)
library(RCurl)
library(tm)
library(wordcloud)
library(RColorBrewer)
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
consumerKey <- "EOpIQtUgZmLGX04G1BYGIPDSE"
consumerSecret <- "N1OtR9YfulAL9OMjtN51XrAQ7DlnrjarfZo0Y6YJn11MOrfnXf"
twitCred <- OAuthFactory$new(consumerKey=consumerKey,consumerSecret = consumerSecret, requestURL = reqURL, accessURL = accessURL, authURL = authURL)
download.file(url = "http://curl.haxx.se/ca/cacert.pem", destfile = "cacert.pem")
twitCred$handshake(cainfo = "cacert.pem")
save(list='twitCred',file="twitteR_credentials")

registerTwitterOAuth(twitCred)
hilton.tweets <- searchTwitter('@ford',n=90,cainfo='cacert.pem')
hilton.tweets

install.packages("plyr")
library(plyr)
hilton.tweets[1]
hilton.txt <- laply(hilton.tweets,function(t) t$getText())
hilton<-as.data.frame(hilton.txt)
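
# the tm and wordcloud packages loaded above can turn the collected text into a quick cloud;
# a minimal sketch using the hilton.txt column created above
corp <- Corpus(VectorSource(hilton$hilton.txt))
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, removeWords, stopwords("english"))
wordcloud(corp, max.words=50, random.order=FALSE, colors=brewer.pal(8, "Dark2"))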


# 3. network analysis

install.packages('igraph')
library(igraph)
ga.data <- read.csv('http://www.babelgraph.org/data/ga_edgelist.csv', header=TRUE)
str(ga.data)
head(ga.data)
g1 <- graph.data.frame(ga.data, directed=FALSE)
summary(g1)
str(g1)
V(g1)
E(g1)

set.seed(2020)
plot(g1)
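
# plotting is only a start; a couple of standard measures sketch the "analysis" part on the same g1
degree(g1)                                      # connections per node
sort(betweenness(g1), decreasing=TRUE)[1:5]     # five most "bridging" nodes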


# 4. word cloud

library(NLP)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
# corpus from a source directory
lords <- Corpus(DirSource("/Users/Mark/Desktop/AnalysisR/tmp01"))
inspect(lords)
lords <- tm_map(lords, stripWhitespace)
lords <- tm_map(lords, PlainTextDocument)
lords <- tm_map(lords, content_transformer(tolower))   # lowercase so the stopword list matches
lords <- tm_map(lords, removeWords, stopwords("english"))
lords <- tm_map(lords, stemDocument)
wordcloud(lords, scale=c(5,0.5), max.words=100, random.order=FALSE,
          rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
# to control the cloud, prune unwanted words and redraw
lords <- tm_map(lords, removeWords, c("textbook","book"))


# 5. Load an image and extract its RGB channels

# install.packages("jpeg")
library(jpeg)

# read jpg image to matrix
slide1 <- readJPEG("brazil.jpg")
# matrix scale (dimension)
dim(slide1)

library(raster)
library(grid)

# extract each channel ([,,1] = red, [,,2] = green, [,,3] = blue)
rst.red <- raster(slide1[,,1])
rst.green <- raster(slide1[,,2])
rst.blue <- raster(slide1[,,3])
# convert back to plain matrices
fox.red <- as.matrix(rst.red)
fox.green <- as.matrix(rst.green)
fox.blue <- as.matrix(rst.blue)
# draw the full image, then one channel as a grayscale raster
grid.raster(slide1)
grid.raster(fox.blue)


# 6. Linear and integer programming
# - find an optimal point under linear constraints; an integer variant is sketched after the LP solve below.

install.packages("lpSolve",dependencies=T)
library(lpSolve)

# objective coefficients for elements (A, B, C)
f.obj <- c(1,2,1)
f.con <- matrix (c(1, 1, 1,
                   1, 0, 0,
                   1, 1, 0,
                   0, 1, 0,
                   0, 0, 1), nrow=5, byrow=T)

f.dir <- c( "<=", ">=", "<=", ">=",">=")
f.rhs <- c(90,30,75,10,20)

lp ("max", f.obj, f.con, f.dir, f.rhs)
lp ("max", f.obj, f.con, f.dir, f.rhs)$solution


# 7. Get a Google map

library(ggmap)
# central park
mapImageData1 <- get_map(location = c(lon = -73.960987, lat = 40.783598),
                         color = c("color"),source = "google",
                         maptype = c("satellite"),zoom = 14)

ggmap(mapImageData1,
      extent = "device",
      ylab = "Latitude",
      xlab = "Longitude")


# 8. Using MongoDB from R

library(rmongodb)   # low-level API, more control
...
library(RMongo)     # simpler API
mongo <- mongoDbConnect("zoo", "192.168.50.81", 27017)
output <- dbInsertDocument(mongo, "fox_r", '{"foo": "fox","type":"animal"}')
output <- dbGetQuery(mongo, "fox_r",'{"foo": "bar"}')
print (output)
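
# a minimal rmongodb equivalent of the RMongo query above (same host and zoo.fox_r collection assumed)
library(rmongodb)
mg <- mongo.create(host="192.168.50.81")
if (mongo.is.connected(mg)) {
    print(mongo.find.all(mg, "zoo.fox_r", list(foo="bar")))
    mongo.destroy(mg)
}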

# 9. connect to PostgreSQL

library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname="elephant",host="192.168.50.82",port=5432,user="fox",password="pw")
koala <- dbGetQuery(con,"select * from koala")
...
# declare the text column as UTF-8 so Hangul displays correctly
Encoding(koala$contents) <- "UTF-8"


# 10. Extract nouns from Hangul text

library(KoNLP)
library(RColorBrewer)
library(wordcloud)
f <- file("d:/parser.txt", blocking=F)

txtLines <- readLines(f)
Encoding(txtLines) <- "UTF-8"

nouns <- sapply(txtLines, extractNoun, USE.NAMES=F)
close(f)
wordcount <- table(unlist(nouns))
pal <- brewer.pal(12,"Set3")
pal <- pal[-c(1:2)]
wordcloud(names(wordcount),freq=wordcount,scale=c(6,0.3),min.freq=40,
          random.order=T,rot.per=.1,colors=pal)