2014년 8월 8일 금요일

Sample code for collecting data.


1. data collection using python

# python library for pulling data out of html or xml
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.html
-- pulling_data.py
import codecs
import urllib2
from bs4 import BeautifulSoup

# Download the Daum front page. The HTTP response is always closed,
# even if reading it fails.
response = urllib2.urlopen('http://www.daum.net')
try:
    html_doc = response.read()
finally:
    response.close()

soup = BeautifulSoup(html_doc)

# Write the output as UTF-8 so Hangul text is stored correctly.
# Every text string found under <body> is written back-to-back,
# with no separator added between strings.
with codecs.open('result_daum.txt', 'w', encoding='utf8') as out:
    for text in soup.body.strings:
        out.write(text)

# soup.strings yields every string in the whole document.
# soup.body.strings yields only the strings that are descendants of <body>.


2. Ingest data, count each word, and send the result to PostgreSQL

import codecs
import urllib2
from bs4 import BeautifulSoup

# Crawl the site's front page, follow every link on it (depth 2), and
# count how often each whitespace-separated word occurs in the linked pages.
site = 'http://www.auction.co.kr'

# Fetch the front page; release the HTTP response as soon as it is read.
response = urllib2.urlopen(site)
try:
    html_doc = response.read()
finally:
    response.close()

# Text strings collected from the <body> of every linked page.
result = []

soup = BeautifulSoup(html_doc)

# Visit each page linked from the front page.
for link in soup.find_all('a'):
    href = link.get('href')
    if not href:
        # <a> tag without an href: there is nothing to fetch.
        continue
    try:
        response = urllib2.urlopen(href)
        try:
            page_html = response.read()
        finally:
            # Close every response, not just the last one opened.
            response.close()
        page = BeautifulSoup(page_html)
        if page.body is None:
            continue
        result.extend(page.body.strings)
    except Exception:
        # Best-effort crawl: links urllib2 cannot open (relative URLs,
        # unsupported schemes), network errors and unparsable pages are
        # skipped. KeyboardInterrupt/SystemExit still propagate.
        continue

# Map each unique word to its number of occurrences.
wordcount = {}

for line in result:
    for word in line.split():
        wordcount[word] = wordcount.get(word, 0) + 1

#with codecs.open('get.txt','w',encoding='utf8') as f:
#        for word,cnt in wordcount.items():
#                f.write("%s     %d\n" % (word,cnt))

# throwing the result to postgresql (ant)
import psycopg2

# Insert one row per counted word into the "commerce" table of the
# "ant" database.
try:
    conn = psycopg2.connect("dbname='ant' user='ant' host='zoo' password='ant'")
except psycopg2.Error:
    print("I am unable to connect to the database")
    # Nothing below can work without a connection; re-raise the real
    # error instead of failing later with a NameError on `conn`.
    raise

try:
    cur = conn.cursor()

    # Values are passed as query parameters, never formatted into the
    # SQL text, so scraped words cannot inject SQL.
    for word, cnt in wordcount.items():
        cur.execute(
            "INSERT INTO commerce(tm,site,lev,word,cnt) VALUES (now(),%s,2,%s, %s)",
            (site, word, cnt),
        )

    conn.commit()
finally:
    # Always release the connection; closing without a commit discards
    # any rows inserted before a failure.
    conn.close()


3. Using the Twitter API with the Python tweepy library
import tweepy

# OAuth credentials: the application's consumer key pair and the
# account's access token pair (placeholder values).
consumer_key, consumer_secret = (
    '0EBFhXXXXXXXgcG9ouIGZ6l',
    'spmIzBLO24MqEXXXXXXXXXXXX35K4FyUlLoAw',
)
access_token, access_token_secret = (
    '151351809-REDL20AXXXXXXXXXXXXXXGhsDRCjd9Y0jtrDH',
    'lawyXXXXXXXXXXXXXXXXXXXlon0YmtwZTd',
)

# Build the OAuth handler and attach the access token to it.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Fetch the home timeline and print the text of each status on it.
public_tweets = api.home_timeline()
for status in public_tweets:
    print(status.text)

0 개의 댓글:

댓글 쓰기