抓取 百度TOP500 并计算增量的 小程序(特留文)

抓取 百度TOP500 并计算增量的 小程序(特留文)



#关键字
   python,pycurl, SGMLParser ,sqlite3, 抓取 , 增量计算,encodeuri 转换,crontab ,mail
#参考
  sqlite3  http://linuxgazette.net/109/chirico1.html
  SGMLParser http://www.woodpecker.org.cn/diveintopython/html_processing/index.html




提取部分(baiduTop500.py):
import  pycurl
from  sgmllib  import  SGMLParser
import  re
from  urllib  import  quote, unquote 

# 使用 SGMLParser(HTML 分析)类继承
# 详细请查看
#   http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
class BaiduTop_GMLParser(SGMLParser):
    """Download the Baidu MP3 top-song page and collect its chart entries.

    Constructing an instance fetches the page with pycurl and feeds it through
    the SGML parser.  Every ``<a>`` tag whose ``href`` matches
    ``word=<first>+<second>`` contributes one ``(actors, music)`` tuple, where
    ``music`` comes from the first captured group and ``actors`` from the
    second.  The tuples are returned by :meth:`getData`.

    This class targets Python 2: it relies on ``sgmllib``, ``StringIO`` and
    byte-string ``decode``/``encode``.
    """

    # Page fetched when the caller does not supply another URL.
    TOP_URL = 'http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2'

    # Compiled once; the greedy first group runs up to the last '+' in the href.
    _WORD_RE = re.compile(r'word=(.*)\+(.*)')

    def __init__(self, url=None):
        """Fetch ``url`` (default :attr:`TOP_URL`) and parse it immediately.

        :param url: optional page address overriding :attr:`TOP_URL`.
        :raises pycurl.error: if the transfer fails.
        """
        # Initialises the base parser state and calls reset() for us.
        SGMLParser.__init__(self)

        import StringIO  # Python 2 module, imported locally as in the original

        body = StringIO.StringIO()
        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, url or self.TOP_URL)
            c.setopt(pycurl.WRITEFUNCTION, body.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            # To go through a proxy, set pycurl.PROXY / pycurl.PROXYUSERPWD here.
            c.perform()
        finally:
            # Release the handle even when the transfer raises.
            c.close()

        self.feed(body.getvalue())
        # Flush anything the parser is still buffering.
        self.close()

    def reset(self, verbose=0):
        """Reset the parser state and clear the collected entries.

        :param verbose: accepted for compatibility with callers; unused.
        """
        SGMLParser.reset(self)
        self.data = []
        self.a = None

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag and record it if it is a chart link.

        :param attrs: list of ``(name, value)`` attribute pairs from sgmllib.
        """
        hrefs = [v for k, v in attrs if k == 'href']
        if not hrefs:
            # Anchors without an href (e.g. named anchors) carry no entry.
            return
        match = self._WORD_RE.search(hrefs[0])
        if not match:
            return
        # The page is GBK-encoded and the values are URI-escaped; convert both
        # captured groups to UTF-8 byte strings.
        music = unquote(match.group(1)).decode('gbk').encode('utf8')
        actors = unquote(match.group(2)).decode('gbk').encode('utf8')
        self.data.append((actors, music))
        self.a = True

    def getData(self):
        """Return the list of ``(actors, music)`` tuples collected so far."""
        return self.data



使用部分(Action.py):
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
新添加入 top  
当天全量
没有 歌手名

退出 top

drop table baidu_Top ;
create table baidu_Top (
  id INTEGER PRIMARY KEY AUTOINCREMENT ,
  actor varchar(300) ,
  music varchar(300) ,
  createTime DATE
);

'''


import  sqlite3,os,sys
import  datetime, calendar  
import  pdb

from  baiduTop500  import  BaiduTop_GMLParser
class Action(object):
    """Store today's chart entries and work out what is new since yesterday.

    After :meth:`insertAll` has run, four lists describe the result:

    * ``allData`` -- entries inserted for today by this run.
    * ``allNotActorData`` -- the subset of ``allData`` with a blank actor.
    * ``newData`` -- entries that have no row dated yesterday.
    * ``newNotActorData`` -- the subset of ``newData`` with a blank actor.
    """

    # The date is bound as a parameter rather than taken from SQLite's date(),
    # which is UTC and can disagree with the local date used in the lookups.
    _INSERT_SQL = "insert into baidu_Top (actor,music,createTime) values (?,?,?) ;"
    _EXISTS_SQL = "select music from baidu_Top where actor=? and music=? and createTime=? ;"

    def __init__(self, conn, data):
        """Remember the connection and the entries to process.

        :param conn: open DB-API connection to a database with ``baidu_Top``.
        :param data: iterable of ``(actor, music)`` pairs.
        """
        self.conn = conn
        self.data = data
        self.allData = []
        self.newData = []
        self.newNotActorData = []
        self.allNotActorData = []

    def insertAll(self):
        """Insert today's entries and fill the four result lists.

        A failure on one entry is printed (exception type, then message) and
        the remaining entries are still processed.
        """
        # Take the date once so a run that crosses midnight stays consistent.
        today = datetime.date.today()
        today_key = today.isoformat()
        yesterday_key = (today - datetime.timedelta(days=1)).isoformat()

        cur = self.conn.cursor()
        try:
            for actor, music in self.data:
                try:
                    self._record(cur, actor, music, today_key, yesterday_key)
                except Exception as err:
                    # Best effort: report the bad entry and carry on.
                    print(type(err))
                    print(err)
        finally:
            cur.close()

    def _record(self, cur, actor, music, today_key, yesterday_key):
        """Insert one entry if missing for today and classify it."""
        entry = (actor, music)
        no_actor = not actor.strip()

        cur.execute(self._EXISTS_SQL, (actor, music, today_key))
        if not cur.fetchall():
            cur.execute(self._INSERT_SQL, (actor, music, today_key))
            # Commit on the connection this instance was given, not a global.
            self.conn.commit()
            self.allData.append(entry)
            if no_actor:
                self.allNotActorData.append(entry)

        cur.execute(self._EXISTS_SQL, (actor, music, yesterday_key))
        if not cur.fetchall():
            self.newData.append(entry)
            if no_actor:
                self.newNotActorData.append(entry)
            
    

def pfor(title, data):
    """Print one tab-separated ``title<TAB>actor<TAB>music`` line per entry.

    :param title: label repeated at the start of every line.
    :param data: iterable of ``(actor, music)`` pairs.
    """
    for a, m in data:
        # A single parenthesised argument prints the same on Python 2 and 3.
        print("%s\t%s\t%s" % (title, a, m))
def line():
    """Print a section separator: a blank line, two rules, a blank line."""
    rule = "___________________________________________________________________________________"
    # print("") emits an empty line on both Python 2 and 3; a bare ``print``
    # only does so on Python 2.
    print("")
    print(rule)
    print(rule)
    print("")

if __name__ == "__main__":
    # Script entry point: scrape the chart, store it, then print the report.
    # Connect before the try block so that ``finally`` never touches an
    # unbound ``conn`` when the connection itself fails.
    conn = sqlite3.connect("/home/xj_liukaiyi/src/python/baidu_top/ex500")
    try:
        ac = Action(conn, BaiduTop_GMLParser().getData())
        ac.insertAll()

        # Legend for the four sections printed below.
        print('''说明 %s : 
            new 对比前一天新添加
            new not actor 对比前一天新增加但没歌手名
            all 当天top 500 展现全部 
            all not actor 当天 top 500 展现全部全但没歌手 
''' % (datetime.date.today()))
        line()
        pfor("new", ac.newData)
        line()
        pfor("new not actor", ac.newNotActorData)
        line()
        pfor("all", ac.allData)
        line()
        pfor("all not actor", ac.allNotActorData)
    finally:
        conn.close()



再通过系统
crontab -e


将输出从 UTF-8 转码为 GBK 后通过邮件发送,每天早上 5 点执行:
0 5 * * * /usr/local/bin/python /home/xj_liukaiyi/src/python/baidu_top/Action.py|perl -MEncode -ne 'print encode("GBK", decode("UTF-8",$_));' > tmp ; mail -s "baidu Top 500"  [email protected]  < tmp;



整理 www.blogjava.net/Good-Game

你可能感兴趣的:(抓取 百度TOP500 并计算增量的 小程序(特留文))