抓取 百度TOP500 并计算增量的 小程序(特留文)
#关键字
python, pycurl, SGMLParser, sqlite3, 抓取, 增量计算, encodeURI 转换, crontab, mail
#参考
sqlite3 http://linuxgazette.net/109/chirico1.html
SGMLParser http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
提取 python
import re
from sgmllib import SGMLParser
from urllib import quote, unquote

import pycurl
# 使用 SGMLParser(html 分析) 类继承
# 详细请查看
# http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
class BaiduTop_GMLParser(SGMLParser):
    """Download the Baidu MP3 top-song page and collect (artist, song) pairs.

    Instantiating the class performs the HTTP request with pycurl and feeds
    the response body to the SGML parser; the pairs found in ``<a href>``
    links are then available through getData().

    See http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
    for background on subclassing SGMLParser.
    """

    # Page fetched when no explicit URL is passed to the constructor.
    TOP_URL = 'http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2'

    # Links of interest carry "word=<song>+<artist>" in their href.
    WORD_PATTERN = re.compile(r'word=(.*)\+(.*)')

    def reset(self, verbose=0):
        """Reset the parser and drop every pair collected so far.

        ``verbose`` is accepted for call compatibility and is not used.
        """
        SGMLParser.reset(self)
        self.data = []
        self.a = None

    def start_a(self, attrs):
        """Handle an ``<a>`` tag; record the pair encoded in its href, if any.

        ``attrs`` is the list of (name, value) tuples supplied by SGMLParser.
        """
        hrefs = [v for k, v in attrs if k == 'href']
        if not hrefs:
            # Anchor without href: nothing to inspect (indexing the empty
            # list here used to raise IndexError).
            return
        rsc = self.WORD_PATTERN.search(hrefs[0])
        if not rsc:
            return
        # The Baidu page is GBK-encoded and the Chinese text in the link is
        # URI-escaped; unescape it and convert to UTF-8.
        music = unquote(rsc.group(1)).decode('gbk').encode('utf8')
        actors = unquote(rsc.group(2)).decode('gbk').encode('utf8')
        self.data.append((actors, music))
        self.a = True

    def getData(self):
        """Return the list of (artist, song) tuples collected so far."""
        return self.data

    def __init__(self, url=None):
        """Fetch ``url`` (default: TOP_URL) and parse it immediately.

        Errors raised by pycurl during the transfer propagate to the caller.
        """
        # Let the base class initialise its own state; it calls reset().
        SGMLParser.__init__(self)
        import StringIO
        b = StringIO.StringIO()
        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, url or self.TOP_URL)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            # To go through a proxy, set pycurl.PROXY and
            # pycurl.PROXYUSERPWD here.
            c.perform()
        finally:
            # Release the curl handle even when the transfer fails.
            c.close()
        self.feed(b.getvalue())
        # Flush whatever the parser is still buffering.
        self.close()
from sgmllib import SGMLParser
import re
from urllib import quote, unquote
# 使用 SGMLParser(html 分析) 类继承
# 详细请查看
# http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
class BaiduTop_GMLParser(SGMLParser):
    """Download the Baidu MP3 top-song page and collect (artist, song) pairs.

    Instantiating the class performs the HTTP request with pycurl and feeds
    the response body to the SGML parser; the pairs found in ``<a href>``
    links are then available through getData().

    See http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
    for background on subclassing SGMLParser.
    """

    # Page fetched when no explicit URL is passed to the constructor.
    TOP_URL = 'http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2'

    # Links of interest carry "word=<song>+<artist>" in their href.
    WORD_PATTERN = re.compile(r'word=(.*)\+(.*)')

    def reset(self, verbose=0):
        """Reset the parser and drop every pair collected so far.

        ``verbose`` is accepted for call compatibility and is not used.
        """
        SGMLParser.reset(self)
        self.data = []
        self.a = None

    def start_a(self, attrs):
        """Handle an ``<a>`` tag; record the pair encoded in its href, if any.

        ``attrs`` is the list of (name, value) tuples supplied by SGMLParser.
        """
        hrefs = [v for k, v in attrs if k == 'href']
        if not hrefs:
            # Anchor without href: nothing to inspect (indexing the empty
            # list here used to raise IndexError).
            return
        rsc = self.WORD_PATTERN.search(hrefs[0])
        if not rsc:
            return
        # The Baidu page is GBK-encoded and the Chinese text in the link is
        # URI-escaped; unescape it and convert to UTF-8.
        music = unquote(rsc.group(1)).decode('gbk').encode('utf8')
        actors = unquote(rsc.group(2)).decode('gbk').encode('utf8')
        self.data.append((actors, music))
        self.a = True

    def getData(self):
        """Return the list of (artist, song) tuples collected so far."""
        return self.data

    def __init__(self, url=None):
        """Fetch ``url`` (default: TOP_URL) and parse it immediately.

        Errors raised by pycurl during the transfer propagate to the caller.
        """
        # Let the base class initialise its own state; it calls reset().
        SGMLParser.__init__(self)
        import StringIO
        b = StringIO.StringIO()
        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, url or self.TOP_URL)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            # To go through a proxy, set pycurl.PROXY and
            # pycurl.PROXYUSERPWD here.
            c.perform()
        finally:
            # Release the curl handle even when the transfer fails.
            c.close()
        self.feed(b.getvalue())
        # Flush whatever the parser is still buffering.
        self.close()
使用 py
#!python
# -*- coding: UTF8 -*-
'''
新添加入 top
当天全量
没有 歌手名
退出 top
drop table baidu_Top ;
create table baidu_Top (
id integer PRIMARY KEY AUTOINCREMENT ,
actor varchar(300) ,
music varchar(300) ,
createTime DATE
);
'''
import sqlite3,os,sys
import datetime, calendar
import pdb
from baiduTop500 import BaiduTop_GMLParser
class Action(object):
    """Store today's chart in ``baidu_Top`` and compute the daily increment.

    After insertAll() has run:
      allData          -- pairs inserted for today by this run
      allNotActorData  -- the subset of allData with a blank artist
      newData          -- pairs that have no row dated yesterday
      newNotActorData  -- the subset of newData with a blank artist
    """

    def __init__(self, conn, data):
        """Keep the sqlite3 connection and the (artist, song) pairs to store."""
        self.conn = conn
        self.data = data
        self.allData = []
        self.newData = []
        self.newNotActorData = []
        self.allNotActorData = []

    def insertAll(self):
        """Insert today's pairs and fill the four result lists.

        A pair is inserted only when no row with today's date exists for it.
        A failure on one pair is printed and the loop moves on to the next.
        """
        # The date is bound as a parameter rather than taken from SQLite's
        # date(), which is UTC: the lookups below use the local date, and the
        # two disagree whenever the local and UTC calendar days differ.
        insertSql = "insert into baidu_Top (actor,music,createTime) values (?,?,?) ;"
        isSql = "select music from baidu_Top where actor=? and music=? and createTime=strftime('%Y-%m-%d',?) ;"
        today = datetime.date.today()
        todayText = today.isoformat()
        yesterdayText = (today - datetime.timedelta(days=1)).isoformat()
        cur = self.conn.cursor()
        try:
            for actor, music in self.data:
                pair = (actor, music)
                noActor = actor.strip() == ""
                try:
                    # NOTE(review): the listing lost its indentation; the
                    # nesting below (first half guarded by the "already
                    # stored today" check, second half run for every pair)
                    # is a reconstruction -- confirm against the original.
                    cur.execute(isSql, (actor, music, todayText))
                    if not cur.fetchall():
                        cur.execute(insertSql, (actor, music, todayText))
                        self.allData.append(pair)
                        if noActor:
                            self.allNotActorData.append(pair)
                        # Commit on the connection this object was given,
                        # not on a module-level global.
                        self.conn.commit()
                    cur.execute(isSql, (actor, music, yesterdayText))
                    if not cur.fetchall():
                        self.newData.append(pair)
                        if noActor:
                            self.newNotActorData.append(pair)
                except Exception:
                    # Best effort: report the failure and keep going.
                    excType, excValue = sys.exc_info()[:2]
                    print(excType)
                    print(excValue)
        finally:
            cur.close()
def pfor(title, data):
    """Print one ``title<TAB>artist<TAB>song`` line per pair in ``data``.

    ``data`` is an iterable of 2-tuples; nothing is printed when it is empty.
    """
    for a, m in data:
        # No padding around the fields, so the output stays tab-separated.
        print("%s\t%s\t%s" % (title, a, m))
def line():
    """Print a separator: a blank line, two underscore rules, a blank line."""
    # print("") emits an empty line under both the print statement and the
    # print function.
    print("")
    print(" ___________________________________________________________________________________ ")
    print(" ___________________________________________________________________________________ ")
    print("")
# Script entry point: scrape the chart, store it, then print the report that
# the cron job mails out.
if __name__ == "__main__":
    # Connect before entering the try block so that a failed connect cannot
    # reach conn.close() with conn still unbound.
    conn = sqlite3.connect("/home/xj_liukaiyi/src/python/baidu_top/ex500")
    try:
        ac = Action(conn, BaiduTop_GMLParser().getData())
        ac.insertAll()
        print(''' 说明 %s :
new 对比前一天新添加
new not actor 对比前一天新增加但没歌手名
all 当天top 500 展现全部
all not actor 当天 top 500 展现全部全但没歌手 ''' % (datetime.date.today()))
        line()
        pfor("new", ac.newData)
        line()
        pfor("new not actor", ac.newNotActorData)
        line()
        pfor("all", ac.allData)
        line()
        pfor("all not actor", ac.allNotActorData)
    finally:
        conn.close()
# -*- coding: UTF8 -*-
'''
新添加入 top
当天全量
没有 歌手名
退出 top
drop table baidu_Top ;
create table baidu_Top (
id integer PRIMARY KEY AUTOINCREMENT ,
actor varchar(300) ,
music varchar(300) ,
createTime DATE
);
'''
import sqlite3,os,sys
import datetime, calendar
import pdb
from baiduTop500 import BaiduTop_GMLParser
class Action(object):
    """Store today's chart in ``baidu_Top`` and compute the daily increment.

    After insertAll() has run:
      allData          -- pairs inserted for today by this run
      allNotActorData  -- the subset of allData with a blank artist
      newData          -- pairs that have no row dated yesterday
      newNotActorData  -- the subset of newData with a blank artist
    """

    def __init__(self, conn, data):
        """Keep the sqlite3 connection and the (artist, song) pairs to store."""
        self.conn = conn
        self.data = data
        self.allData = []
        self.newData = []
        self.newNotActorData = []
        self.allNotActorData = []

    def insertAll(self):
        """Insert today's pairs and fill the four result lists.

        A pair is inserted only when no row with today's date exists for it.
        A failure on one pair is printed and the loop moves on to the next.
        """
        # The date is bound as a parameter rather than taken from SQLite's
        # date(), which is UTC: the lookups below use the local date, and the
        # two disagree whenever the local and UTC calendar days differ.
        insertSql = "insert into baidu_Top (actor,music,createTime) values (?,?,?) ;"
        isSql = "select music from baidu_Top where actor=? and music=? and createTime=strftime('%Y-%m-%d',?) ;"
        today = datetime.date.today()
        todayText = today.isoformat()
        yesterdayText = (today - datetime.timedelta(days=1)).isoformat()
        cur = self.conn.cursor()
        try:
            for actor, music in self.data:
                pair = (actor, music)
                noActor = actor.strip() == ""
                try:
                    # NOTE(review): the listing lost its indentation; the
                    # nesting below (first half guarded by the "already
                    # stored today" check, second half run for every pair)
                    # is a reconstruction -- confirm against the original.
                    cur.execute(isSql, (actor, music, todayText))
                    if not cur.fetchall():
                        cur.execute(insertSql, (actor, music, todayText))
                        self.allData.append(pair)
                        if noActor:
                            self.allNotActorData.append(pair)
                        # Commit on the connection this object was given,
                        # not on a module-level global.
                        self.conn.commit()
                    cur.execute(isSql, (actor, music, yesterdayText))
                    if not cur.fetchall():
                        self.newData.append(pair)
                        if noActor:
                            self.newNotActorData.append(pair)
                except Exception:
                    # Best effort: report the failure and keep going.
                    excType, excValue = sys.exc_info()[:2]
                    print(excType)
                    print(excValue)
        finally:
            cur.close()
def pfor(title, data):
    """Print one ``title<TAB>artist<TAB>song`` line per pair in ``data``.

    ``data`` is an iterable of 2-tuples; nothing is printed when it is empty.
    """
    for a, m in data:
        # No padding around the fields, so the output stays tab-separated.
        print("%s\t%s\t%s" % (title, a, m))
def line():
    """Print a separator made of two underscore rules."""
    print(" ___________________________________________________________________________________ ")
    print(" ___________________________________________________________________________________ ")
# Script entry point: scrape the chart, store it, then print the report that
# the cron job mails out.
if __name__ == "__main__":
    # Connect before entering the try block so that a failed connect cannot
    # reach conn.close() with conn still unbound.
    conn = sqlite3.connect("/home/xj_liukaiyi/src/python/baidu_top/ex500")
    try:
        ac = Action(conn, BaiduTop_GMLParser().getData())
        ac.insertAll()
        print(''' 说明 %s :
new 对比前一天新添加
new not actor 对比前一天新增加但没歌手名
all 当天top 500 展现全部
all not actor 当天 top 500 展现全部全但没歌手 ''' % (datetime.date.today()))
        line()
        pfor("new", ac.newData)
        line()
        pfor("new not actor", ac.newNotActorData)
        line()
        pfor("all", ac.allData)
        line()
        pfor("all not actor", ac.allNotActorData)
    finally:
        conn.close()
再通过系统
crontab -e
输出先转码为 GBK,再通过邮件发送;每天早上 5 点执行。
0 5 * * * /usr/local/bin/python /home/xj_liukaiyi/src/python/baidu_top/Action.py|perl -MEncode -ne 'print encode("GBK", decode("UTF-8",$_));' > tmp ; mail -s "baidu Top 500" your-address@example.com < tmp;
整理 www.blogjava.net/Good-Game