1. Example: scraping Hong Kong Jockey Club racing data
http://racing.hkjc.com/racing/Info/meeting/Results/chinese/Local/20170625/ST/2
The goal is to scrape all of the data in the results table on pages like the one above.
Several data classes are defined:
(1) The course-information class ChangInfo, which holds the header section above the table: the class/grade (dijiban), the distance in metres (mi), the going (changdizhuangkuang), the race type (bisaileixing), the course (saidao), and the prize money (qian).
class ChangInfo(object):
    def __init__(self, dijiban="", mi="", changdizhuangkuang="",
                 bisaileixing="", saidao="", qian=""):
        self.dijiban = dijiban                        # class/grade
        self.mi = mi                                  # distance in metres
        self.changdizhuangkuang = changdizhuangkuang  # going
        self.bisaileixing = bisaileixing              # race type
        self.saidao = saidao                          # course
        self.qian = qian                              # prize money

    def __str__(self):
        return ("ChangInfo:" + self.dijiban + "," + self.mi + ","
                + self.changdizhuangkuang + "," + self.bisaileixing + ","
                + self.saidao + "," + self.qian)
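A quick check of the class (the field values here are made up for illustration, not taken from the actual page):

info = ChangInfo("第三班", "1200", "好地", "讓賽", "草地", "1,100,000")
print(info)
# ChangInfo:第三班,1200,好地,讓賽,草地,1,100,000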
(3) The access-path class Path, which holds the result page URL for each race, the race date, the meeting flag, the race number, and an isFinished flag marking whether the path has already been crawled.
class Path(object):
    def __init__(self, url, date, flag, changci, isFinished):
        self.__url = url
        self.__date = date
        self.__flag = flag
        self.__changci = changci
        self.__isFinished = isFinished

    @property
    def url(self):
        return self.__url

    @url.setter
    def url(self, url):
        self.__url = url

    @property
    def date(self):
        return self.__date

    @date.setter
    def date(self, date):
        self.__date = date

    @property
    def flag(self):
        return self.__flag

    @flag.setter
    def flag(self, flag):
        self.__flag = flag

    @property
    def changci(self):
        return self.__changci

    @changci.setter
    def changci(self, changci):
        self.__changci = changci

    @property
    def isFinished(self):
        return self.__isFinished

    @isFinished.setter
    def isFinished(self, isFinished):
        self.__isFinished = isFinished

    def __str__(self):
        return ("Path:" + "url=" + self.__url + ",date=" + self.__date
                + ",flag=" + self.__flag + ",changci=" + str(self.__changci)
                + ",isFinished=" + str(self.__isFinished))
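Constructing a Path and going through its properties looks like this (the values mirror the sample URL above):

path = Path("http://racing.hkjc.com/racing/Info/meeting/Results/chinese/Local/20170625/ST/2",
            "20170625", "ST", 2, 0)
print(path.url)      # read via the property
path.isFinished = 1  # update via the setter
print(path)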
(4) The database access object (DAO) classes PathDao and MainfoDao for the access paths and the race data, which encapsulate all of the database operations.
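The DAO classes themselves are not listed in this post. From the way they are used in the main program below, PathDao.getAllPath() returns a list of dict-like records with url, date, flag, changci and isFinished keys, and MainfoDao.save() inserts one record. Here is a minimal sketch of that interface, assuming a local MySQL database reached through pymysql; the connection settings, table and column names are placeholders, not the original schema:

import pymysql

def _connect():
    # Placeholder connection settings; adjust to your own MySQL setup.
    return pymysql.connect(host="localhost", user="root", password="root",
                           db="racing", charset="utf8")

class PathDao(object):
    def getAllPath(self):
        # Return every stored access path as a dict, matching the keys
        # the main program reads below.
        conn = _connect()
        cursor = conn.cursor(pymysql.cursors.DictCursor)
        cursor.execute("SELECT url, date, flag, changci, isFinished FROM path")
        rows = cursor.fetchall()
        cursor.close()
        conn.close()
        return rows

class MainfoDao(object):
    def save(self, data):
        # Assumes each record is a dict of column -> value; the real
        # column list depends on the MaInfo fields, which are not shown here.
        conn = _connect()
        cols = ", ".join(data.keys())
        marks = ", ".join(["%s"] * len(data))
        cursor = conn.cursor()
        cursor.execute("INSERT INTO mainfo (%s) VALUES (%s)" % (cols, marks),
                       list(data.values()))
        conn.commit()
        cursor.close()
        conn.close()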
2. Overall framework
Python's built-in urllib is used to fetch each page and obtain its HTML source, and the third-party library BeautifulSoup is used to parse the HTML and extract the data.
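The downloader and parser modules are not listed either; the sketch below shows roughly what they could look like under the urllib + BeautifulSoup approach just described. The class and method names match how they are used in the main program, but the table-extraction selectors are placeholders rather than the real HKJC parsing logic:

from urllib import request
from bs4 import BeautifulSoup

class HtmlDownloader(object):
    def download(self, url):
        # Fetch the page and return its HTML source, or None on failure.
        if url is None:
            return None
        response = request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()

class HtmlParser(object):
    def parse(self, path, html_content):
        # Return (new paths to crawl, rows of extracted data).
        soup = BeautifulSoup(html_content, "html.parser")
        new_paths = []
        new_data = []
        for row in soup.find_all("tr"):  # placeholder selector, not the real one
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            if cells:
                new_data.append(cells)
        return new_paths, new_data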
import chenjie.url_manager      # custom URL manager
import chenjie.html_parser      # custom HTML parser
import chenjie.html_outputer    # custom outputer
import chenjie.html_downloader  # custom downloader
import chenjie.path             # the access-path class Path
import chenjie.mainfodao        # DAO for the race data
import chenjie.pathdao          # DAO for the access paths

class SpiderMain(object):
    def __init__(self):
        self.paths = chenjie.url_manager.UrlManager()               # list of access paths
        self.downloader = chenjie.html_downloader.HtmlDownloader()  # custom downloader
        self.parser = chenjie.html_parser.HtmlParser()              # custom parser
        self.outputer = chenjie.html_outputer.HtmlOutputer()        # custom outputer

    def craw(self):
        '''
        new_path = chenjie.path.Path("http://racing.hkjc.com/racing/Info/meeting/Results/chinese/Local/20170625/ST/2",
                                     "20170625",
                                     "ST",
                                     3,
                                     0)
        '''
        # Load all access paths from the database
        paths = chenjie.pathdao.PathDao().getAllPath()
        for path in paths:
            print(path)
            # Wrap each record in a Path object and add it to the path list
            new_path = chenjie.path.Path(path['url'], path['date'], path['flag'],
                                         path['changci'], path['isFinished'])
            self.paths.add_new_path(new_path)
        # As long as the path list still has unvisited paths
        while self.paths.has_new_path():
            try:
                # Take one access path
                new_path = self.paths.get_new_path()
                print('craw : %s' % (new_path))
                # Download the page with the custom downloader
                html_content = self.downloader.download(new_path.url)
                # Parse the page into new access paths and new data
                new_paths, new_data = self.parser.parse(new_path, html_content)
                print("main got new_data", new_data)
                # Add the new access paths to the path list
                self.paths.add_new_paths(new_paths)
                #self.outputer.collect_data(new_data)
                for data in new_data:
                    # Save each new record to the MySQL database through the DAO
                    dao = chenjie.mainfodao.MainfoDao()
                    dao.save(data)
            except Exception as e:
                print("crawl failed:", repr(e))
        #self.outputer.output_html()

if __name__ == "__main__":
    #root_url = "http://baike.baidu.com/item/Python"
    obj_spider = SpiderMain()
    while True:
        obj_spider.craw()
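Finally, the UrlManager is not listed in this post either. The main loop above only needs four methods from it: add_new_path, add_new_paths, has_new_path and get_new_path. A minimal sketch consistent with that usage (the URL-based deduplication is an assumption, not confirmed by the post):

class UrlManager(object):
    def __init__(self):
        self.new_paths = []  # paths waiting to be crawled
        self.old_paths = []  # paths already handed out

    def add_new_path(self, path):
        if path is None:
            return
        # Deduplicate by URL (an assumption about the original behaviour)
        seen = [p.url for p in self.new_paths + self.old_paths]
        if path.url not in seen:
            self.new_paths.append(path)

    def add_new_paths(self, paths):
        if paths is None:
            return
        for path in paths:
            self.add_new_path(path)

    def has_new_path(self):
        return len(self.new_paths) != 0

    def get_new_path(self):
        path = self.new_paths.pop(0)
        self.old_paths.append(path)
        return path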