This code is for study and exchange only; do not use it for illegal purposes.
import requests
from bs4 import BeautifulSoup
import time
import xlrd
import xlwt
from xlutils.copy import copy
'''
@Author:王磊
@Time :2018/11/29 15:30:25
'''
def getHTML(url):
    '''
    Fetch the page at url with an HTTP GET request and return the decoded body.
    :param url: page address
    :return: html/str, or None if the request fails
    '''
    try:
        res = requests.get(url, timeout=10)
        # Decode with the detected encoding; drop any undecodable bytes
        return res.content.decode(res.apparent_encoding, 'ignore')
    except Exception:
        return None
def getPages(html):
    '''
    Read the total page count from the first list page.
    :param html: page source
    :return: nums/int
    '''
    soup = BeautifulSoup(html, 'html.parser')
    # The total page count sits in the <strong> tag of the second <p> element
    nums = int(soup.findAll('p')[1].find('strong').get_text())
    return nums
def initExcel():
    '''
    Create the Excel workbook and write the header row.
    :return: filePath/str
    '''
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'double', cell_overwrite_ok=True)
    row0 = [u'Draw date', u'Issue number', u'Winning numbers', u'Sales (yuan)', u'First-prize winners',
            u'First-prize regions', u'Second-prize winners', u'Details page URL', u'Draw video URL']
    for i in range(len(row0)):
        sheet1.write(0, i, row0[i])
    # xlwt can only produce the legacy .xls format, so save with an .xls
    # extension (the original .xlsx name yields a mislabeled file)
    path = 'c:/Users/asus/Desktop/pc/text/双色球.xls'
    f.save(path)
    return path
def writeExcel(path, data):
    '''
    Append rows of data to the existing Excel file.
    :param path: workbook path
    :param data: list of rows, each a list of cell values
    :return: None
    '''
    workbook = xlrd.open_workbook(path)
    sheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheets[0])
    rows_old = worksheet.nrows  # number of rows already written
    # xlrd workbooks are read-only; copy into a writable xlwt workbook
    new_workbook = copy(workbook)
    new_worksheet = new_workbook.get_sheet(0)
    for i in range(len(data)):
        for j in range(len(data[i])):
            try:
                # Append below the existing rows, starting at row i + rows_old
                new_worksheet.write(i + rows_old, j, data[i][j])
            except Exception:
                continue
    new_workbook.save(path)
def parseData(html):
    '''
    Parse one list page and collect the target fields from each table row.
    :param html: page source
    :return: res/list
    '''
    soup = BeautifulSoup(html, 'html.parser')
    # Skip the two header rows and the trailing pagination row
    trs = soup.findAll('tr')[2:-1]
    res = []
    for tr in trs:
        tds = tr.findAll('td')
        # Column 4 holds the first-prize count and its regions, split by a newline
        pFir = tds[4].get_text().replace(" ", "").split("\n")
        res0 = [
            tds[0].get_text(),                         # draw date
            tds[1].get_text(),                         # issue number
            str(tds[2].get_text()).replace("\n", ""),  # winning numbers
            tds[3].get_text(),                         # sales
            pFir[0],                                   # first-prize winners
            pFir[1],                                   # first-prize regions
            tds[5].get_text(),                         # second-prize winners
            tds[6].findAll('a')[0]['href'],            # details page URL
            tds[6].findAll('a')[1]['href'],            # draw video URL
        ]
        res.append(res0)
    return res
def getUrl(n):
    '''
    Build the list-page URL for page index n.
    :param n: page index
    :return: url/str
    '''
    return 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(n) + '.html'
def main():
    '''
    Entry point.
    :return: None
    '''
    url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
    html = getHTML(url)
    # Get the total page count
    pageNo = getPages(html)
    # Initialize the Excel workbook
    path = initExcel()
    # Process each page in turn
    for i in range(1, pageNo + 1):
        print("Scraping page %d..." % i)
        url = getUrl(i)
        html = getHTML(url)
        # Parse the page data
        excelData = parseData(html)
        # Append the rows to the workbook
        writeExcel(path, excelData)
        print("Page %d scraped!" % i)
        # Pause between requests to go easy on the server
        time.sleep(3)
    print("All pages scraped!")


if __name__ == '__main__':
    main()
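The xlrd/xlwt/xlutils stack above only handles the legacy .xls format. If a genuine .xlsx file is wanted, the two Excel helpers can be swapped for openpyxl equivalents. Below is a minimal sketch, assuming openpyxl is installed (pip install openpyxl); initExcelXlsx and writeExcelXlsx are hypothetical drop-in replacements for initExcel and writeExcel, and the path mirrors the placeholder used above.

# Minimal openpyxl alternative (assumption: openpyxl is installed;
# the function names and path are illustrative, not from the original).
from openpyxl import Workbook, load_workbook

XLSX_PATH = 'c:/Users/asus/Desktop/pc/text/双色球.xlsx'

def initExcelXlsx():
    wb = Workbook()
    ws = wb.active
    ws.title = 'double'
    # Same header row as initExcel above
    ws.append(['Draw date', 'Issue number', 'Winning numbers', 'Sales (yuan)',
               'First-prize winners', 'First-prize regions', 'Second-prize winners',
               'Details page URL', 'Draw video URL'])
    wb.save(XLSX_PATH)
    return XLSX_PATH

def writeExcelXlsx(path, data):
    wb = load_workbook(path)
    ws = wb.active
    for row in data:
        ws.append(row)  # append() writes each row directly below the last used row
    wb.save(path)

openpyxl tracks the last used row itself, so the xlrd read-back and xlutils copy step disappear entirely.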
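One more hedged note: requests sends its default python-requests User-Agent, which some sites refuse. If the GET calls start coming back empty, a browser-style header may help. The sketch below is an assumption on my part (the original passes no headers), and the header value is purely illustrative.

# Hypothetical variant of getHTML with a browser-like User-Agent;
# the header value is illustrative, not taken from the original script.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def getHTMLWithHeaders(url):
    try:
        res = requests.get(url, headers=HEADERS, timeout=10)
        return res.content.decode(res.apparent_encoding, 'ignore')
    except Exception:
        return None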