Python Web Scraping Series: Scraping Shuangseqiu (Double-Color Ball) Lottery Results

Scraping Shuangseqiu (double-color ball) draw results in Python with the Beautiful Soup library

This code is for learning and exchange only; do not use it for any illegal purpose.



Scrape the draw data and store it in an Excel spreadsheet.

  • The full code is below; each step is explained in the comments
  • The code has plenty of room for improvement; feedback and questions are welcome
import requests
from bs4 import BeautifulSoup
import time
import xlrd
import xlwt
from xlutils.copy import copy
'''
    @Author:王磊
    @Time  :2018/11/29 15:30:25
'''


def getHTML(url):
    '''
    Send an HTTP GET request to the given URL and return the decoded response body.
    :param url:
    :return: html/str
    '''
    try:
        res = requests.get(url)
        return res.content.decode(res.apparent_encoding, 'ignore')
    except Exception as e:
        pass


def getPages(html):
    '''
    Get the total number of result pages.
    :param html:
    :return: nums/int
    '''
    soup = BeautifulSoup(html, 'html.parser')
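    # the total page count is read from the <strong> tag inside the second <p> on the page (assumed zhcw list-page layout)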
    nums = int(soup.findAll('p')[1].find('strong').get_text())
    return nums


def initExcel():
    '''
    Initialize the Excel file and write the header row.
    :return: filePath/str
    '''
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'double', cell_overwrite_ok=True)
    row0 = [u'开奖日期', u'期号', u'中奖号码', u'销售额(元)', u'一等奖人数', u'一等奖遍布地区', u'二等奖人数', u'中奖详情地址', u'中奖视频地址']
    for i in range(0, len(row0)):
        sheet1.write(0, i, row0[i])
    # xlwt can only write the legacy .xls format, so save with an .xls extension
    f.save('c:/Users/asus/Desktop/pc/text/双色球.xls')
    return 'c:/Users/asus/Desktop/pc/text/双色球.xls'


def writeExcel(path, data):
    '''
    Append rows of data to the Excel file.
    :param path:
    :param data:
    :return:
    '''
    workbook = xlrd.open_workbook(path)
    sheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheets[0])
    rows_old = worksheet.nrows
    new_workbook = copy(workbook)
    new_worksheet = new_workbook.get_sheet(0)
    rows = len(data)
    for i in range(rows):
        for j in range(0, len(data[i])):
            try:
                new_worksheet.write(i + rows_old, j, data[i][j])  # append below the existing rows, starting at row i + rows_old
            except Exception as e:
                continue
    new_workbook.save(path)


def parseData(html):
    '''
    Parse the page and extract the target fields for each draw.
    :param html: HTML of a result-list page
    :return: res/list
    '''
    soup = BeautifulSoup(html, 'html.parser')
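    # skip the first two header rows and the trailing pagination row of the results table (assumed layout)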
    trs = soup.findAll('tr')[2:-1]
    leng = len(trs)
    res = []
    for i in range(leng):
        res0 = []
        tds = trs[i].findAll('td')
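        # column mapping (matching the header row written in initExcel): 0 draw date, 1 issue number, 2 winning numbers,
        # 3 sales amount, 4 first-prize count and regions, 5 second-prize count, 6 links to the detail and video pages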
        pFir = tds[4].get_text().replace(" ", "").split("\n")
        res0.append(tds[0].get_text())
        res0.append(tds[1].get_text())
        res0.append(str(tds[2].get_text()).replace("\n", ""))
        res0.append(tds[3].get_text())
        res0.append(pFir[0])
        res0.append(pFir[1])
        res0.append(tds[5].get_text())
        res0.append(tds[6].findAll('a')[0]['href'])
        res0.append(tds[6].findAll('a')[1]['href'])
        res.append(res0)
    return res


def getUrl(n):
    '''
    Build the list-page URL from a page index.
    :param n:
    :return: url/str
    '''
    return 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(n) + '.html'


def main():
    '''
    Entry point.
    :return: None
    '''
    url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
    html = getHTML(url)
    # get the total number of pages
    pageNo = getPages(html)
    # initialize the Excel file
    path = initExcel()
    # process each page in turn
    for pageIndex in range(1, pageNo + 1):
        print("Scraping page %d..." % pageIndex)
        url = getUrl(pageIndex)
        html = getHTML(url)
        # parse the page data
        excelData = parseData(html)
        # append it to the Excel file
        writeExcel(path, excelData)
        print("Page %d done." % pageIndex)
        time.sleep(3)
    print("All pages scraped.")


if __name__ == '__main__':
    main()
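
Note that xlwt and xlrd only handle the legacy .xls format. If a genuine .xlsx workbook is preferred, the same header-plus-append logic can be written with openpyxl, which also removes the need for the xlutils copy step. The sketch below is only illustrative: the helper names and the file path are placeholders, not part of the original script.

from openpyxl import Workbook, load_workbook

XLSX_PATH = 'c:/Users/asus/Desktop/pc/text/双色球.xlsx'  # illustrative path


def initExcelXlsx():
    # create the workbook and write the header row once
    wb = Workbook()
    ws = wb.active
    ws.title = 'double'
    ws.append([u'开奖日期', u'期号', u'中奖号码', u'销售额(元)', u'一等奖人数',
               u'一等奖遍布地区', u'二等奖人数', u'中奖详情地址', u'中奖视频地址'])
    wb.save(XLSX_PATH)
    return XLSX_PATH


def writeExcelXlsx(path, data):
    # openpyxl appends rows in place, so no copy-and-rewrite of the workbook is needed
    wb = load_workbook(path)
    ws = wb.active
    for row in data:
        ws.append(row)
    wb.save(path)

These two functions could be swapped in for initExcel and writeExcel in main() without changing anything else.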


More small Python scraping examples will follow in this series; stay tuned.
