爬取双色球数据做频次分析

上代码:

import requests
from lxml import etree
import pandas as pd


def get_url(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, timeout, or connection failure).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }
    try:
        # requests never times out by default, so one stalled connection
        # would otherwise block the whole crawl indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network failures are reported the same way as a bad status code.
        return None
    if response.status_code != 200:
        return None
    # Force UTF-8 so the Chinese text in the page is not garbled.
    response.encoding = 'utf-8'
    return response.text


def group_draws(dates, numbers):
    """Pair each draw date with its seven ball numbers.

    Args:
        dates: Draw dates in page order. Extra trailing entries (more dates
            than complete draws) are ignored; if there are fewer dates than
            draws, the surplus draws are dropped.
        numbers: Flat sequence of ball numbers, seven consecutive items per
            draw. A trailing incomplete group is ignored.

    Returns:
        A list of dicts with keys ``'date'`` and ``'1'`` .. ``'7'``. Every
        ball number is wrapped in single quotes so that it is stored as text
        and keeps its leading zero.
    """
    rows = []
    draw_count = len(numbers) // 7
    for n, date in zip(range(draw_count), dates):
        row = {'date': str(date)}
        balls = numbers[n * 7:(n + 1) * 7]
        for position, ball in enumerate(balls, start=1):
            row[str(position)] = "'" + str(ball) + "'"
        rows.append(row)
    return rows


def frequency_table(data):
    """Count how often each number occurs in each of the seven positions.

    Args:
        data: DataFrame with columns ``'1'`` .. ``'7'`` holding ball numbers.

    Returns:
        A DataFrame with the column pairs ``数字i`` / ``频次i`` for i = 1..7.
        Each pair lists the values of position ``i`` ordered by descending
        frequency; shorter pairs are padded with missing values.
    """
    parts = []
    for i in range(1, 8):
        counts = data[str(i)].value_counts()
        # Object dtype keeps the counts as integers after NaN padding.
        parts.append(pd.DataFrame({
            '数字' + str(i): pd.Series(counts.index, dtype=object),
            '频次' + str(i): pd.Series(counts.values, dtype=object),
        }))
    return pd.concat(parts, axis=1)


if __name__ == '__main__':
    total_pages = 153  # number of result pages on the site
    # Raw strings: '\m' and '\d' are not valid escape sequences.
    data_path = r'E:\mmp\data.xlsx'
    result_path = r'E:\mmp\data_result.xlsx'

    rows = []
    for q in range(1, total_pages + 1):  # pages are numbered from 1, inclusive
        url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_%s.html' % (q)
        html = get_url(url)
        if not html:
            # Skip pages that could not be downloaded instead of crashing.
            continue
        xpath_html = etree.HTML(html)
        # First cell of every table row: the draw date.
        dates = xpath_html.xpath('//table[@class="wqhgt"]//tr//td[1]//text()')
        # Every <em> inside the table holds one ball number.
        result = xpath_html.xpath('//table[@class="wqhgt"]//tr//em//text()')
        rows.extend(group_draws(dates, result))

    # Build the frame once; appending row by row copies it every time.
    res = pd.DataFrame(
        rows, columns=['date', '1', '2', '3', '4', '5', '6', '7'])
    res.to_excel(data_path, index=False)

    # Frequency analysis of the saved draws, written once after all
    # seven positions have been counted.
    data = pd.read_excel(data_path)
    frequency_table(data).to_excel(result_path, index=False)

结果如下:

数字1 频次1 数字2 频次2 数字3 频次3 数字4 频次4 数字5 频次5 数字6 频次6 数字7 频次7
'01' 454 '07' 214 '14' 172 '22' 169 '27' 211 '33' 381 '01' 164
'02' 371 '06' 213 '15' 165 '19' 168 '26' 210 '32' 364 '15' 156
'03' 275 '08' 188 '13' 161 '17' 168 '25' 187 '31' 292 '16' 156
'04' 238 '09' 179 '12' 155 '20' 164 '28' 174 '30' 238 '12' 155
'05' 219 '10' 170 '11' 155 '23' 158 '24' 150 '29' 227 '09' 154
'06' 184 '05' 161 '16' 144 '18' 147 '29' 150 '28' 167 '07' 151
'07' 127 '04' 144 '10' 142 '16' 134 '23' 145 '27' 151 '06' 151
'08' 117 '12' 140 '17' 141 '24' 130 '30' 145 '26' 130 '02' 150
'09' 105 '11' 134 '18' 132 '21' 130 '22' 136 '25' 92 '04' 147
'10' 67 '14' 133 '09' 122 '14' 123 '21' 121 '24' 78 '14' 144
'12' 50 '03' 119 '19' 121 '15' 119 '20' 118 '22' 55 '03' 142
'11' 47 '13' 115 '08' 107 '26' 115 '19' 108 '23' 53 '08' 142
'13' 32 '15' 86 '07' 92 '25' 102 '31' 103 '21' 33 '05' 140
'14' 19 '02' 71 '20' 92 '13' 87 '18' 87 '18' 23 '10' 137
'15' 16 '16' 66 '21' 80 '12' 68 '32' 74 '19' 21 '11' 136
'16' 14 '17' 60 '22' 78 '27' 66 '17' 63 '20' 21 '13' 131
'17' 7 '18' 54 '06' 62 '11' 60 '15' 42 '17' 13    
'21' 5 '19' 27 '24' 49 '28' 52 '16' 38 '16' 9    
'19' 3 '20' 25 '23' 46 '10' 52 '14' 32 '15' 5    
'20' 2 '21' 24 '05' 40 '29' 40 '13' 24 '11' 1    
'22' 2 '22' 15 '25' 26 '09' 30 '12' 13 '10' 1    
'18' 1 '23' 9 '04' 25 '08' 21 '11' 11 '14' 1    
'24' 1 '26' 3 '26' 19 '30' 16 '10' 7        
    '24' 3 '27' 16 '31' 15 '09' 4        
    '25' 2 '03' 8 '06' 12 '08' 2        
    '28' 1 '29' 3 '07' 8 '07' 1        
        '28' 2 '04' 1            
        '30' 1 '05' 1            

你可能感兴趣的:(AI,python)