Python爬取双色球数据进行分析

突发奇想:能否从双色球的历史开奖数据中分析出出现次数最多的组合?那就先从"1+1"(1 个红球 + 1 个蓝球)的组合开始分析,嘿嘿,经常中这种组合也挺不错哦!现在开始获取数据,从中彩网爬取应该比较靠谱,下面是脚本。

# -*- coding:utf-8 -*-
import requests, bs4
import os, time
import operator
from itertools import combinations, permutations

class DoubleColorBall(object):
    """Crawl historical Double Color Ball draws and store them in a text file.

    One chart page is fetched per year from ``baseUrl``; every draw found on
    the page is written to ``dataFile`` as one line of the form
    ``<issue> <red> ... <red> <blue>`` (fields separated by single spaces).

    Attributes:
        balls: Draws parsed by the most recent ``parseBall`` call, mapping
            the issue label to a list of ball strings (reds first, blue last).
        baseUrl: Chart page URL; the year is appended as ``?kj_year=<year>``.
        dataFile: Path of the text file the draws are written to.
        session: ``requests.Session`` shared by all requests, created on the
            first ``getHtml`` call.
    """

    # Seconds to wait for the server before giving up on a request; without a
    # timeout a stalled connection would block the crawl forever.
    requestTimeout = 10

    def __init__(self, dataFile='d:\\balls_data.txt'):
        """Set up the crawler.

        Args:
            dataFile: Where the draws are stored. Defaults to the path the
                script has always used.
        """
        self.balls = {}
        self.baseUrl = 'http://tubiao.zhcw.com/tubiao/ssqNew/ssqJsp/ssqZongHeFengBuTuAsc.jsp'
        self.dataFile = dataFile
        self.session = None

    def getHtml(self, url):
        """Download ``url`` and return the response body as text.

        Raises:
            requests.RequestException: On connection problems, timeouts or an
                HTTP error status, so that an error page is never parsed as
                if it were draw data.
        """
        headers = {
            'Referer':'http://tubiao.zhcw.com/tubiao/ssqNew/ssqInc/ssqZongHeFengBuTuAsckj_year=2016.html',
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
        }
        # Reuse one session (and its connection pool) for the whole crawl
        # instead of building a new one for every page.
        if getattr(self, 'session', None) is None:
            self.session = requests.Session()
        response = self.session.get(url, headers=headers,
                                    timeout=self.requestTimeout)
        response.raise_for_status()
        return response.text

    def getBall(self, startYear=2003, endYear=2018):
        """Crawl every year in ``range(startYear, endYear)`` into ``dataFile``.

        Args:
            startYear: First year to fetch (inclusive).
            endYear: Year to stop at (exclusive).
        """
        # saveBall() appends, so start from an empty file; otherwise running
        # the crawl a second time would store every draw twice and distort
        # any counts computed from the file.
        with open(self.dataFile, 'w'):
            pass

        for year in range(startYear, endYear):
            url = self.baseUrl + '?kj_year=%s' % (year, )
            print(url)
            html = self.getHtml(url)
            self.bs = bs4.BeautifulSoup(html, 'html.parser')
            data = self.bs.find_all(class_='hgt')
            self.parseBall(data)

    def parseBall(self, data):
        """Extract the draws from chart rows and append them to ``dataFile``.

        Args:
            data: Iterable of row elements (the ``hgt``-class tags of one
                chart page). Items that are not tags, and rows missing the
                issue label, the red balls or the blue ball, are skipped.
        """
        self.balls = {}
        for row in data:
            if not isinstance(row, bs4.element.Tag):
                continue
            issueCell = row.find(class_="qh7")
            if issueCell is None:
                continue
            # get_text() also copes with cells that contain nested markup,
            # where ``.string`` would be None.
            center = issueCell.get_text(strip=True)
            if not center:
                continue
            print(center)
            # Rows labelled "模拟" ("simulated") mark the end of the real
            # draws -- presumably the page's rows for trying out picks.
            if center.startswith("模拟"):
                break
            redCells = row.find_all(class_="redqiu")
            blueCell = row.find(class_="blueqiu3")
            if not redCells or blueCell is None:
                continue
            reds = [cell.get_text(strip=True) for cell in redCells]
            self.balls[center] = reds + [blueCell.get_text(strip=True)]

        self.saveBall(self.balls)

    def saveBall(self, data):
        """Append ``data`` to ``dataFile``, one draw per line.

        Args:
            data: Mapping of issue label to a list of ball strings. Lines are
                written in descending order of the issue label.
        """
        with open(self.dataFile, 'a') as f:
            for issue in sorted(data, reverse=True):
                f.write(str(issue) + ' ' + ' '.join(data[issue]) + '\n')



class Analysis(object):
    """Count how often each red-ball/blue-ball pair occurs in the stored draws.

    Attributes:
        redrst: Unused by ``run``; kept for callers that may read it.
        bluerst: Unused by ``run``; kept for callers that may read it.
        redbluerst: Maps ``'<red>-<blue>'`` to the number of draws containing
            that red ball together with that blue ball.
        dataFile: Path of the text file to read the draws from.
    """

    def __init__(self, dataFile='d:\\balls_data.txt'):
        """Set up empty counters.

        Args:
            dataFile: File with one draw per line in the form
                ``<issue> <red> ... <red> <blue>``. Defaults to the path the
                script has always used.
        """
        self.redrst = {}
        self.bluerst = {}
        self.redbluerst = {}
        self.dataFile = dataFile

    def run(self):
        """Tally the 1-red + 1-blue pairs and print the three most frequent.

        Counts are added to ``redbluerst``. The printed list holds
        ``(pair, count)`` tuples in ascending order of count.
        """
        with open(self.dataFile, 'r') as f:
            for line in f:
                # split() without an argument also drops the trailing
                # newline, which would otherwise stay glued to the blue ball
                # and end up inside every key.
                fields = line.split()
                # A usable line has an issue label, at least one red ball
                # and the blue ball; skip blank or truncated lines.
                if len(fields) < 3:
                    continue
                redBalls = fields[1:-1]
                blueBall = fields[-1]
                for combo in combinations(redBalls, 1):
                    key = '-'.join(combo) + '-' + blueBall
                    self.redbluerst[key] = self.redbluerst.get(key, 0) + 1

        sorted_redblue = sorted(self.redbluerst.items(),
                                key=operator.itemgetter(1))
        print(sorted_redblue[-3:])

if __name__ == '__main__':
    # Step 1: crawl the draw history and write it to the data file.
    DoubleColorBall().getBall()
    # Step 2: read the data file back and print the most frequent pairs.
    Analysis().run()

输出结果:
[('20-9\n', 38), ('8-16\n', 39), ('1-12\n', 40)]
(注:键末尾的 \n 是因为读取数据文件时没有去掉行尾的换行符。)

总共爬取了2184期的数据,结果1+1组合中最多出现的也只有40次,概率为0.018。O(∩_∩)O哈哈~

你可能感兴趣的:(Python,数据分析)