爬取双色球历史数据

爬取双色球历史数据

爬取地址:http://baidu.lecai.com/lottery/draw/list/50

需要的包:BeautifulSoup(安装方式:pip install beautifulsoup4)

脚本如下 运行后生成的数据json处理后存在data文件中

# -*- coding: utf-8 -*-
"""Scrape historical double-color-ball (shuangseqiu) draw numbers.

Fetches one listing page per year from baidu.lecai.com, extracts every
draw as a tuple of ints, and writes the full list to the file `data`
as JSON. Updated from Python 2 (`print` statement, `urllib.urlopen`)
to Python 3.
"""
import json
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Result cells look like <span class="ball_01">7</span>; match any ball_* class.
BALL_CLASS = re.compile(r'^ball_')


def parse_draws(html):
    """Extract every draw on one listing page.

    Returns a list of tuples of ints, one tuple per table row that
    contains ball numbers (header/footer rows yield no balls and are
    skipped).
    """
    # Explicit parser keeps bs4 from emitting a "no parser specified" warning
    # and from picking a different parser per machine.
    soup = BeautifulSoup(html, 'html.parser')
    draws = []
    for table in soup.findAll(id='draw_list'):
        for row in table.findAll('tr'):
            balls = row.findAll('span', attrs={'class': BALL_CLASS})
            draw = tuple(int(span.text) for span in balls)
            if draw:  # rows without ball spans produce an empty tuple
                draws.append(draw)
    return draws


def main():
    """Fetch all years, accumulate the draws, dump them as JSON."""
    all_draws = []
    for year in range(2003, 2015):
        print(year)
        url = 'http://baidu.lecai.com/lottery/draw/list/50?d=%s-01-01' % year
        # Context manager guarantees the HTTP connection is closed even if
        # read() raises.
        with urlopen(url) as conn:
            html = conn.read()

        draws = parse_draws(html)
        print("count: %s" % len(draws))
        all_draws.extend(draws)

    with open('data', 'w') as fl:
        fl.write(json.dumps(all_draws))


if __name__ == "__main__":
    main()

取data数据的脚本 排序处理后存入ticket.txt 可以用stop断点分析数据用 省得每次都用脚本跑

# -*- coding: utf-8 -*-
"""Read the scraped draw list from `data`, sort it, write `ticket.txt`.

Each draw becomes one comma-separated line. Updated from Python 2:
`li.sort(lambda x, y: cmp(x, y))` was a redundant cmp-based sort —
tuples already compare lexicographically, so the plain in-place
`list.sort()` is identical (and `cmp` no longer exists in Python 3).
"""
import json

try:
    # Prefer IPython's embed as the interactive breakpoint when available.
    from IPython import embed as stop
except ImportError:
    from pdb import set_trace as stop


def load_draws(path='data'):
    """Load the JSON list written by the scraper, restoring tuples.

    JSON has no tuple type, so each inner list is converted back.
    """
    with open(path) as fl:
        return [tuple(x) for x in json.load(fl)]


def format_draw(draw):
    """Render one draw tuple as a comma-separated line (no newline)."""
    return ",".join(str(num) for num in draw)


def main():
    li = load_draws()
    # Default tuple ordering == the old cmp-based sort.
    li.sort()
    with open('ticket.txt', 'w') as out:
        out.writelines("%s\n" % format_draw(item) for item in li)


if __name__ == "__main__":
    main()



你可能感兴趣的:(爬虫,脚本)