爬取双色球历史数据
爬取地址:http://baidu.lecai.com/lottery/draw/list/50
需要的包:BeautifulSoup(安装:pip install beautifulsoup4)
脚本如下,运行后爬取到的数据经 JSON 序列化后保存在 data 文件中
# -*- coding: utf-8 -*- import urllib import re import json from bs4 import BeautifulSoup li = [] for year in range(2003, 2015): print year htmlcon = urllib.urlopen('http://baidu.lecai.com/lottery/draw/list/50?d=%s-01-01' % year) html = htmlcon.read() htmlcon.close() soup = BeautifulSoup(html) table_html_set = soup.findAll(id='draw_list') num_tuple_list = [] for table_html in table_html_set: tr_html_set = table_html.findAll('tr') for tr_html in tr_html_set: span_html_set = tr_html.findAll('span', attrs={'class': re.compile('^ball_')}) num_tuple = tuple([int(x.text) for x in span_html_set]) if num_tuple: num_tuple_list.append(num_tuple) print "count: %s" % len(num_tuple_list) li.extend(num_tuple_list) fl = open('data', 'w') fl.write(json.dumps(li)) fl.close()
"""Convert the scraper's JSON ``data`` file into a sorted ``ticket.txt``.

Reads the JSON array of draws, sorts them lexicographically, and writes
one comma-separated draw per line.
"""
import json

# Optional debugging hook kept from the original script: prefer IPython's
# embed, fall back to pdb when IPython is not installed.
try:
    from IPython import embed as stop
except ImportError:
    from pdb import set_trace as stop


def main():
    """Read 'data', sort the draws, and write them to 'ticket.txt'."""
    with open('data') as fl:
        raw_draws = json.loads(fl.read())
    # Tuples compare lexicographically by default, so no comparator is
    # needed (the original cmp-based lambda is invalid on Python 3 anyway).
    draws = sorted(tuple(draw) for draw in raw_draws)
    with open('ticket.txt', 'w') as fl:
        for draw in draws:
            fl.write(",".join(str(num) for num in draw) + "\n")


if __name__ == '__main__':
    main()