突发奇想:抓取六合彩开奖数据,作为机器学习分析的练手素材。数据网页为 http://www.11kj.com/kj/kjNNNN.htm,其中 NNNN 为年份。
步骤如下:
1:为每个年份启动一个子线程,抓取该年份的网页。
2:抓取网页后利用正则表达式抽取数据,存入多维list。
3:构建sql语句,存入mysql。
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import threading
import re
import datetime
import pymysql
# Shared result list: every loadYear() call appends one parsed row per draw to it.
table=[]
def loadMarkSix(firstYear=2008, lastYear=2016):
    """Create (but do not start) one worker thread per year.

    Each thread targets ``loadYear`` with a single year as its argument.

    Args:
        firstYear: First year to fetch, inclusive. Defaults to 2008.
        lastYear: Last year to fetch, inclusive. Defaults to 2016.

    Returns:
        A list of unstarted ``threading.Thread`` objects, ordered by year.
        The caller is responsible for starting and joining them.
    """
    return [
        threading.Thread(target=loadYear, args=(year,))
        for year in range(firstYear, lastYear + 1)
    ]
def loadYear(year):
    """Download one year's result page, parse it and append its draws to ``table``.

    Each appended row is a list laid out as::

        [date 'YYYY-MM-DD', draw index within the year (1-based),
         code1..code6, sum of the six regular codes, special code,
         first summary field, 1 if odd ('单数') else 0,
         1 if big ('大数') else 0,
         1 for '红波' / 2 for '蓝波' / 3 otherwise,
         fifth summary field]

    Args:
        year: Four-digit year used to build the page URL.

    Raises:
        IndexError: If the page yields fewer dates or summary strings than
            complete groups of seven codes.
    """
    print('%d start' % year)
    url = "http://www.11kj.com/kj/kj" + str(year) + ".htm"
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        html = response.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsobj = BeautifulSoup(html, 'html.parser')
    page = bsobj.getText()

    # Dates appear in the page text as e.g. 2013/01/12.
    dateList = re.findall(r'[0-9]{4,4}/[0-9]+/[0-9]+', page)
    # Codes are taken from image file names such as "07.gif" in the raw markup;
    # str() of the bytes object is sufficient because the pattern is ASCII only.
    codes = [int(digits) for digits in re.findall(r'([0-9]+)\.gif', str(html))]
    # Every draw has 6 regular codes + 1 special code.
    total = len(codes) // 7
    # Five dot-separated Chinese words, e.g. 猪.单数.小数.绿波.家畜
    summaryList = re.findall(
        r'[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+',
        page)

    for i in range(total):
        # Stride is 7 codes per draw; a stride of 6 would make consecutive
        # draws overlap and misalign every draw after the first.
        draw = codes[7 * i:7 * i + 7]
        regular, special = draw[:6], draw[6]

        row = [dateList[i].replace('/', '-'), i + 1]  # e.g. '2010-01-02'
        row.extend(regular)
        row.append(sum(regular))
        row.append(special)

        info = summaryList[i].split('.')  # e.g. ['猴', '双数', '小数', '绿波', '野兽']
        row.append(info[0])
        row.append(1 if info[1] == '单数' else 0)
        row.append(1 if info[2] == '大数' else 0)
        row.append(1 if info[3] == '红波' else (2 if info[3] == '蓝波' else 3))
        row.append(info[4])
        table.append(row)
    print('%d complete\n' % year)
if __name__ == '__main__':
    # Fetch all years concurrently and wait until every worker has finished,
    # so that ``table`` is complete before it is read.
    threads = loadMarkSix()
    for t in threads:
        t.daemon = False
        t.start()
    for t in threads:
        t.join()

    # Rows are lists, so sorting compares them element by element
    # (date string first, then draw index).
    lottery = sorted(table)

    # One placeholder for the generated id plus one per row field (15 fields).
    sql = 'insert into `lottery` VALUES (' + ','.join(['%s'] * 16) + ')'
    params = [[rowId] + row for rowId, row in enumerate(lottery, 1)]

    begin = datetime.datetime.now()
    if params:  # an INSERT with no value tuples would be invalid SQL
        # Keyword arguments: positional connect() arguments are rejected by
        # newer PyMySQL releases. The charset is required to insert Chinese text.
        db = pymysql.connect(host="localhost", user="User", password="Password",
                             database="marksix", charset='utf8')
        try:
            with db.cursor() as cursor:
                # Let the driver quote/escape the values instead of formatting
                # them into the statement by hand.
                cursor.executemany(sql, params)
            db.commit()
        finally:
            db.close()
    end = datetime.datetime.now()
    print(end-begin)
总计插入1368条记录,耗时0.01504sec
采用生成.sql文件,在mysql执行插入,耗时0.000sec
对于串成sql语句,还有另一个写法
# Alternative way of assembling the multi-row INSERT: append the pieces one by
# one instead of using a single %-format per row. Timed for comparison.
# NOTE(review): the values are quoted by Python's repr(), not SQL-escaped, so
# this is only suitable for trusted data; prefer parameterized queries.
begin = datetime.datetime.now()
sql='insert into `lottery` VALUES '
sqlvalue=''
for rowId, row in enumerate(lottery, 1):
    sqlvalue+='('
    sqlvalue+=str(rowId)
    sqlvalue+=u","
    # str(row)[1:-1] drops the list brackets, leaving "v1, v2, ..." with the
    # string fields already quoted. Use the current row, not lottery[0], which
    # would emit the first record for every id.
    sqlvalue+=str(row)[1:-1]
    sqlvalue+=u"),"
# Drop the trailing comma left by the last tuple.
sql+=sqlvalue[:-1]
end = datetime.datetime.now()
print(end-begin)
两种字符串连接方法的时间对比:
0:00:00.004010
0:00:00.007018
后者在仅保留sqlvalue+=str(lottery[0])[1:-1]
这句时,时间为
0:00:00.004507
如采用+连接sql语句,耗时0:00:00.037025