python多线程爬虫抓取网页

突发奇想:抓取六合彩开奖数据,用来做机器学习分析的练手。网页地址为 http://www.11kj.com/kj/kjNNNN.htm,其中 NNNN 为年份。
步骤如下:
1:每个子线程负责抓取一个年份的网页
2:抓取网页后利用正则表达式抽取数据,存入多维list。
3:构建sql语句,存入mysql。

代码如下

#!/usr/bin/env python3
# -*- coding:utf-8 -*- 
from bs4 import BeautifulSoup
from urllib.request import urlopen
import threading
import re
import datetime
import pymysql

# Shared accumulator: each worker thread running loadYear appends its parsed
# rows here, and the main script sorts the combined list once all have joined.
table = []
def loadMarkSix():
    """Create (but do not start) one worker thread per draw year.

    Returns:
        A list of ``threading.Thread`` objects, one for each year from 2008
        through 2016, each targeting ``loadYear`` with that year as its
        single argument. Starting and joining is left to the caller.
    """
    return [
        threading.Thread(target=loadYear, args=(season,))
        for season in range(2008, 2017)
    ]

def loadYear(year):
    """Download one year's results page and append its draws to ``table``.

    The page is fetched from ``http://www.11kj.com/kj/kj<year>.htm``. Three
    parallel lists are extracted from it with regular expressions: the draw
    dates, the ball numbers (taken from ``<number>.gif`` image names, seven
    per draw) and the dot-separated five-field summary of each draw.

    Each appended row has 15 items, in this order: date as ``YYYY-MM-DD``
    style text, 1-based position of the draw on the page, the six regular
    numbers, their sum, the special (seventh) number, the first summary
    field, an odd flag (1/0), a big flag (1/0), a colour code (1 red,
    2 blue, 3 anything else) and the fifth summary field.

    Args:
        year: Four-digit year used to build the page URL.

    Raises:
        IndexError: If the page yields fewer dates or summaries than
            complete groups of seven numbers.
        urllib.error.URLError: If the page cannot be downloaded.
    """
    print('%d start' % year)
    url = "http://www.11kj.com/kj/kj" + str(year) + ".htm"
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        html = response.read()
    # Name the parser explicitly so the extracted text does not depend on
    # which optional parsers happen to be installed.
    bsobj = BeautifulSoup(html, 'html.parser')
    page = bsobj.getText()

    # Dates look like 2013/01/12.
    dateList = re.findall(r'[0-9]{4,4}/[0-9]+/[0-9]+', page)

    # Ball numbers are encoded in image file names such as "07.gif". Match on
    # the raw bytes: matching on str(bytes) would also see the hex digits of
    # escaped non-ASCII bytes (e.g. "\xb5.gif" would yield a bogus "5.gif").
    codes = [int(num) for num in re.findall(rb'([0-9]+)\.gif', html)]
    # Every draw contributes 6 regular numbers + 1 special number.
    total = len(codes) // 7

    # Five dot-separated runs of Chinese characters, e.g. 猪.单数.小数.绿波.家畜
    pattern = (r'[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+'
               r'\.[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+')
    summaryList = re.findall(pattern, page)

    for i in range(total):
        # Stride must be 7 (not 6) so consecutive draws do not overlap.
        draw = codes[7 * i:7 * i + 7]
        regular, special = draw[:6], draw[6]
        info = summaryList[i].split('.')  # e.g. ['猴', '双数', '小数', '绿波', '野兽']
        row = [dateList[i].replace('/', '-'), i + 1]  # e.g. '2010-01-02'
        row.extend(regular)
        row.append(sum(regular))
        row.append(special)
        row.append(info[0])
        row.append(1 if info[1] == '单数' else 0)
        row.append(1 if info[2] == '大数' else 0)
        row.append(1 if info[3] == '红波' else (2 if info[3] == '蓝波' else 3))
        row.append(info[4])
        table.append(row)
    print('%d complete\n' % year)

if __name__ == '__main__':
    # Run one downloader thread per year and wait for all of them to finish
    # before touching the shared ``table``.
    threads = loadMarkSix()
    for t in threads:
        # Attribute form replaces the deprecated Thread.setDaemon().
        t.daemon = False
        t.start()
    for t in threads:
        t.join()
    lottery = sorted(table)

    # Prefix every row with a 1-based running id, giving 16 values per record.
    # The values are passed as query parameters rather than formatted into the
    # statement, so scraped text containing quotes cannot break or alter the SQL.
    params = [(row_id, *row) for row_id, row in enumerate(lottery, start=1)]
    sql = 'insert into `lottery` VALUES (' + ','.join(['%s'] * 16) + ')'

    begin = datetime.datetime.now()
    # Keyword arguments are required by current PyMySQL releases; the charset
    # is set at connect time so Chinese text can be inserted.
    db = pymysql.connect(host="localhost", user="User", password="Password",
                         database="marksix", charset='utf8')
    try:
        with db.cursor() as cursor:
            # executemany sends the rows as a bulk INSERT; with no rows it is
            # a no-op instead of an invalid "VALUES" statement.
            cursor.executemany(sql, params)
        db.commit()
    finally:
        # Release the connection even when the insert or commit fails.
        db.close()
    end = datetime.datetime.now()
    print(end - begin)

总计插入1368条记录,耗时0.01504sec
如果先生成 .sql 文件,再在 MySQL 中执行插入,则耗时显示为 0.000sec

拼接 SQL 语句还有另一种写法:

    # Alternative way to assemble the bulk INSERT statement: build each value
    # tuple piece by piece with ``+=`` and time how long the assembly takes.
    begin = datetime.datetime.now()
    sql = 'insert into `lottery` VALUES '
    row_id = 0  # running primary key; avoids shadowing the builtin ``id``
    sqlvalue = ''
    for row in lottery:
        row_id += 1
        sqlvalue += '('
        sqlvalue += str(row_id)
        sqlvalue += u","
        # str(list) renders "[a, b, ...]"; dropping the brackets leaves the
        # comma-separated values. This must be the current row, not
        # lottery[0], otherwise every tuple repeats the first record.
        sqlvalue += str(row)[1:-1]
        sqlvalue += u"),"
    # Drop the trailing comma left after the last tuple.
    sql += sqlvalue[:-1]
    end = datetime.datetime.now()
    print(end - begin)

两种字符串拼接方法的耗时对比(依次为前一种的 % 格式化写法、后一种的逐段 += 写法):
0:00:00.004010
0:00:00.007018
后者在仅保留 sqlvalue+=str(lottery[0])[1:-1] 这一句时,耗时为
0:00:00.004507
如果改用 + 连接 SQL 语句,耗时为 0:00:00.037025

你可能感兴趣的:(python,sql)