# -*- coding: utf-8 -*-
"""Scrape paginated course-list pages into a tab-separated text file.

``get_stock_html`` downloads one page; ``get_message`` extracts the table
cells from that page with regular expressions and writes one line per table
row.  Run as a script, the module walks pages FIRST_PAGE..LAST_PAGE of
BASE_URL and appends every row to OUTPUT_PATH.
"""
import io
import re

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request offers the same names used below
    import urllib.request as urllib2

# Header value sent with every request (reproduced unchanged from the
# original script, including the missing closing parenthesis).
USER_AGENT = (
    "Mozilla/4.0 (compatible;MSIE 7.0;"
    "Windows NT 5.1; .NET CLR 2.0.50727;"
    ".NET CLR 3.0.4506.2152; .NET CLR 3.5.30729"
)

# Listing URL; only the trailing ``jumpPage`` value changes between requests.
BASE_URL = "http://dean.swjtu.edu.cn//servlet/CourseInfoMapAction?MapID=101&PageUrl=..%2Fother%2FCourseList.jsp&OrderType=choose_course_code&OrderValue=asc&SelectAction=QueryAll&KeyWord1=&TeachType=all&SelectTableType=ThisTerm&jumpPage=2"
FIRST_PAGE = 1
LAST_PAGE = 65

# NOTE: the content written here is tab-separated text, not a real .xlsx
# workbook; the path is kept unchanged so existing output keeps growing.
OUTPUT_PATH = 'data\\okdata1.xlsx'

# Used when the server announces no (or an unknown) charset, and for the
# output file.
# NOTE(review): the original script carried commented-out code that treated
# the page as utf8 -- confirm against the live site.
DEFAULT_ENCODING = 'utf-8'

# Fallback sink for one-argument ``get_message(data)`` calls, which is how the
# original script used the function (it wrote to a module-level ``okdata``).
okdata = None

_ROW_RE = re.compile(r'<tr bgcolor="#ffffff">(.*?)</tr>', re.S)
_CELL_RE = re.compile(r'align="center"(.*?)</td>', re.S)
_PLAIN_RE = re.compile(r'>(.*)', re.S)
# Same pattern without DOTALL: the capture stops at the first newline.  The
# original applied this variant to the first cell only; kept so the output
# does not change.
_PLAIN_LINE_RE = re.compile(r'>(.*)')
_UNDERLINED_RE = re.compile(r'<u>(.*?)</u>', re.S)
_TIME_PART_RE = re.compile(r'>(.*?)<br', re.S)
_PAGE_PARAM_RE = re.compile(r'jumpPage=\d+')

# A row is only indexed up to cell 14, so it needs at least 15 cells.
_MIN_CELLS = 15

# (cell index, pattern) for every written column, in output order.  ``None``
# marks the schedule cell, which is assembled from several ``<br``-terminated
# parts.  The trailing comments give the original variable names; the English
# glosses are a reading of those names, not verified against the page.
# NOTE(review): cell 13 (``xiaoqu``, presumably the campus) was extracted by
# the original script but never written.  It is left out here as well so the
# column layout of already-written files does not change.
_OUTPUT_COLUMNS = (
    (0, _PLAIN_LINE_RE),   # xuhao (sequence number?)
    (1, _PLAIN_RE),        # bianhao (number?)
    (2, _UNDERLINED_RE),   # daima (code?)
    (3, _PLAIN_RE),        # classname
    (4, _PLAIN_RE),        # xuefen (credits?)
    (5, _UNDERLINED_RE),   # kechengxingzhi (course type?)
    (6, _PLAIN_RE),        # jiaoxuebanhao (teaching class number?)
    (7, _UNDERLINED_RE),   # kaikeyuanxi (offering department?)
    (8, _UNDERLINED_RE),   # teacher
    (9, _PLAIN_RE),        # zhicheng (title?)
    (10, None),            # time
    (11, _PLAIN_RE),       # youxuan
    (12, _PLAIN_RE),       # zhuangtai (status?)
    (14, _UNDERLINED_RE),  # didian (location?)
)


def _response_charset(response):
    """Return the charset named in the response's Content-Type, or None."""
    headers = response.info()
    getter = getattr(headers, 'get_content_charset', None)
    if getter is not None:  # Python 3 header object
        return getter()
    return headers.getparam('charset')  # Python 2 header object


def _decode_body(raw, charset):
    """Decode *raw* with *charset*, falling back to DEFAULT_ENCODING."""
    try:
        return raw.decode(charset or DEFAULT_ENCODING, 'replace')
    except LookupError:  # the server announced a codec Python does not know
        return raw.decode(DEFAULT_ENCODING, 'replace')


def get_stock_html(url):
    """Download *url* and return the response body as text.

    The request goes through an opener with redirect handling and the
    USER_AGENT header.  The body is decoded with the charset announced by the
    server, or DEFAULT_ENCODING when there is none; undecodable bytes are
    replaced rather than raising.  Network errors from the opener propagate.
    """
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
    )
    opener.addheaders = [('User-agent', USER_AGENT)]
    response = opener.open(url)
    try:
        raw = response.read()
        charset = _response_charset(response)
    finally:
        # The original never closed the response.
        response.close()
    return _decode_body(raw, charset)


def _first_group(pattern, cell):
    """Return group 1 of the first *pattern* match in *cell*, or '' if none."""
    match = pattern.search(cell)
    return match.group(1) if match else u''


def _schedule(cell):
    """Join the stripped ``<br``-terminated parts of the schedule cell."""
    return u''.join(part.strip() for part in _TIME_PART_RE.findall(cell))


def _parse_row(row_html):
    """Return the output fields of one table row, or None for a short row."""
    cells = _CELL_RE.findall(row_html)
    if len(cells) < _MIN_CELLS:
        return None
    fields = []
    for index, pattern in _OUTPUT_COLUMNS:
        if pattern is None:
            fields.append(_schedule(cells[index]))
        else:
            fields.append(_first_group(pattern, cells[index]))
    return fields


def get_message(data, out=None):
    """Extract the table rows from page *data* and write them to *out*.

    Every ``<tr bgcolor="#ffffff">`` row with at least 15 ``align="center"``
    cells becomes one line of tab-separated fields terminated by a newline.
    Rows with fewer cells are skipped, and a cell that lacks the expected
    markup yields an empty field; the original raised IndexError /
    AttributeError in those cases and aborted the whole run.

    Args:
        data: page HTML, as text or as bytes (bytes are decoded with
            DEFAULT_ENCODING, replacing undecodable bytes).
        out: object with a ``write`` method accepting text.  When omitted,
            the module-level ``okdata`` is used.

    Returns:
        The number of rows written.

    Raises:
        ValueError: if no output is given and ``okdata`` is unset.
    """
    if out is None:
        out = okdata
    if out is None:
        raise ValueError('get_message needs an output file: pass out= or set okdata')
    if not data:
        return 0
    if isinstance(data, bytes):
        data = data.decode(DEFAULT_ENCODING, 'replace')

    written = 0
    for row_html in _ROW_RE.findall(data):
        fields = _parse_row(row_html)
        if fields is None:
            continue
        out.write(u'\t'.join(fields) + u'\n')
        written += 1
    return written


def page_url(page, base_url=BASE_URL):
    """Return *base_url* with its ``jumpPage`` value replaced by *page*."""
    # The original passed re.S as the 4th positional argument of re.sub, which
    # is ``count`` and not ``flags``; no flag is needed for this pattern.
    return _PAGE_PARAM_RE.sub('jumpPage=%d' % page, base_url)


def main():
    """Fetch every listing page and append its rows to OUTPUT_PATH."""
    with io.open(OUTPUT_PATH, 'a', encoding=DEFAULT_ENCODING) as out:
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            get_message(get_stock_html(page_url(page)), out)
            print('finsh')
            print(page)


if __name__ == '__main__':
    main()
其中,get_stock_html 是获取网页内容的函数;
get_message 是正则表达式部分,用于从网页中提取有用的数据。
遇到的问题:
注意:re.search 返回的是匹配对象,最后要调用 group() 才能取到匹配的文本。本段代码还有不完善之处,尤其是对 findall 和 search 的使用。