<img src="https://img-bss.csdn.net/201708171721537407.gif" width="179" height="120" alt="让机器“看见”—计算机视觉原理及实战">
<p class="clearfix"> <i> ¥269.10 </i>
<p><em>82</em>课时(<em>已更新至82</em>)<em>13</em>小时<em>07</em>分 </p>
步骤二:
# Step 2: pull course names, lesson counts and prices out of the page HTML
# held in `data` (assigned by the download step).
#
# NOTE(review): the HTML tags inside the three patterns were missing from the
# original snippet (pat1 was empty and pat3 was an unterminated string). They
# are reconstructed here from the sample course markup at the top of this
# file; confirm them against a live page before relying on the results.

# Each match is a (src, alt) pair; the alt text is the course name. Taking
# the alt directly keeps one name per course even when two courses share the
# same thumbnail URL (going through dict() would collapse them).
pat1 = r'<img src="(.*?)" width="179" height="120" alt="(.*?)">'
name = [alt for _src, alt in re.compile(pat1).findall(str(data))]

# First <em> of the course info paragraph is the number of lessons.
pat2 = r'<p><em>(.*?)</em>'
class_num = re.compile(pat2).findall(str(data))

# \s{1,} absorbs the spaces/newlines around the price text, so the page does
# not need its line breaks stripped first.
pat3 = r'<p class="clearfix">\s{1,}<i>\s{1,}(.*?)\s{1,}</i>'
price = re.compile(pat3).findall(str(data))

# Replace every raw price text with the list of numeric substrings it
# contains (e.g. '¥269.10' -> ['269.10']).
price = [re.findall(r'-?\d+\.?\d*e?-?\d*?', text) for text in price]
# Step 3: walk the paginated course listing (pages 1 to 298) and download
# each page as UTF-8 text into `data`.
html = "https://edu.csdn.net/courses"
for n in range(1, 299):
    url = html + '/p' + str(n)
    print(url)
    # The context manager closes the HTTP response as soon as the body has
    # been read, instead of leaving the connection open.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')
# Store the scraped data in a local Excel (.xls) workbook.
def sava_data(name, class_num, price):
    """Write the scraped course data to a timestamped Excel workbook.

    One row is written per course, pairing the items of the three inputs
    position by position; if the inputs differ in length, the extra items of
    the longer ones are ignored. A final row records the collection time.

    Args:
        name: iterable of course names.
        class_num: iterable of lesson counts.
        price: iterable of course prices.

    The workbook is saved as ``E:/csdn学院课程汇总表<YYYYmmddHHMMSS>.xls``.
    """
    # Create the workbook and its single sheet.
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('sheet1', cell_overwrite_ok=True)

    # One bold Times New Roman style shared by every cell.
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = 'Times New Roman'
    font.bold = True
    style.font = font

    # Header row.
    for col, title in enumerate(("序号", "课程名", "课时", "价格")):
        sheet1.write(0, col, title, style)

    # `a` is the number of data rows written so far; it doubles as the
    # 1-based sequence number and the sheet row index of each course.
    a = 0
    for a, (course, lessons, cost) in enumerate(zip(name, class_num, price), start=1):
        sheet1.write(a, 0, a, style)        # sequence number
        sheet1.write(a, 1, course, style)   # course name
        sheet1.write(a, 2, lessons, style)  # lesson count
        sheet1.write(a, 3, cost, style)     # price

    # Footer: collection time, written once after all rows. A single
    # timestamp is used for both the cell and the file name so they agree.
    now = datetime.datetime.now()
    t = now.strftime("%Y-%m-%d %H:%M:%S")
    t1 = now.strftime("%Y%m%d%H%M%S")
    sheet1.write(a + 2, 1, "采集时间", style)
    sheet1.write(a + 2, 2, t, style)
    # The timestamp in the file name keeps each export from clashing with an
    # earlier file of the same name.
    workbook.save("E:/csdn学院课程汇总表" + str(t1) + ".xls")
    print("数据写入excel文件完毕!")
import urllib.request
import re,xlwt,datetime
class csdn_spider():
    """Crawl the CSDN course listing pages and export the courses to Excel.

    ``data()`` downloads every listing page, extracts course names, lesson
    counts and prices with ``parse_page()``, and hands the accumulated lists
    to ``sava_data()``, which writes one spreadsheet row per course.
    """

    # NOTE(review): the HTML tags inside these patterns were missing from the
    # original source (empty and unterminated string literals). They are
    # reconstructed from the sample course markup; confirm against a live page.
    _NAME_PATTERN = re.compile(
        r'<img src="(.*?)" width="179" height="120" alt="(.*?)">')
    _LESSONS_PATTERN = re.compile(r'<p><em>(.*?)</em>')
    # \s{1,} absorbs the spaces/newlines surrounding the price text.
    _PRICE_PATTERN = re.compile(
        r'<p class="clearfix">\s{1,}<i>\s{1,}(.*?)\s{1,}</i>')
    _NUMBER_PATTERN = re.compile(r'-?\d+\.?\d*e?-?\d*?')

    def __init__(self):
        # Number of lesson-count entries collected by the last data() run.
        self.c = 0

    def sava_data(self, name, class_num, price):
        """Write the course data to a timestamped Excel workbook.

        One row is written per course, pairing the three inputs position by
        position (every course is written, including the last one); if the
        inputs differ in length, the extra items of the longer ones are
        ignored. A final row records the collection time.

        Args:
            name: iterable of course names.
            class_num: iterable of lesson counts.
            price: iterable of course prices.

        The workbook is saved as ``E:/csdn学院课程汇总表<YYYYmmddHHMMSS>.xls``.
        """
        # Create the workbook and its single sheet.
        workbook = xlwt.Workbook()
        sheet1 = workbook.add_sheet('sheet1', cell_overwrite_ok=True)

        # One bold Times New Roman style shared by every cell.
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        style.font = font

        # Header row.
        for col, title in enumerate(("序号", "课程名", "课时", "价格")):
            sheet1.write(0, col, title, style)

        # `a` is the number of data rows written so far; it doubles as the
        # 1-based sequence number and the sheet row index of each course.
        a = 0
        rows = zip(name, class_num, price)
        for a, (course, lessons, cost) in enumerate(rows, start=1):
            sheet1.write(a, 0, a, style)        # sequence number
            sheet1.write(a, 1, course, style)   # course name
            sheet1.write(a, 2, lessons, style)  # lesson count
            # NOTE(review): data() supplies each price as a list of numeric
            # strings; confirm xlwt renders that the way you expect.
            sheet1.write(a, 3, cost, style)     # price

        # Footer: collection time, written once after all rows. A single
        # timestamp is used for both the cell and the file name so they agree.
        now = datetime.datetime.now()
        t = now.strftime("%Y-%m-%d %H:%M:%S")
        t1 = now.strftime("%Y%m%d%H%M%S")
        sheet1.write(a + 2, 1, "采集时间", style)
        sheet1.write(a + 2, 2, t, style)
        # The timestamp in the file name keeps each export from clashing with
        # an earlier file of the same name.
        workbook.save("E:/csdn学院课程汇总表" + str(t1) + ".xls")
        print("数据写入excel文件完毕!")

    @staticmethod
    def parse_page(html):
        """Extract the course fields from the HTML of one listing page.

        Args:
            html: page source as text.

        Returns:
            A ``(names, lessons, prices)`` tuple of lists: the ``alt`` text
            of each course thumbnail, the lesson-count strings, and, for
            each price text, the list of numeric substrings found in it
            (e.g. ``'¥269.10'`` -> ``['269.10']``).
        """
        text = str(html)
        # Each match is a (src, alt) pair. Taking the alt directly keeps one
        # name per course even when two courses share a thumbnail URL, which
        # keeps the names aligned with the lesson counts and prices.
        names = [alt for _src, alt in csdn_spider._NAME_PATTERN.findall(text)]
        lessons = csdn_spider._LESSONS_PATTERN.findall(text)
        prices = [
            csdn_spider._NUMBER_PATTERN.findall(raw)
            for raw in csdn_spider._PRICE_PATTERN.findall(text)
        ]
        return names, lessons, prices

    def data(self):
        """Download listing pages 1 to 298, parse them, and export the result.

        Sets ``self.c`` to the number of lesson-count entries collected and
        then calls ``sava_data()`` with the accumulated lists.
        """
        html = "https://edu.csdn.net/courses"
        name = []
        class_num = []
        price = []
        for page in range(1, 299):
            url = html + '/p' + str(page)
            print(url)
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(url) as response:
                page_html = response.read().decode('utf-8')

            names, lessons, prices = self.parse_page(page_html)
            name += names
            class_num += lessons
            price += prices
            # Progress output: what this page contributed.
            print(names, lessons, prices)

        self.c = len(class_num)
        print(self.c, list(name), class_num, price)
        self.sava_data(list(name), class_num, price)
if __name__ == '__main__':
    # Script entry point: build the spider, then crawl and export.
    saveinfo = csdn_spider()
    save_res = saveinfo.data()