和http(超文本传输协议)
爬虫步骤:确认需求》寻找需求》发送请求》解析数据》存储数据
python爬虫初步
编码规范
# -*- coding: utf-8 -*-  或  # coding: utf-8
python可以加入main函数来测试程序:if __name__ == "__main__": 当文件作为脚本直接运行时调用main函数(文件中要定义main函数)
文件中如果不定义此函数,编译器按顺序执行
注释#导入第三方模块: bs4 #网页解析; re #正则表达式; urllib.request, urllib.error #制订URL,获取网页数据; xlwt #进行excel操作; sqlite3 #进行SQLite数据库操作
# Base URL of the Douban Top-250 list; pagination is done via "?start=N" (N = page*25).
baseurl = "https://movie.douban.com/top250?start="

# Compiled regex patterns for pulling fields out of one <div class="item"> block.
# NOTE(review): the original pattern literals were garbled by an HTML-stripping
# paste; these are reconstructed from how getdata() uses them — verify against
# the actual page markup.
findlink = re.compile(r'<a href="(.*?)">')  # detail-page link
findimg = re.compile(r'<img.*src="(.*?)"', re.S)  # poster image URL; re.S lets '.' span newlines
findTitle = re.compile(r'<span class="title">(.*)</span>')  # title (may match twice: zh + foreign)
findrating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # score
findjudege = re.compile(r'<span>(\d*)人评价</span>')  # vote count
findinq = re.compile(r'<span class="inq">(.*)</span>')  # one-line quote
findbd = re.compile(r'<p class="">(.*?)</p>', re.S)  # director/year/genre blob

savepath = "豆瓣电影top250.xls"  # output workbook path
datalist = []  # accumulated rows; filled by getdata(), consumed by saveData()
def main():
    """Entry point: scrape every list page, then write the rows to the workbook."""
    datalist = getdata(baseurl)
    saveData(datalist, savepath)
def getdata(baseurl):
    """Fetch all 10 pages of the Top-250 list and parse each movie item.

    Appends one 8-element list per movie to the module-level ``datalist``:
    [link, image, Chinese title, other title, rating, vote count, quote, misc]
    and returns ``datalist``.
    """
    for page in range(0, 10):  # 10 pages x 25 movies = 250 entries
        url = baseurl + str(page * 25)
        html = askurl(url)
        # Parse one list page and extract every movie item.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []
            item = str(item)  # regexes below operate on raw HTML text
            link = re.findall(findlink, item)[0]
            data.append(link)
            img = re.findall(findimg, item)[0]
            data.append(img)
            title = re.findall(findTitle, item)
            if len(title) == 2:
                # Chinese title plus foreign title (leading '/' separator removed).
                data.append(title[0])
                data.append(title[1].replace("/", ""))
            else:
                # Only one title: pad with a blank so every row stays 8 columns wide.
                data.append(title[0])
                data.append(' ')
            rating = re.findall(findrating, item)[0]
            data.append(rating)
            judge = re.findall(findjudege, item)[0]
            data.append(judge)
            inq = re.findall(findinq, item)
            if len(inq) != 0:
                data.append(inq[0].replace("。", ""))  # drop trailing full stop
            else:
                data.append(" ")  # some movies have no one-line quote
            bd = re.findall(findbd, item)[0]
            # NOTE(review): this substitution was garbled in the paste; reconstructed
            # as stripping <br/> tags plus surrounding whitespace — confirm.
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub("/", " ", bd)
            data.append(bd.strip())
            datalist.append(data)
    return datalist
def askurl(url):
    """GET ``url`` with a browser User-Agent and return the page HTML.

    Returns "" when the request fails; the HTTP code / reason (when present
    on the error) is printed rather than raised, so callers always get a str.
    """
    head = {
        # Spoof a desktop Chrome UA so the site serves the normal page.
        "User-Agent": "Mozilla/5.0(Windows NT 10.0;WOW64) AppleWebKit/537.36(KHTML,likeGecko)Chrome/78.0.3904.108Safari/537.36"
    }
    html = ""
    try:
        request = urllib.request.Request(url, headers=head)
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # Best-effort diagnostics; HTTPError has .code, plain URLError has .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook at ``savepath``.

    Row 0 is the header; each entry of ``datalist`` (8 columns) becomes one
    subsequent row.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('豆瓣电影top250', cell_overwrite_ok=True)
    col = ("电影链接","1","1","2","1","2","1","2")
    for i in range(8):
        sheet.write(0, i, col[i])
    # Iterate the actual rows instead of a hard-coded range(250): the original
    # raised IndexError whenever fewer than 250 entries were scraped.
    for i, data in enumerate(datalist):
        print("第%d条" % i)
        for j in range(8):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not on import.
    main()