1.获取网页单页内容
def askURL(url):
    """Fetch a single page and return its body decoded as UTF-8.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The page HTML as a string, or "" when the request fails
        (the HTTP status code / failure reason is printed in that case).
    """
    head = {
        # Spoof a desktop browser UA; douban rejects urllib's default UA.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager closes the connection even if read()/decode() fails;
        # the original never closed the response object (socket leak).
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries .code; a plain URLError only has .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
2.解析并获取所需内容
def getData(baseurl,rule):
    """Scrape all 25 pages of the douban Top250 list and extract one field per movie.

    Args:
        baseurl: URL prefix; the page offset (0, 25, ..., 600) is appended.
        rule: Compiled regex with one capture group, applied to each movie item's HTML.

    Returns:
        A list of 250 extracted strings ("无" where the regex did not match).
    """
    datalist = []
    for i in range(25):  # 25 pages x 25 movies = 250 entries
        url = baseurl + str(i*25)  # douban paginates via the ?start= offset
        html = askURL(url)
        soup = BeautifulSoup(html, "html.parser")
        # Each movie entry sits in a <div class="item">.
        for item in soup.find_all("div", class_="item"):
            item = str(item)
            data_rule = re.findall(rule, item)
            if len(data_rule)>0:
                find_rule = data_rule[0]
                # Strip residual HTML tags and non-breaking spaces from the match.
                find_rule = re.sub(r'<.*?>',' ',find_rule)
                find_rule = re.sub(r'\xa0','',find_rule)
            else:
                find_rule = "无"  # placeholder when the field is absent
            datalist.append(find_rule.strip())
        print("抓取第{0}页{1}数据完成!".format(i+1,str(rule)))
    return datalist
3.保存为xls
def saveData(data,savepath):
    """Write the scraped columns into an .xls workbook.

    Args:
        data: List of six column lists (one list of values per field),
              ordered to match the header tuple below.
        savepath: Destination .xls file path.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('豆瓣电影top250',cell_overwrite_ok=True)
    # Column headers; order must match the order fields were collected in main().
    col = ('电影超链接','电影图片','影片中文名','评分','评价人数','概况')
    for i in range(0,len(col)):
        print("写%d列" % i)
        sheet.write(0,i,col[i])  # row 0 holds the header
        # Data rows start at row 1, one column list per header.
        for j in range(0,len(data[i])):
            sheet.write(j+1,i,data[i][j])
    book.save(savepath)
4.全部代码
import re
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import xlwt
import sqlite3
def main():
    """Scrape the douban movie Top250 list and save all fields to an .xls file."""
    baseurl = "https://movie.douban.com/top250?start="
    data = []
    # Extraction regexes, one capture group each, applied to the HTML of every
    # <div class="item">.  NOTE(review): several patterns in the source had
    # their HTML tags stripped by the page renderer (e.g. "score" was just
    # r'(.*?)', which matches the empty string everywhere); restored below
    # from douban's Top250 page markup — verify against a live page.
    find_rule = {
        "link": r'<a href="(.*?)">',
        "img": r'<img.*src="(.*?)"',
        "title": r'<span class="title">(.*?)</span>',
        "score": r'<span class="rating_num" property="v:average">(.*?)</span>',
        "judge": r'<span>(\d*?)人评价</span>',
        "inq": r'<span class="inq">(.*?)</span>',
    }
    for key in find_rule:
        print("开始获取电影的{0}".format(str(key)))
        # re.S lets '.' span newlines inside a movie block.
        result = re.compile(find_rule[key], re.S)
        data_result = getData(baseurl, result)  # one column of 250 values
        data.append(data_result)
    savepath = "豆瓣电影Top250.xls"
    saveData(data, savepath)
def getData(baseurl, rule):
    """Walk all 25 result pages and pull one field per movie via *rule*.

    Args:
        baseurl: URL prefix; the ?start= page offset is appended per page.
        rule: Compiled regex with a single capture group.

    Returns:
        A list of 250 extracted strings, "无" where the regex did not match.
    """
    datalist = []
    for page in range(25):
        # Pages are addressed by item offset: 0, 25, ..., 600.
        page_html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(page_html, "html.parser")
        for movie_div in soup.find_all("div", class_="item"):
            matches = re.findall(rule, str(movie_div))
            if matches:
                # Drop leftover markup and non-breaking spaces from the capture.
                cleaned = re.sub(r'<.*?>', ' ', matches[0])
                cleaned = re.sub(r'\xa0', '', cleaned)
            else:
                cleaned = "无"
            datalist.append(cleaned.strip())
        print("抓取第{0}页{1}数据完成!".format(page + 1, str(rule)))
    return datalist
def askURL(url):
    """Fetch a single page and return its body decoded as UTF-8.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The page HTML as a string, or "" when the request fails
        (the HTTP status code / failure reason is printed in that case).
    """
    head = {
        # Spoof a desktop browser UA; douban rejects urllib's default UA.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager closes the connection even if read()/decode() fails;
        # the original never closed the response object (socket leak).
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries .code; a plain URLError only has .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def saveData(data, savepath):
    """Persist the scraped columns to an .xls workbook at *savepath*.

    Args:
        data: Six column lists (one per field), same order as the headers.
        savepath: Output .xls file path.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('豆瓣电影top250', cell_overwrite_ok=True)
    headers = ('电影超链接', '电影图片', '影片中文名', '评分', '评价人数', '概况')
    for col_idx, header in enumerate(headers):
        print("写%d列" % col_idx)
        sheet.write(0, col_idx, header)  # row 0 holds the header
        # Each subsequent row holds one movie's value for this column.
        for row_idx, value in enumerate(data[col_idx]):
            sheet.write(row_idx + 1, col_idx, value)
    book.save(savepath)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
5.执行效果