python+beautifulsoup爬取豆瓣电影TOP250

import urllib.request
from bs4 import BeautifulSoup
import re
# 豆瓣电影top250


def __getHtml():
	"""Download every result page of the Douban Top 250 movie list.

	The list is paginated 25 movies per page, so ten requests are made with
	start=0, 25, ..., 225.

	Returns:
		list[str]: the decoded HTML of each page, in rank order.

	Raises:
		urllib.error.URLError: a page could not be fetched (HTTPError is a
			subclass and is raised for non-2xx responses).
		UnicodeDecodeError: a response body was not valid UTF-8.
	"""
	# The original draft (left commented out) intended to send a browser
	# User-Agent; urllib's default one is commonly refused by sites.
	headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
		              '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
	}
	data = []
	pageNum = 1
	for pageSize in range(0, 250, 25):
		url = "https://movie.douban.com/top250?start=" + str(pageSize) + "&filter=" + str(pageNum)
		request = urllib.request.Request(url, headers=headers)
		# The context manager closes the connection even if read/decode fails;
		# the timeout keeps a stalled server from hanging the script forever.
		with urllib.request.urlopen(request, timeout=10) as response:
			data.append(response.read().decode("utf-8"))
		pageNum += 1
		# Progress output: next offset and next page number, as before.
		print(pageSize + 25, pageNum)
	return data
def __getData(html):
	"""Pull title, rating and rank for each movie out of one list page.

	Args:
		html: HTML text of a single Top 250 page.

	Returns:
		dict with three parallel lists, one entry per <li> of the
		<ol class="grid_view"> element:
			'title'      - text of the first <span class="title">
			'rating_num' - text of <span class="rating_num"> inside <div class="star">
			'range_num'  - text of the <em> inside <div class="pic">

	Raises:
		AttributeError: an expected element is missing from the page
			(BeautifulSoup's find() returned None).
	"""
	# Parse the page with the standard-library backed parser.
	document = BeautifulSoup(html, "html.parser")
	movie_list = document.find("ol", attrs={'class': 'grid_view'})

	names, scores, ranks = [], [], []
	for entry in movie_list.find_all("li"):
		name_tag = entry.find("span", class_="title")
		names.append(name_tag.text)

		star_box = entry.find("div", class_='star')
		scores.append(star_box.find("span", class_='rating_num').text)

		poster_box = entry.find("div", class_='pic')
		ranks.append(poster_box.find("em").text)

	return {'title': names, 'rating_num': scores, 'range_num': ranks}
def __getMovies(data, path='F://1.html'):
	"""Write the scraped movies to an HTML file as a three-column table.

	NOTE(review): the HTML tags inside the original string literals were
	lost (every literal was empty, so the "%" formatting raised TypeError and
	the tail of the function did not parse). The markup below is a
	reconstruction built around the surviving header text; confirm it against
	the intended page layout. The original wrote the rating before the rank;
	cells are emitted here in header order so each value sits under its label.

	Args:
		data: list of per-page dicts as returned by __getData, each holding
			parallel lists under 'title', 'rating_num' and 'range_num'.
		path: output file; defaults to the original hard-coded location.

	Raises:
		OSError: the output file cannot be opened or written.
		KeyError: a page dict lacks one of the expected keys.
	"""
	# Function-scope import keeps this block self-contained.
	from html import escape

	# (header label, key in the page dict) for each table column.
	columns = (("电影", "title"), ("排名", "range_num"), ("评分", "rating_num"))

	lines = [
		"<!DOCTYPE html>",
		"<html>",
		"<head>",
		'<meta charset="utf-8">',
		"<title>豆瓣电影TOP250</title>",
		"</head>",
		"<body>",
		'<table border="1">',
		"<thead>",
		"<tr>" + "".join("<th>%s</th>" % label for label, _ in columns) + "</tr>",
		"</thead>",
		"<tbody>",
	]
	# Iterate the argument (the original read the global `datas`) and zip the
	# parallel lists so a page with fewer than 25 entries cannot raise IndexError.
	for page in data:
		for cells in zip(*(page[key] for _, key in columns)):
			# Scraped text is untrusted: escape it before embedding in markup.
			lines.append(
				"<tr>" + "".join("<td>%s</td>" % escape(str(cell)) for cell in cells) + "</tr>"
			)
	lines.extend(["</tbody>", "</table>", "</body>", "</html>"])

	# Explicit UTF-8 matches the <meta charset> above regardless of platform default.
	with open(path, 'w', encoding='utf-8') as f:
		f.write("\n".join(lines))
		f.write("\n")


if __name__ == '__main__':
	# Fetch all pages, parse each one, then render the combined table.
	datas = [__getData(page) for page in __getHtml()]
	__getMovies(datas)


生成的html:

python+beautifulsoup爬取豆瓣电影TOP250_第1张图片


你可能感兴趣的:(Python)