# 在实例三的基础上进行加强：当单个网页能够抓取好后，抓取全部网页也就非常容易了。
from bs4 import BeautifulSoup
from urllib.request import urlopen
def get_one_page(x):
    """Fetch page *x* (0-based) of the Maoyan Top-100 board and return its HTML.

    Each board page lists 10 movies, so page ``x`` maps to query offset ``x * 10``.

    :param x: page index, 0..9 for the full board
    :return: decoded HTML text of the page
    """
    # String formatting: the {} placeholder is filled with the page offset.
    # (Equivalent: 'https://maoyan.com/board/4?offset=%d' % (x * 10))
    url = 'https://maoyan.com/board/4?offset={}'.format(x * 10)
    # Use a context manager so the HTTP connection is always closed,
    # instead of leaking the response object as the original did.
    with urlopen(url) as response:
        return response.read().decode()
def get_film(html):
    """Parse one board page's HTML and return a list of movie dicts.

    Each dict carries the keys ``name``, ``star``, ``releasetime`` and
    ``score`` (the rating's integer and fraction parts joined together,
    since the page renders them as two separate elements).

    :param html: HTML text of one Maoyan board page
    :return: list of dicts, one per movie on the page
    """
    soup = BeautifulSoup(html, 'html.parser')
    # zip walks the five per-movie column lists in lockstep; if any selector
    # matches nothing (e.g. page layout changed), the result is simply empty.
    rows = zip(
        soup.select('.name'),
        soup.select('.star'),
        soup.select('.releasetime'),
        soup.select('.integer'),
        soup.select('.fraction'),
    )
    return [
        {
            'name': name.get_text(),
            'star': star.get_text().strip(),
            'releasetime': releasetime.get_text(),
            # 评分合成: the score's integer and fractional parts are
            # separate spans on the page, so concatenate them.
            'score': integer.get_text() + fraction.get_text(),
        }
        for name, star, releasetime, integer, fraction in rows
    ]
if __name__ == '__main__':
    # Crawl every page of the board (10 pages x 10 movies) and
    # accumulate the parsed entries into a single flat list.
    all_movies = []
    for page in range(10):
        page_html = get_one_page(page)
        all_movies.extend(get_film(page_html))
    # Dump everything that was scraped.
    print(all_movies)
# 将每次提取的电影信息放入一个字典，再存入总列表中，最终该列表就包含了所有的电影信息。