import requests
from bs4 import BeautifulSoup
def get_movies():
headers={'User-Agent':'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1','Host':'movie.douban.com'}
movie_list=[]
for i in range(0,10):
link='https://movie.douban.com/top250?start='+str(i*25)
r=requests.get(link,headers=headers,timeout=10)
print(str(i+1),"页响应状态码:",r.status_code)
soup=BeautifulSoup(r.text,"lxml")
div_list=soup.find_all('div',class_='hd' or 'bd')
for each in div_list:
movie=each.a.span.text.strip()
movie_list.append(movie)
return movie_list
movies=get_movies()
f = open('top250.txt', 'w') #清空文件内容再写
for i in range(0,len(movies)):
f.writelines([movies[i],'\n'])
#print(movies[i])
f.close()
f = open('top250.txt','r')
#print(f.readlines())
for eachline in f:
print(eachline.strip())
f.close()
其中r.text内容大致是这样的:
-
导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...
1994 / 美国 / 犯罪 剧情
1612528人评价
希望让人自由。
进阶一点
from selenium import webdriver
import time
import urllib.request
import re
from bs4 import BeautifulSoup
import codecs
page = urllib.request.urlopen("https://movie.douban.com/top250")
contents = page.read()
soup = BeautifulSoup(contents,"html.parser")
# driver = webdriver.Chrome("chromedriver.exe") # chromedriver所在路径
# driver.get(r"https://movie.douban.com/top250")
mov_list=soup.find_all(attrs={"class":"item"})
for each in mov_list:
movname=each.find(attrs={"class":"title"}).get_text()
print('电影名:',movname)
rate=each.find(attrs={"class":"rating_num"}).get_text()
print('评分:',rate)
comment=each.find(attrs={"class":"quote"}).get_text()
print('评论:',comment)
结果如下:
参考链接:https://blog.csdn.net/u010885059/article/details/53939659
或者这样使用BeautifulSoup写
import requests
from bs4 import BeautifulSoup
f = open('top250.txt', 'w+',encoding='utf-8') #追加方式写文件
for i in range(0,10):
link="https://movie.douban.com/top250?start="+str(i*25)
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
r=requests.get(link,headers=headers)
soup=BeautifulSoup(r.text,"lxml")
#mov_list=soup.find_all(attrs={"class":"item"})
mov_list=soup.find_all(class_="item")
for each in mov_list:
number=each.find(attrs={"class":"pic"}).em.text.strip()
print('排名:',number)
movname=each.find(attrs={"class":"title"}).get_text().strip()
print('电影名:',movname)
#.p.text的含义是:提取元素中的文字,strip()的功能是把字符串左右的空格去掉
director=each.find(attrs={"class":"bd"}).p.text.strip().replace(" ","").strip().replace("\n","").strip().replace("...","").strip().replace("/","")
print(director)
rate=each.find(attrs={"class":"rating_num"}).get_text()
print('评分:',rate)
comment=each.find(attrs={"class":"quote"}).get_text().strip()
print('评论:',comment)
f.writelines([number,'\n',movname,'\n',director,'\n',rate,'\n',comment,'\n'])
f.close()
程序运行结果: