python爬虫豆瓣top250电影,评分,评论等

import requests
from bs4 import BeautifulSoup
def get_movies():
    headers={'User-Agent':'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1','Host':'movie.douban.com'}
    movie_list=[]
    for i in range(0,10):
        link='https://movie.douban.com/top250?start='+str(i*25)
        r=requests.get(link,headers=headers,timeout=10)
        print(str(i+1),"页响应状态码:",r.status_code)
        soup=BeautifulSoup(r.text,"lxml")
        div_list=soup.find_all('div',class_='hd' or 'bd')
        for each in div_list:
            movie=each.a.span.text.strip()
            movie_list.append(movie)
    return movie_list
movies=get_movies()
f = open('top250.txt', 'w') #清空文件内容再写
for i in range(0,len(movies)):
    f.writelines([movies[i],'\n'])
    #print(movies[i])
f.close()
f = open('top250.txt','r')
#print(f.readlines())
for eachline in f:
    print(eachline.strip())
f.close()

python爬虫豆瓣top250电影,评分,评论等_第1张图片

其中r.text内容大致是这样的:

  1. 1 肖申克的救赎

    导演: 弗兰克·德拉邦特 Frank Darabont   主演: 蒂姆·罗宾斯 Tim Robbins /...
    1994 / 美国 / 犯罪 剧情

    9.7 1612528人评价

    希望让人自由。

进阶一点

from selenium import webdriver
import time
import urllib.request
import re
from bs4 import BeautifulSoup
import codecs
page = urllib.request.urlopen("https://movie.douban.com/top250")
contents = page.read()
soup = BeautifulSoup(contents,"html.parser")
# driver = webdriver.Chrome("chromedriver.exe")  # chromedriver所在路径
# driver.get(r"https://movie.douban.com/top250")
mov_list=soup.find_all(attrs={"class":"item"})
for each in mov_list:
    movname=each.find(attrs={"class":"title"}).get_text()
    print('电影名:',movname)
    rate=each.find(attrs={"class":"rating_num"}).get_text()
    print('评分:',rate)
    comment=each.find(attrs={"class":"quote"}).get_text()
    print('评论:',comment)

结果如下:

python爬虫豆瓣top250电影,评分,评论等_第2张图片

参考链接:https://blog.csdn.net/u010885059/article/details/53939659 

或者这样使用BeautifulSoup写 

import requests
from bs4 import BeautifulSoup
f = open('top250.txt', 'w+',encoding='utf-8') #追加方式写文件
for i in range(0,10):
    link="https://movie.douban.com/top250?start="+str(i*25)
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
    r=requests.get(link,headers=headers)
    soup=BeautifulSoup(r.text,"lxml")
    #mov_list=soup.find_all(attrs={"class":"item"})
    mov_list=soup.find_all(class_="item")
    for each in mov_list:
        number=each.find(attrs={"class":"pic"}).em.text.strip()
        print('排名:',number)
        movname=each.find(attrs={"class":"title"}).get_text().strip()
        print('电影名:',movname)
        #.p.text的含义是:提取

元素中的文字,strip()的功能是把字符串左右的空格去掉 director=each.find(attrs={"class":"bd"}).p.text.strip().replace(" ","").strip().replace("\n","").strip().replace("...","").strip().replace("/","") print(director) rate=each.find(attrs={"class":"rating_num"}).get_text() print('评分:',rate) comment=each.find(attrs={"class":"quote"}).get_text().strip() print('评论:',comment) f.writelines([number,'\n',movname,'\n',director,'\n',rate,'\n',comment,'\n']) f.close()

程序运行结果:

python爬虫豆瓣top250电影,评分,评论等_第3张图片

 

你可能感兴趣的:(python爬虫学习)