豆瓣电影Top250爬取

豆瓣电影Top250爬取

爬取时间:2020年8月6日
开发工具:PyCharm
技术路线:requests-bs4-re-xlwt
Url:https://movie.douban.com/top250
作者:YRH

如需转载,请标明出处

# -*- coding: utf-8 -*-
# Author : YRH
# Date : 
# Project : 
# Tool : PyCharm

import requests
from bs4 import BeautifulSoup
import re
import xlwt

# Request headers passed to requests.get() by getHtml(); the only entry is a
# desktop Chrome user-agent string.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/84.0.4147.105 Safari/537.36"
    ),
}


# Download a page.
def getHtml(Url, timeout=10):
    """Fetch ``Url`` and return the response body as text.

    Args:
        Url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The decoded page text with the characters ``\\xee``, ``\\xf4`` and
        ``\\xfb`` removed, or ``None`` when the request fails (connection
        error, timeout, or a non-2xx status).
    """
    try:
        rep = requests.get(Url, headers=headers, timeout=timeout)
        rep.raise_for_status()
    except requests.RequestException:
        # Best-effort: report the failure and let the caller skip this page.
        print("提取网页失败")
        return None

    # Decode with the encoding detected from the body rather than the one
    # announced in the response headers.
    rep.encoding = rep.apparent_encoding
    # Per the original author's note, a few characters in the page could not
    # be encoded later on, so they are stripped before the text is returned.
    return rep.text.replace('\xee', '').replace('\xf4', '').replace('\xfb', '')

# Parse the downloaded page, save the rows, and drive the whole scrape.

# Placeholder stored in a row when a field cannot be extracted.
_BLANK = " "

# "导演: <name> ..." -> first whitespace-delimited token after the label.
_DIRECTOR_RE = re.compile(r'导演: ([^\s<]+)')
# "主演: <names> /..." -> text after the label up to a slash, tag or newline.
_ACTOR_RE = re.compile(r'主演: ([^/<\n]+)')
# First run of four digits in the paragraph, used as the release year.
_YEAR_RE = re.compile(r'(\d{4})')
# "<year> / <place> / ..." -> the field that directly follows the year.
_SITE_RE = re.compile(r'\d{4}\s*/\s*(.*?)\s*/')
# Everything between the line break and the end of the info paragraph.
_TAIL_RE = re.compile(r'<br\s*/?>(.*?)</p>', re.S)


def _capture(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or a blank."""
    found = pattern.search(text)
    if found is None:
        return _BLANK
    return found.group(1).strip() or _BLANK


def _tag_string(tag):
    """Return the single string inside ``tag`` as ``str``, or a blank."""
    if tag is None or tag.string is None:
        return _BLANK
    return str(tag.string)


def _info_markup(li):
    """Return the markup of the item's info paragraph with nbsp as spaces."""
    para = li.find("p", class_="")
    if para is None:
        # Fall back to the first paragraph of the item.
        # NOTE(review): assumes the first <p> is the director/cast paragraph.
        para = li.find("p")
    if para is None:
        return ""
    # Non-breaking spaces separate the fields on the page; turn them into
    # plain spaces so the patterns above only deal with one kind of space.
    return str(para).replace("\xa0", " ").replace("&nbsp;", " ")


def _plot(text):
    """Return the last slash-separated field after the line break."""
    found = _TAIL_RE.search(text)
    if found is None:
        return _BLANK
    compact = "".join(found.group(1).split())  # drop all whitespace
    # The literal label "剧情" is removed from the field, as the original did.
    return compact.split("/")[-1].replace("剧情", "")


def parser(Html, info):
    """Extract one row per movie from a list page and append it to ``info``.

    Args:
        Html: Page markup as returned by ``getHtml``; ``None`` or an empty
            string (a failed download) is skipped without raising.
        info: List that receives one
            ``[title, director, actor, time, site, plot, grade, comment]``
            row per ``<li>`` of the ``<ol class="grid_view">`` list. It is
            modified in place; fields that cannot be found are ``" "``.

    Returns:
        None.
    """
    if not Html:
        return

    # The page is HTML, so use the HTML tree builder rather than the XML one.
    soup = BeautifulSoup(Html, "html.parser")
    ol = soup.find("ol", class_="grid_view")
    if ol is None:
        print("未找到电影列表,已跳过该页")
        return

    # Only <li> elements inside the list itself are movie entries.
    for li in ol.find_all("li"):
        # Movie title: the first <span class="title">.
        title = _tag_string(li.find("span", class_="title"))

        # Director, lead actor, year, place and plot all come from the same
        # paragraph and are pulled out with regular expressions.
        text = _info_markup(li)
        director = _capture(_DIRECTOR_RE, text)
        actor = _capture(_ACTOR_RE, text)
        time = _capture(_YEAR_RE, text)
        site = _capture(_SITE_RE, text)
        plot = _plot(text)

        # Rating.
        grade = _tag_string(li.find("span", class_="rating_num"))

        # Number of ratings: the fourth <span> inside <div class="star">.
        star = li.find("div", class_="star")
        spans = star.find_all("span") if star is not None else []
        comment = _tag_string(spans[3]) if len(spans) > 3 else _BLANK

        info.append([title, director, actor, time, site, plot, grade, comment])


def save(data):
    """Write the rows in ``data`` to ``豆瓣电影排名top250.xls``.

    Args:
        data: Iterable of rows as built by ``parser``; each row is written
            to its own spreadsheet row beneath a header row.

    Returns:
        None.
    """
    print("save.....")
    workbook = xlwt.Workbook(encoding="utf-8")  # create the workbook
    movieBook = workbook.add_sheet("sheet1")  # create the worksheet

    # Header row; write() takes (row, column, value).
    head = ["电影名", "导演", "主演", "出版时间", "地点", "剧情", "评分", "评论人数"]
    for col, label in enumerate(head):
        movieBook.write(0, col, label)

    # Data rows start right below the header.
    for row, record in enumerate(data, start=1):
        print("成功抓取第" + str(row) + "部")
        for col, value in enumerate(record):
            movieBook.write(row, col, value)

    workbook.save("豆瓣电影排名top250.xls")  # save the spreadsheet


if __name__ == '__main__':
    info = []
    # Ten pages of 25 entries each: start = 0, 25, ..., 225.
    for start in range(0, 250, 25):
        url = "https://movie.douban.com/top250" + "?start=" + str(start)
        html = getHtml(url)
        parser(html, info)
    save(info)

你可能感兴趣的:(python爬虫,python,数据挖掘)