Scraping Douban Movie Top 250 with Python

1. Fetch a single page

def askURL(url):
    # User-Agent header: tells the Douban server what kind of browser we are
    # (essentially, what kind of content we can accept)
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }

    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html
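
For example, a quick sanity check (the imports listed in section 4 must already be in place):

html = askURL("https://movie.douban.com/top250?start=0")
print(len(html))   # length of the page source; 0 means the request failed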

2. Parse out the fields we need

def getData(baseurl, rule):
    datalist = []
    for i in range(10):   # 10 pages of 25 movies each
        url = baseurl + str(i * 25)
        html = askURL(url)

        # Parse the page and extract the field from each movie entry
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            item = str(item)
            data_rule = re.findall(rule, item)
            if len(data_rule) > 0:
                find_rule = data_rule[0]
                find_rule = re.sub(r'<.*?>', ' ', find_rule)  # strip leftover HTML tags
                find_rule = re.sub(r'\xa0', '', find_rule)    # drop non-breaking spaces
            else:
                find_rule = "N/A"
            datalist.append(find_rule.strip())   # trim surrounding whitespace
        print("Finished page {0} for pattern {1}".format(i + 1, rule.pattern))

    return datalist
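
As a usage sketch, here is how one field is pulled out on its own, using the title pattern from section 4 (note this fetches all ten pages):

title_rule = re.compile(r'<span class="title">(.*?)</span>', re.S)
titles = getData("https://movie.douban.com/top250?start=", title_rule)
print(titles[:3])   # the first three Chinese titles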

3. Save the data to an .xls file

def saveData(data, savepath):
    book = xlwt.Workbook(encoding='utf-8')   # create the workbook
    sheet = book.add_sheet('Douban Movie Top250', cell_overwrite_ok=True)  # True: allow overwriting cells
    col = ('Link', 'Poster URL', 'Chinese title', 'Rating', 'Rating count', 'Summary')
    for i in range(len(col)):
        print("Writing column %d" % i)
        sheet.write(0, i, col[i])              # header row
        for j in range(len(data[i])):
            sheet.write(j + 1, i, data[i][j])  # data[i] is the full column for field i
    book.save(savepath)
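
A minimal sketch of calling saveData on its own. The six single-element lists below are placeholder values, one list per column, in the same order as the headers:

sample = [
    ["https://movie.douban.com/subject/1292052/"],  # link (placeholder)
    ["http://example.com/poster.jpg"],              # poster URL (placeholder)
    ["肖申克的救赎"],                                 # Chinese title
    ["9.7"],                                        # rating
    ["2000000"],                                    # rating count (placeholder)
    ["希望让人自由。"],                               # summary (placeholder)
]
saveData(sample, "test.xls")   # writes the header row plus one data row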

4. Complete code

# -*- coding: utf-8 -*-
# @Time: 2021/2/14 13:42
# @Author: woniu
# @File: spider.py
# @Software: PyCharm

import re                      # regular expressions for text matching
import urllib.request          # fetch web pages
import urllib.error
from bs4 import BeautifulSoup  # parse the HTML
import xlwt                    # write Excel (.xls) files
import sqlite3                 # SQLite support (not used in this script)

# Main routine
def main():
    baseurl = "https://movie.douban.com/top250?start="
    data = []
    # One regex per field; each pattern captures the value out of an item's HTML
    find_rule = {
        "link": r'<a href="(.*?)">',
        "img": r'<img.*src="(.*?)"',
        "title": r'<span class="title">(.*?)</span>',
        "score": r'<span class="rating_num" property="v:average">(.*?)</span>',
        "judge": r'<span>(\d*?)人评价</span>',
        "inq": r'<span class="inq">(.*?)</span>'
    }
    for key in find_rule:
        print("Fetching every movie's {0}".format(key))
        result = re.compile(find_rule[key], re.S)
        data_result = getData(baseurl, result)
        data.append(data_result)
    savepath = "豆瓣电影Top250.xls"
    saveData(data, savepath)

# Scrape and parse the pages
def getData(baseurl, rule):
    datalist = []
    for i in range(10):   # 10 pages of 25 movies each
        url = baseurl + str(i * 25)
        html = askURL(url)

        # Parse the page and extract the field from each movie entry
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            item = str(item)
            data_rule = re.findall(rule, item)
            if len(data_rule) > 0:
                find_rule = data_rule[0]
                find_rule = re.sub(r'<.*?>', ' ', find_rule)  # strip leftover HTML tags
                find_rule = re.sub(r'\xa0', '', find_rule)    # drop non-breaking spaces
            else:
                find_rule = "N/A"
            datalist.append(find_rule.strip())   # trim surrounding whitespace
        print("Finished page {0} for pattern {1}".format(i + 1, rule.pattern))

    return datalist

# Fetch the HTML of a given URL
def askURL(url):
    # User-Agent header: tells the Douban server what kind of browser we are
    # (essentially, what kind of content we can accept)
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }

    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html

# Save the data to an .xls file
def saveData(data, savepath):
    book = xlwt.Workbook(encoding='utf-8')   # create the workbook
    sheet = book.add_sheet('Douban Movie Top250', cell_overwrite_ok=True)  # True: allow overwriting cells
    col = ('Link', 'Poster URL', 'Chinese title', 'Rating', 'Rating count', 'Summary')
    for i in range(len(col)):
        print("Writing column %d" % i)
        sheet.write(0, i, col[i])              # header row
        for j in range(len(data[i])):
            sheet.write(j + 1, i, data[i][j])  # data[i] is the full column for field i
    book.save(savepath)

if __name__ == '__main__':
    main()
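
To run the script, install the third-party packages first (pip install beautifulsoup4 xlwt), then execute it with python spider.py; it prints its progress per field and per page and writes 豆瓣电影Top250.xls next to the script.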

5. Execution result

[Screenshot: execution result]
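
To spot-check the result without opening Excel, the .xls file can be read back with xlrd (a sketch; xlrd is not used by the script itself and must be installed separately):

import xlrd

book = xlrd.open_workbook("豆瓣电影Top250.xls")
sheet = book.sheet_by_index(0)
print(sheet.nrows)           # expect 251: one header row plus 250 movies
print(sheet.row_values(1))   # the six fields of the first movie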
