Scraping the Douban Movie Top250 with Python

  1. First, fetch the pages and save the HTML files locally
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Time    : 2019/6/6 21:00
    # @Author  : LLY
    # @File    : TopSoup.py
    
    import requests
    
    # Note: Accept-Encoding is deliberately changed to utf-8 here; with the browser's original
    # value the server may return compressed binary content that you would have to decode yourself.
    strHeaders='''Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
    Accept-Encoding: utf-8
    Accept-Language: zh-CN,zh;q=0.9
    Cache-Control: max-age=0
    Connection: keep-alive
    Cookie: bid=uDL96vW6pcI; ap_v=0,6.0; __utma=30149280.513669709.1559825996.1559825996.1559825996.1; __utmb=30149280.0.10.1559825996; __utmc=30149280; __utmz=30149280.1559825996.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.309452895.1559825996.1559825996.1559825996.1; __utmb=223695111.0.10.1559825996; __utmc=223695111; __utmz=223695111.1559825996.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1559825996%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DW7TmugqrBmn98I5Q3R9dmUPxuxaBbZf_A7vfb4KDpChv9WsqTOQQecCONQz6F-dG%26wd%3D%26eqid%3Dfc7c96e90016896a000000055cf90e44%22%5D; _pk_ses.100001.4cf6=*; _pk_id.100001.4cf6=c71ed26a379b1502.1559825996.1.1559826028.1559825996.; __yadk_uid=5BUDZmK5DfLGphSLzHQ6wJQVbECiPFH6
    Host: movie.douban.com
    Referer: https://movie.douban.com/top250?start=250&filter=
    Upgrade-Insecure-Requests: 1
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'''
    # convert the copied header block into a dict that requests can use
    endHeaders=dict(line.split(': ',1) for line in strHeaders.split('\n'))
    for index in range(0,250,25):  # 10 pages: start = 0, 25, ..., 225
        startUrl = 'https://movie.douban.com/top250?start=%d&filter='%index
        req=requests.get(url=startUrl,headers=endHeaders)
        with open(r'F:\ProgramWork\PyCharmProjects\pandasTest\Top250\%d.html' % index, 'w', encoding='utf8') as f:
            f.write(req.text)
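
  The dict(...) one-liner above just splits each 'Name: value' pair copied from the browser's developer tools on the first ': '. A minimal sketch of what it produces, using two illustrative headers:
    sample = 'Host: movie.douban.com\nReferer: https://movie.douban.com/top250?start=0&filter='
    # split on the first ': ' only, so the URL inside the Referer value stays intact
    headers = dict(line.split(': ', 1) for line in sample.split('\n'))
    print(headers)
    # {'Host': 'movie.douban.com', 'Referer': 'https://movie.douban.com/top250?start=0&filter='}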
    
  2. Read the HTML files we just saved and parse them; I use the bs4 library here.
  3. Personally, though, I find XPath more convenient to work with (see the XPath sketch at the end of this post).
  4. The parsing script:
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Time    : 2019/6/7 9:25
    # @Author  : LLY
    # @File    : paserData.py
    import re
    from bs4 import BeautifulSoup
    import os
    import xlwt
    
    path1=r'F:\ProgramWork\PyCharmProjects\pandasTest\Top250'  # folder where the pages from step 1 were saved
    nameFile='Top250.xls'
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('榜单')  # sheet named '榜单' ("ranking")
    headSheet = ['电影名称', '评分', '别名①', '别名②', '演员', '电影类型', '电影海报', '详情地址']  # name, score, alias 1, alias 2, cast, genre, poster URL, detail URL
    index1=1
    for column in range(0, 8):
        ws.write(r=0, c=column, label=headSheet[column], style=xlwt.easyxf('font: bold on'))
    for path2 in os.listdir(path1):
        filePath=os.path.join(path1,path2)
        soup=BeautifulSoup(open(filePath,encoding='utf-8'),'lxml')
        # select() takes a CSS selector and returns a list of bs4.element.Tag objects
        for index in range(25):
            movieData=[]
            soupSub=soup.select('div#wrapper>div#content>div>div.article>ol>li:nth-child({})>div.item'.format(index+1))[0]
            poster=soupSub.find('div',class_='pic').find('img').attrs['src']
            player=soupSub.select('div.info>div.hd')[0].find('a').attrs['href']
            mName=soupSub.select('div.info>div.hd>a>span')
            movieName,secondName,threeName=None,None,None
            # strip spaces, non-breaking spaces and '/' separators from the title spans
            reg1=re.compile('[ \xa0/]+')
            names=[reg1.sub('',span.get_text()) for span in mName]
            movieName=names[0]
            if len(names)>1:
                secondName=names[1]
            if len(names)>2:
                threeName=names[2]
            people=soupSub.select('div.info>div.bd>p:nth-child(1)')[0].get_text()
            # remove spaces, non-breaking spaces and '/' separators, then split into the
            # director/cast line and the year/country/genre line
            reg=re.compile('[ \xa0/]+')
            actorDataTemp=reg.sub('',people).strip().split('\n')
            actor=actorDataTemp[0]
            movieType=actorDataTemp[1]
            score=soupSub.select('div.info>div.bd>div.star>span.rating_num')[0].get_text()
            # guard in case an entry has no one-line quote (the quote is parsed here
            # but never written to the sheet)
            quoteTag=soupSub.select('div.info>div.bd>p.quote>span.inq')
            quote=quoteTag[0].get_text() if quoteTag else ''
            movieData.append(movieName)
            movieData.append(score)
            movieData.append(secondName)
            movieData.append(threeName)
            movieData.append(actor)
            movieData.append(movieType)
            movieData.append(poster)
            movieData.append(player)
            for itemIndex in range(len(movieData)):
                ws.write(r=index1, c=itemIndex, label=movieData[itemIndex])
            index1+=1
    wb.save(nameFile)
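
  To make the text cleanup in the parsing loop above concrete, here is what the '[ \xa0/]+' pattern plus split('\n') does to a typical div.bd > p block; the sample string is illustrative, not copied from a real page:
    import re

    # Illustrative text roughly as it appears inside div.bd > p (newlines, spaces, \xa0 included).
    sample = '\n 导演: 弗兰克·德拉邦特 Frank Darabont\xa0\xa0\xa0主演: 蒂姆·罗宾斯 Tim Robbins\n 1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情\n '
    cleaned = re.sub('[ \xa0/]+', '', sample).strip().split('\n')
    print(cleaned[0])  # 导演:弗兰克·德拉邦特FrankDarabont主演:蒂姆·罗宾斯TimRobbins
    print(cleaned[1])  # 1994美国犯罪剧情
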
  5. (Two result screenshots.)
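
  Since step 3 mentions preferring XPath, here is a minimal lxml sketch that pulls the same core fields from one of the pages saved in step 1. The XPath expressions assume the same markup the bs4 selectors above target (ol.grid_view, div.item, div.hd, span.rating_num); check them against your saved files before relying on them.
    from lxml import etree

    # Parse one saved page (same path layout as the scripts above).
    with open(r'F:\ProgramWork\PyCharmProjects\pandasTest\Top250\0.html', encoding='utf-8') as f:
        tree = etree.HTML(f.read())

    for item in tree.xpath('//ol[@class="grid_view"]/li/div[@class="item"]'):
        movieName = item.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]  # first title span
        score = item.xpath('.//span[@class="rating_num"]/text()')[0]
        poster = item.xpath('.//div[@class="pic"]//img/@src')[0]
        detailUrl = item.xpath('.//div[@class="hd"]/a/@href')[0]
        print(movieName, score, detailUrl, poster)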
