Python:爬取猫眼电影 TOP100 榜单

'''
Scrape the TOP100 movie board on Maoyan.
Tech stack: requests, bs4, re
'''
import requests
from bs4 import BeautifulSoup
import re


def getHTML(url):
    """Download ``url`` and return the response body as text.

    The request carries a minimal browser-like ``User-Agent`` header and a
    timeout so that a stalled connection cannot hang the script forever.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text. When the request fails (connection error,
        timeout, or a 4xx/5xx status) a failure message is printed and an
        empty string is returned, so callers can simply test the result for
        truthiness before parsing it.
    """
    hd = {'User-Agent': 'Mozilla/5.0'}  # pretend to be a browser
    try:
        r = requests.get(url, headers=hd, timeout=10)
        r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
    except requests.RequestException:
        # Only network/HTTP failures are handled here; programming errors
        # and KeyboardInterrupt are no longer swallowed.
        print('爬取失败')
        return ''
    # Let requests guess the encoding from the body instead of trusting the
    # (possibly missing) charset in the response headers.
    r.encoding = r.apparent_encoding
    return r.text


def parsePage(ulist, html):
    """Extract the movie rows of one board page and append them to ``ulist``.

    Every appended row has the form ``[rank, title, actors, release]``, all
    strings. Rows are built by walking the four extracted sequences in
    lockstep, so a page on which one of them is shorter than the others
    yields fewer rows instead of raising ``IndexError``.

    Args:
        ulist: List that receives the rows; it is mutated in place.
        html: Page source as text. A falsy value (``None`` or ``''``, e.g.
            after a failed download) is ignored.

    Returns:
        None.
    """
    if not html:
        return

    soup = BeautifulSoup(html, "html.parser")

    # Movie titles: <p class="name">.
    films = soup.find_all('p', attrs={'class': 'name'})

    # Rank: the number embedded in the "board-index-N" class name. ``\d+``
    # (not ``\d*``) so that a bare "board-index-" never produces an empty rank.
    rank = re.findall(r'board-index-(\d+)', html)

    # Leading actors: <p class="star">.
    actors = soup.find_all('p', attrs={'class': 'star'})

    # Release date and region: <p class="releasetime">.
    rate = soup.find_all('p', attrs={'class': "releasetime"})

    for number, film, star, release in zip(rank, films, actors, rate):
        ulist.append([
            number,
            # get_text() also works when the tag has nested markup, where
            # ``.string`` would be None.
            film.get_text(strip=True),
            # The actor text is padded with newlines and spaces; drop all
            # whitespace so it prints and saves cleanly.
            ''.join(star.get_text().split()),
            release.get_text(strip=True),
        ])


def PrintList(ulist):
    """Print the collected rows as an aligned table on standard output.

    A header line is printed first, followed by one line per row. Only the
    first four fields of each row (rank, title, actors, release) are shown.
    Columns two to four are padded with the full-width space U+3000 so that
    CJK text lines up.

    Args:
        ulist: Sequence of rows, each indexable at positions 0 through 3.
    """
    fill = chr(12288)  # full-width (ideographic) space used as pad character
    layout = '{0:<4}\t{1:{4}<15}\t{2:{4}<30}{3:{4}<50}'

    header = ('排名', "电影", '主演', '上映日期')
    print(layout.format(*header, fill))

    for row in ulist:
        rank, title, cast, release = row[0], row[1], row[2], row[3]
        print(layout.format(rank, title, cast, release, fill))


def Savetxt(ulist):
    """Write the rows to ``films.txt`` as comma-separated lines.

    The file is created (or overwritten) in the current working directory
    and encoded as GBK. Characters that GBK cannot represent are written as
    ``?`` instead of raising ``UnicodeEncodeError`` part-way through and
    leaving a truncated file behind.

    Args:
        ulist: Sequence of rows; the first four fields of each row are
            written, separated by commas, one row per line.
    """
    with open('films.txt', 'w', encoding="gbk", errors="replace") as f:
        f.writelines(
            '{},{},{},{}\n'.format(item[0], item[1], item[2], item[3])
            for item in ulist
        )


def SaveExcle(ulist):
    """Write the rows to ``fimls.xls`` as tab-separated text.

    The output is plain text with an ``.xls`` extension, not a real Excel
    workbook: every cell is followed by a tab and every row by a newline.
    The file is created (or overwritten) in the current working directory
    and encoded as GBK. Characters that GBK cannot represent are written as
    ``?`` instead of raising ``UnicodeEncodeError`` part-way through.

    Args:
        ulist: Sequence of rows; each row is an iterable of cell values,
            which are converted with ``str()``.
    """
    # NOTE(review): the file name is spelled 'fimls', which looks like a typo
    # for 'films'; it is kept unchanged because other tooling may rely on it.
    with open('fimls.xls', 'w', encoding='gbk', errors='replace') as f:
        for row in ulist:
            # A tab after every cell (including the last) moves to the next
            # column, matching the layout the file has always had.
            f.write(''.join(str(cell) + '\t' for cell in row))
            f.write('\n')


def main():
    """Scrape all ten pages of the board, then print and save the result.

    The board is paginated through the ``offset`` query parameter in steps
    of ten (0, 10, ..., 90). Pages that could not be downloaded are skipped.
    The collected rows are printed as a table and written to both output
    files.
    """
    # The offset is appended to this prefix. The previous code appended it to
    # a URL that already ended in "offset=0", producing "offset=00",
    # "offset=010", ... instead of the intended values.
    base_url = 'http://maoyan.com/board/4?offset='
    ulist = []

    for offset in range(0, 100, 10):
        html = getHTML(base_url + str(offset))
        if not html:
            # Download failed (getHTML already reported it); nothing to parse.
            continue
        parsePage(ulist, html)

    PrintList(ulist)
    SaveExcle(ulist)
    Savetxt(ulist)


# Run the scraper. The call is unconditional (there is no __name__ guard),
# so it also executes when this file is imported as a module.
main()

小结:BeautifulSoup 与正则表达式要学会结合起来灵活运用。

你可能感兴趣的:(网络爬虫)