python爬猫眼电影正在热映的电影详情

python爬猫眼电影正在热映的电影

这次咱们爬的是猫眼电影正在热映的电影。
网址:https://maoyan.com/
python爬猫眼电影正在热映的电影详情_第1张图片python爬猫眼电影正在热映的电影详情_第2张图片
上图中红色方框内就是咱们要爬取的内容。要获取每部电影的详情页,首先要获取每部电影的id。例如:https://maoyan.com/films/248172 ,这个网址末尾的数字248172就是这部电影的id。[图片:在这里插入图片描述]

通过分析网页结构,可以利用BeautifulSoup库来提取这些id。好了,废话不多说,直接上代码吧!

import os
import time
import re
import requests
from fontTools.ttLib import TTFont
from fake_useragent import UserAgent
from bs4 import BeautifulSoup


# Directory in which downloaded .woff font files are cached.
os.makedirs('font', exist_ok=True)
# Matches the .woff address inside url('...'). Raw string so that \( and \.
# reach the regex engine without relying on invalid string escapes.
regex_woff = re.compile(r"(?<=url\(').*\.woff(?='\))")
# NOTE(review): the published pattern was '(?<=).*?(?=)', whose empty
# lookarounds match the empty string at every position -- the HTML tags were
# apparently stripped when the article was extracted. Restored on the
# assumption that the obfuscated numbers sit in <span class="stonefont">
# elements; confirm against the live page markup.
regex_text = re.compile('(?<=<span class="stonefont">).*?(?=</span>)')
# Captures the four characters between "&#x" and ";" of a character reference.
regex_font = re.compile('(?<=&#x).{4}(?=;)')

# Reference font; fontdict maps glyph names of this font to the digit each
# one stands for.
basefont = TTFont('best.woff')
fontdict = {'uniEC6F': '0', 'uniF281': '8', 'uniE26F': '9', 'uniF285': '2', 'uniF587': '6',
            'uniE24D': '3', 'uniEA21': '1', 'uniF418': '4', 'uniEE9A': '5', 'uniF428': '7'}
# Request headers passed to requests.get when fetching pages.
# NOTE(review): the cookie looks like a session captured from a browser; it
# will presumably expire and is better loaded from configuration than
# hard-coded.
kw = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'cookie': '__mta=142548621.1556031989221.1556071199547.1556071262768.11; uuid_n_v=v1; uuid=5E18E83065D911E9B9D3D192FF0E98664B9C761BB8BF4CF28BC82E199E033372; _lxsdk_cuid=16a4abbe58dc8-06c27a47a634dc-3d644509-144000-16a4abbe58ec8; _lxsdk=5E18E83065D911E9B9D3D192FF0E98664B9C761BB8BF4CF28BC82E199E033372; _csrf=d6a080de23a17493d88631f79fdf9a365096ec19a5dac7305040fc67c139a9ff; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; lt=9M5h_SC_J6vgc7iDePl7_r8t_4UAAAAAUggAAJ53ZxQ0W3h2wR6uTbsVMWFaI3lVmq2bzzSnQe11t7_F5HNRDhkibDH51r6jxJ1GBg; lt.sig=A5TyPIfNR-MANg63dsmEl2szBWQ; __mta=142548621.1556031989221.1556071262768.1556073153199.12; _lxsdk_s=16a4d0cf57f-d91-fe9-c74%7C%7C97'}


def get_fontnumber(newfont, text):
    """Replace ``&#xXXXX;`` character references in *text* with digits.

    Args:
        newfont: Font object for the current page; passed to ``get_num`` to
            look up the glyph named by each reference.
        text: Raw HTML fragment that may contain ``&#xXXXX;`` references.

    Returns:
        *text* with every reference that ``get_num`` resolved replaced by
        its digit. References that could not be resolved are left as-is.
    """
    # dict.fromkeys drops repeated codes while keeping first-seen order;
    # one str.replace call already rewrites every occurrence of a code.
    for code in dict.fromkeys(regex_font.findall(text)):
        digit = get_num(newfont, f'uni{code.upper()}')
        if digit is None:
            # No reference glyph matched. str.replace(old, None) would raise
            # TypeError, so keep the original reference in the text.
            continue
        text = text.replace(f"&#x{code};", digit)
    return text


def get_num(newfont, name):
    """Look up the digit whose reference glyph equals glyph *name*.

    Args:
        newfont: Font object whose ``'glyf'`` table contains *name*.
        name: Glyph name to resolve, e.g. ``'uniE24D'``.

    Returns:
        The digit string from ``fontdict`` for the first reference glyph in
        ``basefont`` that compares equal to the glyph, or ``None`` when no
        reference glyph compares equal.
    """
    target = newfont['glyf'][name]
    return next(
        (
            digit
            for glyph_name, digit in fontdict.items()
            if target == basefont['glyf'][glyph_name]
        ),
        None,
    )


def downloads(url, localfn):
    """Download *url* and write the response body to *localfn*.

    The request is completed and its status checked before the file is
    opened, so a failed download does not leave an empty file behind.

    Args:
        url: Address of the resource to download.
        localfn: Path of the file to create or overwrite.

    Raises:
        requests.RequestException: The request failed, timed out, or
            returned an HTTP error status.
        OSError: The file could not be written.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(localfn, 'wb') as sw:
        sw.write(response.content)


def getHtml(url):
    """Fetch *url* with the module-level headers and return the page text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or ``""`` when the request fails, times
        out (30 seconds), or returns an HTTP error status.
    """
    try:
        r = requests.get(url, headers=kw, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures are treated as "no page"; KeyboardInterrupt
        # and programming errors are no longer swallowed.
        return ""
    return r.text

# Collect the ids of the movies that are currently showing.
def get_Movieid(ulist,url):
    """Append the movie ids found on the listing page at *url* to *ulist*.

    Args:
        ulist: List that receives the extracted ids; it is modified in place.
        url: Address of the listing page.

    Returns:
        The same *ulist* object. The same href appears twice on the page, so
        the list may contain duplicates.
    """
    soup = BeautifulSoup(getHtml(url), 'lxml')
    anchors = soup.find_all('a', target='_blank')
    # The slice skips the first anchor and the last eleven.
    # NOTE(review): presumably those are non-movie links, and [7:] strips a
    # leading '/films/' from each href -- confirm against the page markup.
    ulist.extend(
        anchor.attrs['href'][7:]
        for anchor in anchors[1:-11]
        # Anchors without an href are skipped explicitly instead of relying
        # on a bare except.
        if 'href' in anchor.attrs
    )
    return ulist
def _scrape_movie(url_info):
    """Fetch one movie detail page and return its fields as a dict.

    Raises whatever exception the failing step produces when the page cannot
    be fetched or does not have the expected structure.
    """
    html = getHtml(url_info)
    if not html:
        # getHtml returns "" when the request fails.
        raise ValueError(f'no HTML returned for {url_info}')
    soup = BeautifulSoup(html, 'html.parser')
    brief = soup.find('div', class_='movie-brief-container').find('ul').find_all('li')
    country_and_length = brief[1].text.split('/')

    msg = {}
    msg['img'] = soup.find('div', class_='avatar-shadow').find('img').attrs['src']
    msg['title'] = soup.find('h3', class_='name').text
    msg['movie_type'] = brief[0].text
    msg['movie_Country'] = country_and_length[0].strip()
    msg['length'] = country_and_length[1].strip()
    msg['movie_time'] = brief[2].text

    # Download the font file referenced by the page unless it is cached.
    woff = regex_woff.search(html)
    if woff is None:
        raise ValueError(f'no .woff font reference found in {url_info}')
    wofflink = 'http:' + woff.group()
    # os.path.join puts the file inside the font directory on every OS; a
    # literal backslash is only a path separator on Windows.
    localname = os.path.join('font', os.path.basename(wofflink))
    if not os.path.exists(localname):
        downloads(wofflink, localname)
    font = TTFont(localname)

    # The numbers are written as &#x....; character references, so they are
    # taken from the raw HTML with a regex rather than from the parsed tree.
    ms = regex_text.findall(html)
    if len(ms) < 3:
        msg['score'] = '0'
        msg['score-num'] = '0'
        msg['box-office'] = '0'
    else:
        unit = soup.find('span', class_='unit')
        msg['score'] = get_fontnumber(font, ms[0])
        msg['score-num'] = get_fontnumber(font, ms[1])
        # A page without a unit span gets the bare number.
        msg['box-office'] = get_fontnumber(font, ms[2]) + (unit.text if unit is not None else '')
    return msg


def movie_Info(ulist,info_url):
    """Print the detail record of every movie id in *ulist*.

    Each detail page is fetched once. A page that fails is reported and
    skipped so that the remaining movies are still processed.

    Args:
        ulist: Movie ids; duplicates are ignored.
        info_url: URL prefix to which each id is appended.

    Returns:
        ``None`` when every movie was processed, otherwise the string
        ``"获取电影详情数据失败"``.
    """
    failed = False
    # dict.fromkeys removes duplicate ids while keeping their first-seen
    # order, so the output order is stable between runs.
    for movie_id in dict.fromkeys(ulist):
        url_info = info_url + movie_id
        try:
            msg = _scrape_movie(url_info)
        except Exception as exc:
            # Per-movie boundary: report the failure and carry on.
            print(f'{url_info}: {exc!r}')
            failed = True
            continue
        print(msg)
    if failed:
        return "获取电影详情数据失败"
    return None
        
def main():
    """Collect the movie ids from the listing page and print each movie's details."""
    lis_id = []
    url = 'https://maoyan.com/films'
    info_url = 'https://maoyan.com/films/'
    # get_Movieid fetches the listing page itself, so no separate request is
    # made here.
    get_Movieid(lis_id, url)
    result = movie_Info(lis_id, info_url)
    if result:
        # movie_Info returns a message string on failure; show it instead of
        # discarding it.
        print(result)

# Run the scraper only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    main()

运行结果:

python爬猫眼电影正在热映的电影详情_第3张图片

你可能感兴趣的:(python爬猫眼电影正在热映的电影详情)