Python爬取猫眼电影top100数据

最近研究爬虫,自己写了几个小项目,爬取猫眼电影top100的数据就是其中之一

先分析一下页面,发现一共有10页,随便翻了两页,发现它的url组成大致是:

https://maoyan.com/board/4?offset=页码*10 (页码从0开始,即offset依次为0,10,...,90)

然后写一个函数用来获取页面url

def get_pages_url_list():
    """Build the URLs of the ten ranking pages.

    Returns:
        list[str]: one URL per page; the ``offset`` query value is the
        zero-based page index multiplied by 10 (0, 10, ..., 90).
    """
    template = 'https://maoyan.com/board/4?offset=%s'
    return [template % str(page_index * 10) for page_index in range(10)]

这样就得到了网页url的列表
接下来解析一下网页,发现想要的信息都在源码里,举一个例子:[图片:Python爬取猫眼电影top100数据_第1张图片]
这样可以直接通过正则表达式将信息筛选出来
每一种信息的获取,都写一个函数

获取电影名

def movie_name():
    """Collect film titles from every ranking page (first attempt, no headers).

    Each URL in the module-level ``pages_url`` is downloaded with a plain
    ``requests.get`` -- this version deliberately sends no request headers.
    A page that cannot be downloaded is reported and skipped instead of
    being silently ignored.

    Returns:
        list[str]: captured titles in page order; empty when no page
        contains a match.
    """
    namelist = []
    # NOTE(review): the pattern in the published post was reduced to r''
    # (its HTML tags appear to have been stripped during extraction). This
    # one is a reconstruction that assumes each title sits in
    # <p class="name"><a ... title="..."> -- confirm against the live page.
    reg = re.compile(r'<p class="name"><a[^>]*?title="(.*?)"')
    for pages in pages_url:
        try:
            # A timeout keeps one stalled connection from hanging the run.
            html = requests.get(pages, timeout=10).text
        except requests.RequestException as exc:
            print('请求失败:\t%s\t%s' % (pages, exc))
            continue
        namelist.extend(reg.findall(html))
    return namelist

输出一下,发现什么都没有,获取到的是一个空列表
简单看了一下获取的源码,发现这猫眼有反爬虫机制
应该要在请求里加上headers
打开谷歌浏览器的network,刷新一下,想要的便都有了
把headers复制过来

# Request headers copied from the browser's network panel. They are passed to
# requests.get so that the request resembles the browser's own request.
# NOTE(review): the Cookie value is a capture of one browser session (uuid,
# _csrf, mojo-session-id, ...); presumably it expires and must be replaced
# with your own -- confirm before reuse.
headers= {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0'
                        '.3987.116 Safari/537.36',
          'Cookie': '__mta=252583478.1582162970276.1582251329773.1582251337605.36; uuid_n_v=v1; uuid=49C96250538211EAA'
                    '1F029AD4767FB60F69E29B324164685A575DBC49D9F7D7B; _csrf=22354046805a970e2742e52e9f2f0f6742e9c2f0b66'
                    '79d37bfa4d63b3f042b0d; _lxsdk_cuid=1706042fa27b9-01be66e73d352-313f68-100200-1706042fa28c8; _lxsdk'
                    '=49C96250538211EAA1F029AD4767FB60F69E29B324164685A575DBC49D9F7D7B; mojo-uuid=0dc740302bb9b347859b6'
                    '5d4fe0e70e4; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1582162967,1582167037,1582167046; mojo-sessio'
                    'n-id={"id":"63862b9f8ff58455ee319c623bc3ba4d","time":1582250701907}; __mta=252583478.158216297027'
                    '6.1582182174928.1582250732005.29; mojo-trace-id=16; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1582'
                    '251337; _lxsdk_s=170657db4a6-42f-ab4-e3a%7C%7C22'
}

这样就可以了

完整代码如下

import requests
import re

# Headers sent with every page request below (User-Agent plus a Cookie
# captured from a browser session).
# NOTE(review): the Cookie carries session-specific values (uuid, _csrf,
# mojo-session-id, ...); presumably it expires and needs to be refreshed
# from your own browser -- confirm before reuse.
headers= {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0'
                        '.3987.116 Safari/537.36',
          'Cookie': '__mta=252583478.1582162970276.1582251329773.1582251337605.36; uuid_n_v=v1; uuid=49C96250538211EAA'
                    '1F029AD4767FB60F69E29B324164685A575DBC49D9F7D7B; _csrf=22354046805a970e2742e52e9f2f0f6742e9c2f0b66'
                    '79d37bfa4d63b3f042b0d; _lxsdk_cuid=1706042fa27b9-01be66e73d352-313f68-100200-1706042fa28c8; _lxsdk'
                    '=49C96250538211EAA1F029AD4767FB60F69E29B324164685A575DBC49D9F7D7B; mojo-uuid=0dc740302bb9b347859b6'
                    '5d4fe0e70e4; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1582162967,1582167037,1582167046; mojo-sessio'
                    'n-id={"id":"63862b9f8ff58455ee319c623bc3ba4d","time":1582250701907}; __mta=252583478.158216297027'
                    '6.1582182174928.1582250732005.29; mojo-trace-id=16; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1582'
                    '251337; _lxsdk_s=170657db4a6-42f-ab4-e3a%7C%7C22'
}

def get_pages_url_list():
    """Return the ten ranking-page URLs, ordered by ascending offset.

    Returns:
        list[str]: URLs whose ``offset`` query value steps through
        0, 10, ..., 90 (page index times 10).
    """
    urls = []
    page_index = 0
    while page_index < 10:
        offset_text = str(page_index * 10)
        urls.append('https://maoyan.com/board/4?offset=%s' % offset_text)
        page_index += 1
    return urls

# Built once when the module loads; each movie_* function iterates over this
# list and downloads every page in it.
pages_url = get_pages_url_list()

def movie_name():
    """Collect film titles from every ranking page.

    Each URL in the module-level ``pages_url`` is downloaded with the
    module-level ``headers``. A page that cannot be downloaded is reported
    and skipped instead of being silently ignored.

    Returns:
        list[str]: captured titles in page order.
    """
    namelist = []
    # NOTE(review): the pattern in the published post was reduced to r''
    # (its HTML tags appear to have been stripped during extraction). This
    # one is a reconstruction that assumes each title sits in
    # <p class="name"><a ... title="..."> -- confirm against the live page.
    reg = re.compile(r'<p class="name"><a[^>]*?title="(.*?)"')
    for pages in pages_url:
        try:
            # A timeout keeps one stalled connection from hanging the run.
            html = requests.get(pages, headers=headers, timeout=10).text
        except requests.RequestException as exc:
            print('请求失败:\t%s\t%s' % (pages, exc))
            continue
        namelist.extend(reg.findall(html))
    return namelist

def movie_actor():
    """Collect the starring-cast text of every film on the ranking pages.

    Each URL in the module-level ``pages_url`` is downloaded with the
    module-level ``headers``. A page that cannot be downloaded is reported
    and skipped instead of being silently ignored.

    Returns:
        list[str]: whole matches (the label followed by the rest of that
        line), in page order.
    """
    actorlist = []
    # The pattern has no capture group, so findall yields the full match.
    # NOTE(review): the label colon is accepted in both ASCII and full-width
    # form; the page presumably uses the full-width one and the ASCII colon
    # in the post may be an extraction artefact -- confirm against the page.
    reg = re.compile(r'主演[::].*')
    for pages in pages_url:
        try:
            # A timeout keeps one stalled connection from hanging the run.
            html = requests.get(pages, headers=headers, timeout=10).text
        except requests.RequestException as exc:
            print('请求失败:\t%s\t%s' % (pages, exc))
            continue
        actorlist.extend(reg.findall(html))
    return actorlist

def _collect_matches(pattern):
    """Run ``pattern`` over every ranking page and return all matches in page order.

    Each URL in the module-level ``pages_url`` is downloaded with the
    module-level ``headers``. A page that cannot be downloaded is reported
    and skipped instead of being silently ignored.
    """
    reg = re.compile(pattern)
    matches = []
    for pages in pages_url:
        try:
            # A timeout keeps one stalled connection from hanging the run.
            html = requests.get(pages, headers=headers, timeout=10).text
        except requests.RequestException as exc:
            print('请求失败:\t%s\t%s' % (pages, exc))
            continue
        matches.extend(reg.findall(html))
    return matches


def movie_time():
    """Collect the release-time text of every film on the ranking pages.

    Returns:
        list[str]: the captured text of each match, in page order.
    """
    # NOTE(review): in the published post the tags around (.*?) were stripped,
    # leaving only blank lines. The <p class="releasetime"> wrapper is a
    # reconstruction -- confirm against the live page.
    return _collect_matches(r'<p class="releasetime">(.*?)</p>')


def movie_score():
    """Collect the score of every film on the ranking pages.

    The pattern captures two adjacent pieces per film; they are concatenated
    into one string, as the original code did with ``s[0] + s[1]``.

    Returns:
        list[str]: one concatenated score string per match, in page order.
    """
    # NOTE(review): only "(.*?)(.*?)" survived the tag stripping. The
    # integer/fraction <i> wrappers are a reconstruction -- confirm against
    # the live page.
    parts = _collect_matches(
        r'<p class="score"><i class="integer">(.*?)</i>'
        r'<i class="fraction">(.*?)</i></p>'
    )
    return [integer + fraction for integer, fraction in parts]


def movie_photo():
    """Collect the poster image reference of every film on the ranking pages.

    Returns:
        list[str]: the captured text of each match, in page order.
    """
    # NOTE(review): only ".*" survived the tag stripping, so the original
    # pattern is unknown. This reconstruction assumes the poster URL is the
    # data-src attribute of an <img> tag -- confirm against the live page.
    return _collect_matches(r'<img data-src="(.*?)"')


def main():
    """Combine the five per-film lists into formatted output lines.

    The lists are paired positionally with ``zip``, so a shorter list
    truncates the output instead of raising ``IndexError``.

    Returns:
        list[str]: one tab-separated line per film (name, cast, release
        time, score, photo).
    """
    rows = zip(
        movie_name(),
        movie_actor(),
        movie_time(),
        movie_score(),
        movie_photo(),
    )
    return ['%s\t\t%s\t\t%s\t\t评分:%s\t\t图片:%s' % row for row in rows]


if __name__ == '__main__':
    # The context manager guarantees the file is flushed and closed.
    with open('猫眼电影top100.txt', 'w', encoding='utf-8') as file:
        for num, item in enumerate(main(), start=1):
            file.write(str(num))
            file.write('\t\t')
            file.write(item)
            file.write('\n')
            print('写入成功:\t%s' % item)

你可能感兴趣的:(Python爬取猫眼电影top100数据)