python-爬虫-猫眼电影TOP100

#!/usr/bin/env python
#-*- coding:utf8 -*-
#__author__ = "LiDaguo"


import requests
import re
import xlwt

# Base URL of the Maoyan board page that this script scrapes.
url = 'https://maoyan.com/board/4?'

# HTTP request headers: a desktop Chrome user-agent string.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
}



def get_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (a message is printed describing the failure).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failure (DNS, connection, timeout, invalid URL, ...).
        print(e)
        return None

    if response.status_code == 200:  # only a normal response is usable
        return response.text

    print('获取网页失败')
    return None


# Matches one movie entry of the board page. Capture groups, in order:
# rank, title, actors, release date, integer part of the score, fractional
# part of the score.
# NOTE(review): the HTML tag fragments of the original pattern were stripped
# from the source text; the tag parts below are reconstructed around the
# surviving fragments and capture groups -- confirm against the live markup.
# The colon after "主演" is accepted in both ASCII and full-width form because
# the source text appears to have been width-normalised.
_ITEM_PATTERN = re.compile(
    r'board-index .*?>(\d+)</i>'
    r'.*?class="name"><.*?>(.*?)</a></p>'
    r'.*?<p.*?>.*?主演[:：](.*?) .*?</p>'
    r'.*?<p.*?>(.*?)</p>'
    r'.*?<p.*?><i.*?>(.*?)</i><i.*?>(\d+)</i></p>',
    re.S,  # let "." span newlines: one entry covers several lines of HTML
)


def get_info(page):
    """Extract the movie entries from the HTML text of one board page.

    Args:
        page: HTML source of a board page, or ``None``/empty when the
            download failed.

    Yields:
        One dict per movie with the string fields ``rank``, ``title``,
        ``actors``, ``date`` and ``score``. Nothing is yielded when *page*
        is empty or ``None``.
    """
    if not page:
        # get_page() returns None on failure; there is nothing to parse.
        return

    for rank, title, actors, date, score_int, score_frac in _ITEM_PATTERN.findall(page):
        yield {
            'rank': rank,
            'title': title,
            # The actors capture can carry a trailing line break.
            'actors': actors.replace('\n', ''),
            'date': date,
            # The score is split across two elements on the page.
            'score': score_int + score_frac,
        }


def main(page_count=1, output_path='E:\\猫眼电影.xls'):
    """Scrape the board pages and write the result to an .xls workbook.

    Args:
        page_count: Number of board pages to fetch. Page ``i`` is requested
            with ``offset = i * 10``. The default of 1 fetches only the
            first page; presumably 10 pages cover the whole TOP100 list --
            verify against the site.
        output_path: Destination of the generated workbook.
    """
    urls = ['https://maoyan.com/board/4?offset={}'.format(i * 10)
            for i in range(page_count)]

    # get_info() is a generator; drain it page by page into one list.
    movies = []
    for page_url in urls:
        movies.extend(get_info(get_page(page_url)))

    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)

    columns = ('rank', 'title', 'actors', 'date', 'score')

    # Header row.
    for col, name in enumerate(columns):
        sheet.write(0, col, name)

    # One row per movie, below the header.
    for row, movie in enumerate(movies, start=1):
        for col, name in enumerate(columns):
            sheet.write(row, col, movie[name])

    # Save first so "done" is only reported once the file exists.
    workbook.save(output_path)
    print('爬取完成', end='')


if __name__ == '__main__':
    main()

效果:
python-爬虫-猫眼电影TOP100_第1张图片

你可能感兴趣的:(爬虫,项目)