Scrapy---在线爬取网页数据

1、items.py

class DoubanItem(scrapy.Item):
    """Container for one scraped Douban movie entry.

    Each field is populated by the spider with the list returned by
    ``Selector.extract()``; pipelines index ``[0]`` for the scalar value.
    """
    # Movie rank on the chart
    rank = scrapy.Field()
    # Movie title
    title = scrapy.Field()
    # URL of the movie poster image
    picUrl = scrapy.Field()

2、doubanspider.py

# -*- coding: utf-8 -*-
import scrapy
# from 工程名称.文件名 import 类名
from douban.items import DoubanItem
import sys
# 设置中文格式
# Python 2 compatibility shim: force the default string encoding to UTF-8 so
# Chinese text does not raise UnicodeDecodeError. reload() and
# sys.setdefaultencoding() no longer exist in Python 3 (where UTF-8 is
# already the default), so guard on the interpreter major version to avoid
# a NameError when this module is imported under Python 3.
utf8 = "utf-8"
if sys.version_info[0] == 2 and sys.getdefaultencoding() != utf8:
    reload(sys)  # noqa: F821 - Python 2 builtin
    sys.setdefaultencoding(utf8)

class DoubanspiderSpider(scrapy.Spider):
    """Spider that extracts rank, title and poster URL for each movie entry."""
    name = 'doubanspider'
    # NOTE(review): allowed_domains should contain bare domain names
    # (e.g. 'movie.douban.com'), not full URLs with a scheme — Scrapy's
    # offsite filter matches on domain only. Confirm before running.
    allowed_domains = ['https://~~~~~~~要爬取的网页网址~~~~~~~~~']
    start_urls = ['https://~~~~~~~要爬取的网页网址~~~~~~~~~']

    def parse(self, response):
        # Locate every movie entry on the page; each div.item node holds
        # one movie's markup.
        items=response.xpath("//div[@class='item']")


        # Build one DoubanItem per matched node. The relative xpath calls
        # below return lists of strings via .extract(); downstream
        # pipelines index [0] to get the single value.
        for item in items:
            doubanItem=DoubanItem()
            doubanItem['rank']=item.xpath('div[@class="pic"]/em/text()').extract()
            doubanItem['title']=item.xpath('div[@class="info"]/div[@class="hd"]/a/span[@class="title"]/text()').extract()
            doubanItem['picUrl']=item.xpath('div[@class="pic"]/a/img/@src').extract()
            yield doubanItem
        pass

3、pipelines.py(数据处理)
(1)输出pipelines.py

class DoubanPipeline(object):
    """Pipeline that echoes each scraped movie's rank and title to stdout."""

    def process_item(self, item, spider):
        # Each field holds a list of extracted strings; element 0 is the value.
        rank = item['rank'][0]
        title = item['title'][0]
        print(rank)
        print(title)
        # Hand the unchanged item to the next pipeline in the chain.
        return item

(2)保存为txt格式pipelinestxt.py

import sys
# 配置中文环境
# Python 2 compatibility shim: force the default string encoding to UTF-8 so
# Chinese text can be written without UnicodeDecodeError. reload() and
# sys.setdefaultencoding() do not exist in Python 3 (UTF-8 is already the
# default there), so guard on the interpreter major version to avoid a
# NameError at import time under Python 3.
utf8='utf-8'
if sys.version_info[0] == 2 and sys.getdefaultencoding() != utf8:
    reload(sys)  # noqa: F821 - Python 2 builtin
    sys.setdefaultencoding(utf8)

import os


import time

class DoubanPipeline(object):
    """Pipeline that appends each movie's rank and title to a dated .txt file."""

    def __init__(self):
        # Directory that receives the text exports; created if missing.
        self.folder_name = "output"
        if not os.path.exists(self.folder_name):
            os.mkdir(self.folder_name)

    def process_item(self, item, spider):
        """Append one record to output/douban<YYYY_MM_DD>.txt and pass the item on."""
        print("———————————进行数据保存保存格式为txt————————")
        # One file per day, named after the current date.
        current_time = time.strftime('%Y_%m_%d', time.localtime())
        file_name = 'douban' + current_time + '.txt'
        try:
            # The `with` statement closes the file on all paths; the old
            # explicit fp.close() in a finally block raised NameError when
            # open() itself failed (fp was never bound).
            with open(self.folder_name + '/' + file_name, 'a', encoding='utf-8') as fp:
                # NOTE(review): labels look swapped — rank is written under
                # '标题' (title) and title under '内容' (content). Kept
                # byte-identical to preserve the output format; confirm intent.
                fp.write('标题:' + item['rank'][0] + '\n')
                fp.write('内容:' + item['title'][0] + '\n\n')
        except IOError as err:
            print('error:' + str(err))
        return item

(3)保存为json格式pipelinesjson.py

import os
import time
import codecs
import json

class DoubanPipeline(object):
    """Pipeline that appends each item as one JSON line to a dated .json file."""

    def __init__(self):
        # Directory that receives the JSON exports; created if missing.
        self.folder_name = "output"
        if not os.path.exists(self.folder_name):
            os.mkdir(self.folder_name)

    def process_item(self, item, spider):
        """Serialize the item as one JSON line and append it; returns the item unchanged."""
        print("———————————进行数据保存保存格式为json————————")
        # One file per day, named after the current date.
        current_data = time.strftime('%Y_%m_%d', time.localtime())
        file_name = 'moielist' + current_data + '.json'
        try:
            # The context manager closes the file on all paths; the old
            # explicit fp.close() in a finally block raised NameError when
            # codecs.open() itself failed (fp was never bound).
            with codecs.open(self.folder_name + '/' + file_name, 'a', encoding='utf-8') as fp:
                # ensure_ascii=False keeps Chinese characters readable in the file.
                fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        except IOError as err:
            print('error:' + str(err))
        return item

(4)保存为xls格式pipelinesexcel.py

import os
import time
# 创建工作簿、创建sheet
import xlwt
# 打开
import xlrd
from xlutils.copy import copy

class DoubanPipeline(object):
    """Pipeline that writes each movie's rank and title as a row in an .xls workbook."""
    # Constructor: prepare the output folder and a dated workbook with a header row.
    def __init__(self):
        # Folder for the exported spreadsheet
        folder_name="output"
        # Create the folder if it does not exist yet
        if not os.path.exists(folder_name):
            os.mkdir(folder_name)
        # Per-day file name
        current_time=time.strftime('%Y-%m-%d',time.localtime())
        file_name='douban'+current_time+'.xls'
        self.excel_path=folder_name+'/'+file_name

        # Create the Excel workbook
        self.workbook=xlwt.Workbook(encoding='utf-8')
        # Create the sheet inside the workbook (sheet name: "movie data")
        self.doubansheet=self.workbook.add_sheet(u'电影数据')

        # Header row: rank, title
        headers=[u'排名',u'标题']
        for i in range(len(headers)):
            self.doubansheet.write(0,i,headers[i])
        self.workbook.save(self.excel_path)
        # Next row index to write (row 0 holds the headers)
        self.index=1
    def process_item(self, item, spider):
        print("———————————进行数据保存保存格式为xls————————")
        # Re-open the saved workbook and copy it so rows can be appended
        # (xlwt alone cannot modify an already-saved file).
        oldwd=xlrd.open_workbook(self.excel_path,formatting_info=True)
        newWd=copy(oldwd)
        sheet=newWd.get_sheet(0)

        # NOTE(review): item['rank'] / item['title'] are the full lists from
        # .extract(); the other pipelines index [0] — confirm whether xlwt
        # accepts a list cell value here or whether [0] was intended.
        list1=[item['rank'],item['title']]
        for i in range(len(list1)):
            sheet.write(self.index,i,list1[i])
        # Persist the updated copy back to disk
        newWd.save(self.excel_path)
        self.index=self.index+1
        return item

(5)保存图片pipelinespic.py

import os
import urllib.request
class DoubanPipeline(object):
    """Pipeline that downloads each scraped poster image into ./images."""

    def __init__(self):
        # Target directory for downloaded pictures; create it on first use.
        self.folder_name = "images"
        if os.path.exists(self.folder_name):
            return
        os.mkdir(self.folder_name)

    def process_item(self, item, spider):
        """Fetch the first picture URL and save it under its original file name."""
        print("———————————保存图片————————")
        url = item['picUrl'][0]
        # The URL's last path segment doubles as the local file name.
        target = self.folder_name + "/%s" % url.split('/')[-1]
        try:
            urllib.request.urlretrieve(url, target)
        except Exception as err:
            print('error:' + str(err))
        return item

(6)保存为数据库格式pipelinesmysql.py

import pymysql


class DoubanPipeline(object):
    """Pipeline that inserts each movie's rank and title into a local MySQL table."""

    def process_item(self, item, spider):
        """Insert one (rank, title) row into douban.doubaninfo and pass the item on."""
        print("———————————将数据保存到MySQL中————————")
        # Bind con before the try so the finally clause cannot raise a
        # NameError when pymysql.connect() itself fails (the original code
        # referenced an unbound `con` in that case). pymysql.connect raises
        # on failure rather than returning a falsy value, so the old
        # "链接失败"/"游标获取失败" branches were unreachable.
        con = None
        try:
            con = pymysql.connect(host='localhost', user='root', passwd='1',
                                  db='douban', port=3306, charset='utf8')
            print('————————链接成功———————')
            # Cursor for executing the insert
            cur = con.cursor()
            try:
                print('————————游标获取成功———————')
                rank = item['rank'][0]
                title = item['title'][0]
                # Parameterized insert: the driver escapes the values, so
                # scraped text cannot inject SQL.
                sql = 'insert into doubaninfo values(null,%s,%s)'
                cur.execute(sql, (rank, title))
                con.commit()
            finally:
                cur.close()
        finally:
            # Close only if the connection was actually established.
            if con is not None:
                con.close()
        return item

4、配置settings文件
请求头

# Downloader middleware chain. The number is the ordering (lower runs
# closer to the engine); None disables a middleware entirely.
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.DoubanDownloaderMiddleware': 543,
    # Disable the built-in user-agent middleware so the rotating one below
    # takes over. The 'scrapy.contrib.downloadermiddleware.*' path is the
    # pre-1.0 location and was removed in modern Scrapy; the current module
    # path is used here.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'douban.rotate_useragent.RotateUserAgentMiddleware': 400,
}

执行顺序

# Enabled item pipelines. The integer (conventionally 0-1000) is the
# processing order: lower values run first, so every item flows through
# the console pipeline, then the txt/json/excel/mysql exporters, and the
# picture downloader last. Each module defines its own DoubanPipeline.
ITEM_PIPELINES = {
   'douban.pipelines.DoubanPipeline': 300,
   'douban.pipelinestxt.DoubanPipeline': 301,
   'douban.pipelinesjson.DoubanPipeline': 302,
   'douban.pipelinesexcel.DoubanPipeline': 303,
   'douban.pipelinesmysql.DoubanPipeline': 304,
   'douban.pipelinespic.DoubanPipeline': 305,
}

你可能感兴趣的:(Scrapy---在线爬取网页数据)