01 A Scrapy Double Color Ball (ssq) Spider on Python 3.8

1 Basic development process

1.0 Scrapy framework flow diagram

[Figure 1: Scrapy framework data-flow diagram]

1.1 Installation

Option 1: install directly with pip from the command line:

$ pip install scrapy

Option 2: download the package first, then install it:

$ pip download scrapy -d ./
# or download via a domestic mirror
$ pip download -i https://pypi.tuna.tsinghua.edu.cn/simple scrapy -d ./

Then change into the download directory and install the wheel (the filename depends on the version you downloaded):

$ pip install Scrapy-1.5.0-py2.py3-none-any.whl

1.2 Usage

Using Scrapy boils down to four steps:

1 Create a Scrapy project

scrapy startproject mySpider

2 Generate a spider

scrapy genspider demo "demo.cn"

3 Extract the data

Flesh out the spider, using XPath and similar selectors.

4 Store the data

Save the extracted items in an item pipeline.

1.3 Running the spider

Run the spider from the command line:

scrapy crawl qb     # qb is the spider's name

Or run it from PyCharm via a small launcher script:

from scrapy import cmdline

cmdline.execute("scrapy crawl qb".split())  # equivalent to the command above

2 Developing the Double Color Ball spider

2.0 Site analysis

# Site: http://www.cwl.gov.cn/ygkj/wqkjgg/ssq/
# List-page analysis: the draw data actually comes from this JSON API
http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd=&pageNo=1&pageSize=30&week=&systemType=PC
http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd=&pageNo=2&pageSize=30&week=&systemType=PC
Request method: GET
Paging parameter: pageNo
Response body: JSON
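
A quick way to confirm this analysis is to hit the API outside Scrapy. The sketch below assumes the requests package is installed; the User-Agent header is illustrative, and the result keys (code, date, red, blue) are the same ones the spider in 2.5 consumes:

# Sanity-check the JSON list API outside Scrapy.
import requests

url = (
    "http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice"
    "?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd="
    "&pageNo=1&pageSize=30&week=&systemType=PC"
)
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
data = resp.json()
for draw in data["result"][:3]:   # first three draws on page 1
    print(draw["code"], draw["date"], draw["red"], draw["blue"])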


2.1 Create the project

scrapy startproject ssqSpider
cd ssqSpider
# ("scrapy genspider example example.com" is just the generic hint printed by startproject; the real spider is created next)

2.2 Create the spider

scrapy genspider ssq "cwl.gov.cn"

2.3 Adjust the project configuration

The generated project layout looks like this:

scrapy.cfg: the project's configuration file

ssqSpider/: the project's Python module; code is imported from here

ssqSpider/items.py: the project's item definitions

ssqSpider/pipelines.py: the project's item pipelines

ssqSpider/settings.py: the project's settings

ssqSpider/spiders/: the directory that holds the spider code

2.4 Define the item model

Note that the spider in 2.5 fills kaijianghaoma_red and kaijianghaoma_blue, so declare those two fields here (a scrapy.Item raises KeyError for undeclared keys):

import scrapy


class SsqspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    qihao = scrapy.Field()               # draw number (期号)
    riqi = scrapy.Field()                # draw date (日期)
    kaijianghaoma_red = scrapy.Field()   # winning red balls (红球)
    kaijianghaoma_blue = scrapy.Field()  # winning blue ball (蓝球)
    ydj_zhushu = scrapy.Field()          # first-prize winner count (一等奖注数)
    edj_zhushu = scrapy.Field()          # second-prize winner count (二等奖注数)
    xiaoshouer = scrapy.Field()          # sales amount (销售额)
    jiangchijiner = scrapy.Field()       # prize-pool amount (奖池金额)
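
Items behave like dicts but only accept declared fields, which is why the red/blue fields above must exist before the spider assigns them. A tiny illustration:

item = SsqspiderItem()
item["qihao"] = "2023001"   # fine: declared field
# item["foo"] = 1           # KeyError: only declared fields are allowed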

2.5 Write the spider code

import scrapy
from scrapy.http import HtmlResponse
from scrapy import Request
from ssqSpider.items import SsqspiderItem


class SsqSpider(scrapy.Spider):
    name = "ssq"
    allowed_domains = ["cwl.gov.cn"]     # covers www.cwl.gov.cn as well
    start_urls = ["https://cwl.gov.cn"]  # unused: start_requests below overrides it

    def start_requests(self):
        for page in range(1, 3):  # pages 1-2 for testing; the full history spans about 54 pages
            url = f'http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd=&pageNo={page}&pageSize=30&week=&systemType=PC'
            # dont_filter=True keeps 302-redirected requests from being dropped by the dupe filter
            yield Request(url=url, dont_filter=True)

    def parse(self, response: HtmlResponse):
        data = response.json()
        result = data['result']
        for i in result:
            item = SsqspiderItem()
            item['qihao'] = i['code']
            item['riqi'] = i['date']
            item['kaijianghaoma_red'] = i['red']
            item['kaijianghaoma_blue'] = i['blue']
            item['jiangchijiner'] = i['poolmoney']
            item['xiaoshouer'] = i['sales']
            yield item
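
The item also declares ydj_zhushu and edj_zhushu, which parse above never fills. If each draw in the response carries a per-tier prize list (the prizegrades key and its type/typenum fields below are assumptions about the API shape; verify against a real response), the counts could be filled like this:

# Hypothetical: fill first/second-prize counts from a per-draw prize list.
# Key names are assumptions -- confirm against an actual API response.
for grade in i.get('prizegrades', []):
    if grade.get('type') == 1:
        item['ydj_zhushu'] = grade.get('typenum')
    elif grade.get('type') == 2:
        item['edj_zhushu'] = grade.get('typenum')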

2.5.1 Test-run the spider

scrapy crawl ssq
or, exporting the items to a CSV feed:
scrapy crawl ssq -o ssq.csv
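
The -o flag appends items to the feed file and infers the format from the extension (.csv, .json, .jsonl, .xml); newer Scrapy versions also provide -O, which overwrites the file instead of appending.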

2.6 Storing the data

2.6.1 Writing to an Excel workbook

# install the openpyxl package
pip install openpyxl
pip list

Modify the pipeline (pipelines.py):

import openpyxl

from itemadapter import ItemAdapter


class SsqspiderPipeline:
    def __init__(self):
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.title = '双色球开奖信息'
        self.ws.append(['期号', '日期', '红球', '蓝球', '销售额', '奖池'])

    def close_spider(self, spider):
        self.wb.save('双色球开奖信息.xlsx')

    def process_item(self, item, spider):
        if spider.name != 'ssq':  # only handle items from the ssq spider
            return item
        qihao = item.get('qihao', '')
        riqi = item.get('riqi', '')
        kaijianghaoma_red = item.get('kaijianghaoma_red', '')
        kaijianghaoma_blue = item.get('kaijianghaoma_blue', '')
        jiangchijiner = item.get('jiangchijiner', '')
        xiaoshouer = item.get('xiaoshouer', '')
        self.ws.append([qihao, riqi, kaijianghaoma_red, kaijianghaoma_blue, xiaoshouer, jiangchijiner])
        return item

Enable ssqSpider.pipelines.SsqspiderPipeline in the ITEM_PIPELINES setting in settings.py:

ITEM_PIPELINES = {
   "ssqSpider.pipelines.SsqspiderPipeline": 300,
}
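
The integer (300 here) is the pipeline's order: enabled pipelines run for each item in ascending order of this value, conventionally chosen in the 0-1000 range.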

Run it:

scrapy crawl ssq

2.6.2 Writing to MySQL

Create a database named spider_db (or ssq_db), then create the ssq_info table:

use `spider_db`;
drop table if EXISTS `ssq_info`;

create table `ssq_info`(
  `id` int unsigned auto_increment COMMENT '编号',
  `qihao` VARCHAR(50) not null COMMENT '期号',
  `riqi` VARCHAR(50) COMMENT '日期',
  `kaijianghaoma_red` VARCHAR(50) COMMENT '红球',
  `kaijianghaoma_blue` VARCHAR(50) COMMENT '蓝球',
  `jiangchi` DECIMAL(12,2) COMMENT '奖池',
  `xiaoshouer` DECIMAL(12,2) COMMENT '销售额',
  PRIMARY key(`id`)
) ENGINE=INNODB COMMENT='双色球历史信息';

Install the MySQL driver:

# install pymysql
pip install pymysql

# Add the new pipelines to pipelines.py. There is a row-at-a-time variant and a batched variant; enable one of the two.
import openpyxl
import pymysql

from itemadapter import ItemAdapter


# SsqspiderPipeline (the Excel writer from 2.6.1) stays unchanged and is omitted here.


class SsqMysqlDbPipeline:
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='root', password='123456', database='spider_db',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.conn.commit()  # one commit for everything inserted during the crawl
        self.conn.close()

    def process_item(self, item, spider):
        qihao = item.get('qihao', '')
        riqi = item.get('riqi', '')
        kaijianghaoma_red = item.get('kaijianghaoma_red', '')
        kaijianghaoma_blue = item.get('kaijianghaoma_blue', '')
        jiangchi = item.get('jiangchijiner', 0)
        xiaoshouer = item.get('xiaoshouer', 0)

        self.cursor.execute(
            'insert into ssq_info(qihao,riqi,kaijianghaoma_red,kaijianghaoma_blue,jiangchi,xiaoshouer) '
            'values(%s,%s,%s,%s,%s,%s)',
            (qihao, riqi, kaijianghaoma_red, kaijianghaoma_blue, jiangchi, xiaoshouer))
        return item


class SsqMysqlDbBatchPipeline:
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='root', password='123456', database='spider_db',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()
        self.data = []  # buffer of pending rows

    def close_spider(self, spider):
        if len(self.data) > 0:  # flush whatever is left in the buffer
            self._write_to_db()
        self.conn.close()

    def process_item(self, item, spider):
        qihao = item.get('qihao', '')
        riqi = item.get('riqi', '')
        kaijianghaoma_red = item.get('kaijianghaoma_red', '')
        kaijianghaoma_blue = item.get('kaijianghaoma_blue', '')
        jiangchi = item.get('jiangchijiner', 0)
        xiaoshouer = item.get('xiaoshouer', 0)
        self.data.append((qihao, riqi, kaijianghaoma_red, kaijianghaoma_blue, jiangchi, xiaoshouer))

        if len(self.data) >= 30:  # flush a full batch
            self._write_to_db()
            self.data.clear()
        return item

    def _write_to_db(self):
        self.cursor.executemany(
            'insert into ssq_info(qihao,riqi,kaijianghaoma_red,kaijianghaoma_blue,jiangchi,xiaoshouer) '
            'values(%s,%s,%s,%s,%s,%s)',
            self.data)
        self.conn.commit()
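
A batch size of 30 matches the API's pageSize=30, so each page of results is flushed to MySQL in a single executemany round-trip and one commit, instead of one execute per row.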

Switch the pipeline configuration in settings.py:

ITEM_PIPELINES = {
   # "ssqSpider.pipelines.SsqspiderPipeline": 300,
   # "ssqSpider.pipelines.SsqMysqlDbPipeline":300,
   "ssqSpider.pipelines.SsqMysqlDbBatchPipeline":300
}
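
Run scrapy crawl ssq again and the rows should land in MySQL. A minimal check, assuming the same local credentials as above:

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='123456', database='spider_db', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute('select count(*) from ssq_info')
    print('rows in ssq_info:', cursor.fetchone()[0])
conn.close()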
