Method 1: install directly with pip from the command line:
$ pip install scrapy
Method 2: download the package first, then install it:
$ pip download scrapy -d ./
# download via a domestic (Chinese) mirror
$ pip download -i https://pypi.tuna.tsinghua.edu.cn/simple scrapy -d ./
Then change into the download directory and install the wheel:
$ pip install Scrapy-1.5.0-py2.py3-none-any.whl
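Afterwards you can verify the installation (the version string will vary with what pip resolved):
$ scrapy version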
Day-to-day use breaks down into roughly four steps:
1. Create a Scrapy project
scrapy startproject mySpider
2. Generate a spider
scrapy genspider demo "demo.cn"
3. Extract the data
Flesh out the spider, using XPath and similar selectors
4. Save the data
Persist the data in an item pipeline
Run the spider from the command line:
scrapy crawl qb  # qb is the spider's name
Run the spider from PyCharm:
from scrapy import cmdline
cmdline.execute('scrapy crawl qb'.split())
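Alternatively, Scrapy can run a spider in-process, which makes breakpoint debugging in PyCharm easier. A minimal sketch (assuming a run.py saved next to scrapy.cfg, reusing the qb spider named above):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load settings.py so pipelines and other project config still apply
process = CrawlerProcess(get_project_settings())
process.crawl('qb')  # the spider's `name` attribute
process.start()      # blocks until the crawl finishes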
# Site: http://www.cwl.gov.cn/ygkj/wqkjgg/ssq/
# List-page analysis:
http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd=&pageNo=1&pageSize=30&week=&systemType=PC
http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd=&pageNo=2&pageSize=30&week=&systemType=PC
Request method: GET
Parameter to vary: pageNo
Response format: JSON
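Before writing the spider, it is worth confirming the endpoint by hand. A quick sketch using the third-party requests library (an assumption; any HTTP client works); the keys printed below (code, date, red, blue) are the ones the spider relies on later:

import requests

url = ('http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice'
       '?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd='
       '&pageNo=1&pageSize=30&week=&systemType=PC')
# some endpoints reject clients without a browser-like User-Agent
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
data = resp.json()
for row in data['result'][:3]:
    print(row['code'], row['date'], row['red'], row['blue'])

With the endpoint confirmed, create the project and the spider: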
scrapy startproject ssqSpider
cd ssqSpider
scrapy genspider example example.com   # general form: scrapy genspider <name> <domain>
scrapy genspider ssq "cwl.gov.cn"      # the spider used in this project
The project directory structure is as follows:
scrapy.cfg: the project's configuration file
ssqSpider/: the project's Python module; code is imported from here
ssqSpider/items.py: the project's item definitions
ssqSpider/pipelines.py: the project's item pipelines
ssqSpider/settings.py: the project's settings
ssqSpider/spiders/: the directory that holds the spider code
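For reference, startproject plus genspider leave a layout roughly like this (middlewares.py and the __init__.py files are generated as well):

ssqSpider/
├── scrapy.cfg
└── ssqSpider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── ssq.py

Define the item fields in ssqSpider/items.py: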
import scrapy


class SsqspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    qihao = scrapy.Field()               # issue number
    riqi = scrapy.Field()                # draw date
    # the spider stores red and blue balls separately, so declare both
    kaijianghaoma_red = scrapy.Field()   # winning red-ball numbers
    kaijianghaoma_blue = scrapy.Field()  # winning blue-ball number
    ydj_zhushu = scrapy.Field()          # number of first-prize winners
    edj_zhushu = scrapy.Field()          # number of second-prize winners
    xiaoshouer = scrapy.Field()          # sales amount
    jiangchijiner = scrapy.Field()       # prize-pool amount
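A scrapy.Item behaves like a dict restricted to its declared fields: assigning an undeclared key raises KeyError, which is exactly why the red/blue fields above must match what the spider assigns. A quick illustration (run from the project root):

from ssqSpider.items import SsqspiderItem

item = SsqspiderItem()
item['qihao'] = '2024001'                        # declared field: fine
item['kaijianghaoma_red'] = '01,05,11,22,28,33'  # declared field: fine
print(dict(item))
# item['foo'] = 1  # would raise KeyError: SsqspiderItem does not support field: foo

Next comes the spider itself (ssqSpider/spiders/ssq.py):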
import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse

from ssqSpider.items import SsqspiderItem


class SsqSpider(scrapy.Spider):
    name = "ssq"
    allowed_domains = ["www.cwl.gov.cn"]
    start_urls = ["https://cwl.gov.cn"]  # unused; start_requests() below takes over

    def start_requests(self):
        # pages 1-2 for this demo; the full history runs to roughly 54 pages
        for page in range(1, 3):
            url = ('http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice'
                   '?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd='
                   f'&pageNo={page}&pageSize=30&week=&systemType=PC')
            # dont_filter=True stops the dupe filter from dropping the request
            # when the site answers with a 302 redirect
            yield Request(url=url, dont_filter=True)

    def parse(self, response: HtmlResponse):
        data = response.json()
        for i in data['result']:
            item = SsqspiderItem()
            item['qihao'] = i['code']
            item['riqi'] = i['date']
            item['kaijianghaoma_red'] = i['red']
            item['kaijianghaoma_blue'] = i['blue']
            item['jiangchijiner'] = i['poolmoney']
            item['xiaoshouer'] = i['sales']
            yield item
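If the crawl comes back empty, check settings.py: sites like this often reject requests without a browser-like User-Agent, and Scrapy's default ROBOTSTXT_OBEY = True can filter the API URL. A hedged sketch of the relevant settings (whether cwl.gov.cn actually requires them is an assumption worth testing):

# settings.py
ROBOTSTXT_OBEY = False  # assumption: robots.txt may disallow the API path
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36')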
Run the spider:
scrapy crawl ssq
or export the scraped items straight to CSV:
scrapy crawl ssq -o ssq.csv
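Note that -o appends to an existing file; Scrapy 2.x also accepts -O to overwrite. The same export can be made permanent through the FEEDS setting (a sketch; the filename is arbitrary):

# settings.py
FEEDS = {
    'ssq.csv': {
        'format': 'csv',
        'overwrite': True,  # requires Scrapy >= 2.4
    },
}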
# install the openpyxl package
pip install openpyxl
pip list  # confirm it shows up
Modify the pipeline (pipelines.py):
import openpyxl

from ssqSpider.spiders.ssq import SsqSpider


class SsqspiderPipeline:
    def __init__(self):
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.title = '双色球开奖信息'
        self.ws.append(['期号', '日期', '红球', '蓝球', '销售额', '奖池'])

    def close_spider(self, spider):
        self.wb.save('双色球开奖信息.xlsx')

    def process_item(self, item, spider):
        # only handle items coming from the ssq spider
        if not isinstance(spider, SsqSpider):
            return item
        qihao = item.get('qihao', '')
        riqi = item.get('riqi', '')
        kaijianghaoma_red = item.get('kaijianghaoma_red', '')
        kaijianghaoma_blue = item.get('kaijianghaoma_blue', '')
        jiangchijiner = item.get('jiangchijiner', '')
        xiaoshouer = item.get('xiaoshouer', '')
        self.ws.append([qihao, riqi, kaijianghaoma_red, kaijianghaoma_blue, xiaoshouer, jiangchijiner])
        return item
Enable ssqSpider.pipelines.SsqspiderPipeline in settings.py; the number is a priority between 0 and 1000, and pipelines with lower values run first:
ITEM_PIPELINES = {
    "ssqSpider.pipelines.SsqspiderPipeline": 300,
}
Run it:
scrapy crawl ssq
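To spot-check the workbook without opening Excel, openpyxl can read it back (a quick sketch):

import openpyxl

wb = openpyxl.load_workbook('双色球开奖信息.xlsx')
ws = wb.active
for row in ws.iter_rows(min_row=1, max_row=5, values_only=True):
    print(row)  # header row first, then the first few draws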
Create a database named spider_db (or ssq_db) and a table ssq_info:
USE `spider_db`;
DROP TABLE IF EXISTS `ssq_info`;
CREATE TABLE `ssq_info` (
  `id` INT UNSIGNED AUTO_INCREMENT COMMENT '编号',
  `qihao` VARCHAR(50) NOT NULL COMMENT '期号',
  `riqi` VARCHAR(50) COMMENT '日期',
  `kaijianghaoma_red` VARCHAR(50) COMMENT '红球',
  `kaijianghaoma_blue` VARCHAR(50) COMMENT '蓝球',
  `jiangchi` DECIMAL(12,2) COMMENT '奖池',
  `xiaoshouer` DECIMAL(12,2) COMMENT '销售额',
  PRIMARY KEY (`id`)
) ENGINE=INNODB COMMENT='双色球历史信息';
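Before wiring up the pipeline, it can save time to confirm the connection and table from Python (a sketch; host, user, and password must match your MySQL setup):

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='123456', database='spider_db',
                       charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute('SHOW COLUMNS FROM ssq_info')  # verify the DDL above took effect
    for column in cursor.fetchall():
        print(column)
conn.close()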
To write into that table from Scrapy:
# install pymysql
pip install pymysql
# then add the new pipelines; a batch and a non-batch insert version are shown below, pick one
import openpyxl
import pymysql

from ssqSpider.spiders.ssq import SsqSpider


class SsqspiderPipeline:
    """Writes items to an Excel workbook (unchanged from above)."""

    def __init__(self):
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.title = '双色球开奖信息'
        self.ws.append(['期号', '日期', '红球', '蓝球', '销售额', '奖池'])

    def close_spider(self, spider):
        self.wb.save('双色球开奖信息.xlsx')

    def process_item(self, item, spider):
        if not isinstance(spider, SsqSpider):
            return item
        qihao = item.get('qihao', '')
        riqi = item.get('riqi', '')
        kaijianghaoma_red = item.get('kaijianghaoma_red', '')
        kaijianghaoma_blue = item.get('kaijianghaoma_blue', '')
        jiangchijiner = item.get('jiangchijiner', '')
        xiaoshouer = item.get('xiaoshouer', '')
        self.ws.append([qihao, riqi, kaijianghaoma_red, kaijianghaoma_blue, xiaoshouer, jiangchijiner])
        return item


class SsqMysqlDbPipeline:
    """Inserts each item into MySQL as soon as it arrives."""

    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='root', password='123456',
                                    database='spider_db', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        qihao = item.get('qihao', '')
        riqi = item.get('riqi', '')
        kaijianghaoma_red = item.get('kaijianghaoma_red', '')
        kaijianghaoma_blue = item.get('kaijianghaoma_blue', '')
        jiangchi = item.get('jiangchijiner', 0)
        xiaoshouer = item.get('xiaoshouer', 0)
        self.cursor.execute(
            'insert into ssq_info(qihao,riqi,kaijianghaoma_red,kaijianghaoma_blue,jiangchi,xiaoshouer) '
            'values(%s,%s,%s,%s,%s,%s)',
            (qihao, riqi, kaijianghaoma_red, kaijianghaoma_blue, jiangchi, xiaoshouer))
        return item


class SsqMysqlDbBatchPipeline:
    """Buffers items and writes them to MySQL in batches of 30."""

    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='root', password='123456',
                                    database='spider_db', charset='utf8mb4')
        self.cursor = self.conn.cursor()
        self.data = []

    def close_spider(self, spider):
        # flush whatever is still buffered before disconnecting
        if len(self.data) > 0:
            self._write_to_db()
        self.conn.close()

    def process_item(self, item, spider):
        qihao = item.get('qihao', '')
        riqi = item.get('riqi', '')
        kaijianghaoma_red = item.get('kaijianghaoma_red', '')
        kaijianghaoma_blue = item.get('kaijianghaoma_blue', '')
        jiangchi = item.get('jiangchijiner', 0)
        xiaoshouer = item.get('xiaoshouer', 0)
        self.data.append((qihao, riqi, kaijianghaoma_red, kaijianghaoma_blue, jiangchi, xiaoshouer))
        if len(self.data) == 30:
            self._write_to_db()
            self.data.clear()
        return item

    def _write_to_db(self):
        self.cursor.executemany(
            'insert into ssq_info(qihao,riqi,kaijianghaoma_red,kaijianghaoma_blue,jiangchi,xiaoshouer) '
            'values(%s,%s,%s,%s,%s,%s)',
            self.data)
        self.conn.commit()
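Batching trades durability for speed: one executemany per 30 items (matching pageSize=30, so each response page flushes once) means far fewer round trips than row-by-row inserts. The cost is that a failed batch can lose up to 30 rows; if that matters, _write_to_db could roll back and re-raise (a sketch, not part of the original code):

    def _write_to_db(self):
        try:
            self.cursor.executemany(
                'insert into ssq_info(qihao,riqi,kaijianghaoma_red,kaijianghaoma_blue,jiangchi,xiaoshouer) '
                'values(%s,%s,%s,%s,%s,%s)',
                self.data)
            self.conn.commit()
        except pymysql.MySQLError:
            self.conn.rollback()  # discard the partial batch
            raise                 # let Scrapy log the failure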
Update the pipeline configuration (enable exactly one of the three):
ITEM_PIPELINES = {
    # "ssqSpider.pipelines.SsqspiderPipeline": 300,
    # "ssqSpider.pipelines.SsqMysqlDbPipeline": 300,
    "ssqSpider.pipelines.SsqMysqlDbBatchPipeline": 300,
}