Python-Scrapy 获取历史双色球开奖号码

Python-Scrapy 获取历史双色球开奖号码

文章目录

    • 1-创建项目
    • 2-settings文件设置
    • 3-Item设置
    • 4. 创建Spider
    • 5-爬取规则的编写
    • 6-pipeline.py文件的编写
    • 7-爬取
    • 8-数据统计

1-创建项目

在终端中输入创建Scrapy项目的命令:

  scrapy startproject GetBicolorNumber

2-settings文件设置

   ROBOTSTXT_OBEY = False
   DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
} # 请求头
    
    ITEM_PIPELINES = {
   'GetBicolorNumber.pipelines.GetbicolornumberPipeline': 300,
} # 保存文件所需
    
    LOG_LEVEL="WARNING" # 不想显示日志加上这个

3-Item设置

Item设置,设置需要爬取的数据内容,items.py

    issue = scrapy.Field()  # 期号
    time = scrapy.Field()   # 开奖具体时间
    numbers = scrapy.Field()    # 中奖号码

4. 创建Spider

  • 创建一个Spider,终端上进入GetBicolorNumber/GetBicolorNumber/Spider
  • 输入scrapy genspider bicolor_number kaijiang.zhcw.com(genspider 的第二个参数是域名,不带 http:// 前缀)

5-爬取规则的编写

# -*- coding: utf-8 -*-
import scrapy
from ..items import GetbicolornumberItem
import time

class BicolorNumberSpider(scrapy.Spider):
    """Spider that crawls historical double-color-ball (双色球) draw results
    from kaijiang.zhcw.com, one list page at a time."""
    name = 'bicolor_number'
    # allowed_domains = ['kaijiang.zhcw.com']
    handle_httpstatus_list = [404, 500]  # let these error responses reach parse()
    start_urls = ['http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html']
    url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_{}.html"
    page = 1
    MAX_PAGE = 145  # last list page available on the site at writing time

    def parse(self, response):
        """Extract issue number, draw date and winning numbers from one list
        page, yield one item per draw, then schedule the next page."""
        print("Crawl 第:{}页".format(self.page))
        datas_xpath = response.xpath('/html/body/table//tr')  # result table rows
        # Skip the two header rows and the trailing pager row.
        for data in datas_xpath[2:-1]:
            # Bug fix: create a fresh item per row. The original reused one
            # mutable item across yields, so queued items could all end up
            # holding the data of the last row once pipelines process them.
            item = GetbicolornumberItem()
            item['issue'] = data.xpath("./td[1]/text()").extract_first()
            item['time'] = data.xpath("./td[2]/text()").extract_first()
            item['numbers'] = data.xpath("./td[3]//em/text()").extract()
            yield item
        self.page += 1
        if self.page <= self.MAX_PAGE:
            # errback reports requests that fail at the network level
            # (404/500 responses go to parse via handle_httpstatus_list).
            yield scrapy.Request(self.url.format(self.page),
                                 callback=self.parse,
                                 errback=self.after_404)

    def after_404(self, failure):
        """Log the URL of a failed request.

        Bug fix: an errback receives a twisted Failure, not a Response, so
        the original `response.url` would raise AttributeError.
        """
        print(failure.request.url)

6-pipeline.py文件的编写

文件保存为projects.json。

import codecs
import json

class GetbicolornumberPipeline(object):
    """Item pipeline that appends each scraped item to projects.json,
    one JSON object per line (JSON Lines format)."""

    def __init__(self):
        # 'w+' truncates any output left over from a previous run.
        self.file = codecs.open('projects.json', 'w+', encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and return it unchanged so
        later pipelines (if any) still receive it."""
        data = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(data)
        return item

    def close_spider(self, spider):
        # Bug fix: Scrapy calls close_spider() automatically when the spider
        # finishes. The original method name `spider_closed` is only invoked
        # if manually connected to the spider_closed signal, so the file
        # handle was never closed (risking unflushed data).
        self.file.close()

7-爬取

​ 命令行进入项目的根目录scrapy crawl bicolor_number
​ 建议:创建一个start.py文件,执行此文件即可

from scrapy import cmdline
# 执行爬虫
cmdline.execute("scrapy crawl bicolor_number".split())

8-数据统计

获取历史蓝色球和红色球的出现次数

# -*- coding: utf-8 -*-
import json
import operator

def get_json(file_path):
    """Read *file_path* and return its raw lines, one JSON record each."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.readlines()

def get_numbersR_dict(number_list):
    """Count how often each red ball appears across all draws.

    number_list: list of draws; in each draw the last entry is the blue
    ball and everything before it is a red ball.
    Returns a dict mapping ball string -> occurrence count.
    """
    numbersR_dict = {}
    for numbers in number_list:
        for number in numbers[:-1]:  # all but the last entry are red balls
            # Bug fix: the original stored 0 on first sighting, so every
            # ball's count came out one lower than its real frequency.
            numbersR_dict[number] = numbersR_dict.get(number, 0) + 1
    return numbersR_dict

def get_numbersB_dict(number_list):
    """Count how often each blue ball (the last number of a draw) appears.

    number_list: list of draws; the last entry of each draw is the blue ball.
    Returns a dict mapping ball string -> occurrence count.
    """
    numbersB_dict = {}
    for numbers in number_list:
        # Bug fix: the original stored 0 on first sighting, undercounting
        # every blue ball by one.
        blue = numbers[-1]
        numbersB_dict[blue] = numbersB_dict.get(blue, 0) + 1
    return numbersB_dict

def sort_dictKey(numbers_dict, sort_key):
    """Return (key, count) pairs from *numbers_dict* in the order given by
    *sort_key*, silently skipping keys that are absent from the dict."""
    return [(key, numbers_dict[key]) for key in sort_key if key in numbers_dict]


if __name__ == '__main__':
    # Path to the JSON-lines file produced by the Scrapy pipeline.
    file_path = r"E:\pyCharm\网络爬虫\test_scrapy\GetBicolorNumber\GetBicolorNumber\projects.json"
    json_list = get_json(file_path)

    # One 'numbers' list per draw: red balls first, blue ball last.
    number_list = [json.loads(line)['numbers'] for line in json_list]

    # Bug fix: the original message read "数据数据" (duplicated word).
    print("总共有:{}期双色球数据".format(len(number_list)))
    numbersR_dict = get_numbersR_dict(number_list)
    numbersB_dict = get_numbersB_dict(number_list)

    # Sort both tallies by occurrence count, most frequent first.
    numbersR_v = sorted(numbersR_dict.items(), key=operator.itemgetter(1), reverse=True)
    numbersB_v = sorted(numbersB_dict.items(), key=operator.itemgetter(1), reverse=True)

    print("红色球出现统计数据:")
    for ball, count in numbersR_v:
        print(ball, ":", count)

    print("蓝色球出现统计数据:")
    for ball, count in numbersB_v:
        print(ball, ":", count)
        

你可能感兴趣的:(Python学习,python,scrapy)