Python Scrapy: Scraping Historical Shuangseqiu (Double-Color Ball) Lottery Numbers
In a terminal, run the command that creates a new Scrapy project:
scrapy startproject GetBicolorNumber
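For reference, scrapy startproject generates roughly the following layout (the spiders directory is still empty at this point):

GetBicolorNumber/
    scrapy.cfg
    GetBicolorNumber/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py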
Next, open GetBicolorNumber/GetBicolorNumber/settings.py and adjust these options:

ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}  # headers sent with every request
ITEM_PIPELINES = {
    'GetBicolorNumber.pipelines.GetbicolornumberPipeline': 300,
}  # enable the pipeline that writes the output file
LOG_LEVEL = "WARNING"  # add this if you do not want INFO-level log output
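Note that Scrapy also exposes a dedicated USER_AGENT setting; placing the string there instead of inside DEFAULT_REQUEST_HEADERS achieves the same effect:

# alternative to the 'User-Agent' entry above
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'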
Define the fields to scrape in items.py:

import scrapy

class GetbicolornumberItem(scrapy.Item):
    issue = scrapy.Field()    # draw (issue) number
    time = scrapy.Field()     # draw date
    numbers = scrapy.Field()  # winning numbers
Generate the spider inside GetBicolorNumber/GetBicolorNumber/spiders (genspider takes a bare domain, without the scheme):

scrapy genspider bicolor_number kaijiang.zhcw.com
# -*- coding: utf-8 -*-
import scrapy
from ..items import GetbicolornumberItem
import time  # only needed if the sleep throttle below is re-enabled

class BicolorNumberSpider(scrapy.Spider):
    name = 'bicolor_number'
    # allowed_domains = ['kaijiang.zhcw.com']
    handle_httpstatus_list = [404, 500]  # let these error statuses reach the callback instead of being filtered
    start_urls = ['http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html']
    url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_{}.html"
    page = 1

    def parse(self, response):
        print("Crawling page {}".format(self.page))
        datas_xpath = response.xpath('/html/body/table//tr')  # rows of the results table
        item = GetbicolornumberItem()  # item instance to fill
        for data in datas_xpath[2:-1]:  # skip the header rows and the trailing pager row
            issue = data.xpath("./td[1]/text()").extract_first()
            time_data = data.xpath("./td[2]/text()").extract_first()
            numbers = data.xpath("./td[3]//em/text()").extract()
            item['issue'] = issue
            item['time'] = time_data
            item['numbers'] = numbers
            # print(item)
            yield item
        self.page = self.page + 1
        next_page = self.url.format(self.page)
        # time.sleep(2)
        if self.page <= 145:
            # attach an errback so failed requests are reported instead of silently dropped
            yield scrapy.Request(next_page, callback=self.parse, errback=self.after_404)

    def after_404(self, failure):
        # errbacks receive a twisted Failure, not a Response
        print(failure.request.url)
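The errback above only prints the URL. A fuller sketch, following the pattern from the Scrapy documentation, distinguishes HTTP errors from network-level failures (the method name is illustrative; also note that because 404 and 500 are in handle_httpstatus_list, those responses go to parse, so this errback mainly sees network failures):

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError

def errback_verbose(self, failure):  # hypothetical replacement for after_404
    if failure.check(HttpError):
        # the server answered, but with an error status
        response = failure.value.response
        self.logger.warning("HttpError %s on %s", response.status, response.url)
    elif failure.check(DNSLookupError):
        self.logger.warning("DNSLookupError on %s", failure.request.url)
    elif failure.check(TimeoutError):
        self.logger.warning("TimeoutError on %s", failure.request.url)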
The pipeline in pipelines.py writes each item to projects.json, one JSON object per line:
import codecs
import json

class GetbicolornumberPipeline(object):
    def __init__(self):
        self.file = codecs.open('projects.json', 'w+', encoding="utf-8")

    def process_item(self, item, spider):
        # serialize each item as one JSON line
        data = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(data)
        return item

    def close_spider(self, spider):  # Scrapy calls close_spider, not spider_closed
        self.file.close()
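As an aside, a custom pipeline is not strictly required here: Scrapy's built-in feed exports can write the same JSON-lines output straight from the command line, with the .jl extension meaning one JSON object per line:

scrapy crawl bicolor_number -o projects.jl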
To run the spider, open a terminal in the project root and execute: scrapy crawl bicolor_number
Tip: alternatively, create a start.py file so the crawl can be launched by running a single script:
from scrapy import cmdline
# run the crawler, as if typed on the command line
cmdline.execute("scrapy crawl bicolor_number".split())
Counting Historical Red-Ball and Blue-Ball Occurrences
# -*- coding: utf-8 -*-
import json
import operator

def get_json(file_path):
    # read the JSON-lines file produced by the pipeline
    with open(file_path, 'r', encoding='utf-8') as jf:
        json_list = jf.readlines()
    return json_list
def get_numbersR_dict(number_list):
    # count occurrences of each red ball (all numbers in a draw except the last)
    numbersR_dict = {}
    for numbers in number_list:
        for number in numbers[:-1]:
            if number in numbersR_dict.keys():
                numbersR_dict[number] += 1
            else:
                numbersR_dict[number] = 1  # first sighting counts as one, not zero
    return numbersR_dict

def get_numbersB_dict(number_list):
    # count occurrences of each blue ball (the last number of every draw)
    numbersB_dict = {}
    for numbers in number_list:
        if numbers[-1] in numbersB_dict.keys():
            numbersB_dict[numbers[-1]] += 1
        else:
            numbersB_dict[numbers[-1]] = 1  # first sighting counts as one, not zero
    return numbersB_dict
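For comparison, both hand-rolled counters can be collapsed into a few lines with collections.Counter from the standard library; this is an equivalent sketch, not the script's original approach:

from collections import Counter

def count_balls(number_list):
    # red balls are every number except the last; the blue ball is the last one
    red = Counter(n for numbers in number_list for n in numbers[:-1])
    blue = Counter(numbers[-1] for numbers in number_list)
    return red, blue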
def sort_dictKey(numbers_dict, sort_key):
    # return (key, count) pairs in the order given by sort_key, skipping absent keys
    result = []
    for k in sort_key:
        if k not in numbers_dict.keys():
            continue
        temp = (k, numbers_dict[k])
        result.append(temp)
    return result
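sort_dictKey is never called in the main block below; an assumed usage would be printing the red-ball counts in natural ball order once numbersR_dict has been built (the zero-padded '01'..'33' key list is an assumption about how the site formats its numbers):

# hypothetical usage: counts listed by ball number rather than by frequency
red_order = ["{:02d}".format(i) for i in range(1, 34)]  # assumes two-digit strings
for ball, count in sort_dictKey(numbersR_dict, red_order):
    print(ball, ":", count)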
if __name__ == '__main__':
    file_path = r"E:\pyCharm\网络爬虫\test_scrapy\GetBicolorNumber\GetBicolorNumber\projects.json"
    json_list = get_json(file_path)
    number_list = []
    for line in json_list:
        dict_bicolor = json.loads(line)
        number_list.append(dict_bicolor['numbers'])
    print("Loaded {} draws in total".format(len(number_list)))
    numbersR_dict = get_numbersR_dict(number_list)
    numbersB_dict = get_numbersB_dict(number_list)
    # sort both dicts by occurrence count, descending (uses operator.itemgetter)
    numbersR_v = sorted(numbersR_dict.items(), key=operator.itemgetter(1), reverse=True)
    numbersB_v = sorted(numbersB_dict.items(), key=operator.itemgetter(1), reverse=True)
    print("Red ball occurrence counts:")
    for kv in numbersR_v:
        print(kv[0], ":", kv[1])
    print("Blue ball occurrence counts:")
    for kv in numbersB_v:
        print(kv[0], ":", kv[1])