功能描述
目标:获取上交所和深交所所有股票的名称和交易信息
股票数据是进行量化交易的基础型数据,此爬虫也能为量化交易提供获得基础数据的方法
输出:保存到文件中
技术路线:requests‐bs4‐re
候选数据网站的选择
新浪股票:http://finance.sina.com.cn/stock/
百度股票:https://gupiao.baidu.com/stock/
选取原则:股票信息静态存在于HTML页面中,非js代码生成
没有Robots协议限制
选取方法:浏览器 F12,源代码查看等
选取心态:不要纠结于某个网站,多找信息源尝试
数据网站的确定
新浪股票在页面上看到的股票代码在源代码中并没有,说明很可能是由JavaScript脚本生成的;而百度股票的每一支个股的信息都写在HTML代码中
所以对于这两个网站来讲,百度股票更适合作为定向爬虫的数据来源
获取股票列表:
东方财富网:http://quote.eastmoney.com/stocklist.html
获取个股信息:
百度股票:https://gupiao.baidu.com/stock/
单个股票:https://gupiao.baidu.com/stock/sz002439.html
程序的结构设计
步骤1:从东方财富网获取股票列表
步骤2:根据股票列表逐个到百度股票获取个股信息
步骤3:将结果存储到文件
百度股票源代码中个股信息的组织形式
所以键值对,用字典类型
实例编写
为了调试方便,使用traceback库
#CrawBaiduStocksB.py
import requests
from bs4 import BeautifulSoup
import traceback
import re
import time
starttime=time.perf_counter()
def gettime():
return time.perf_counter()-starttime
def getHTMLText(url, code="utf-8"):#优化编码时间
try:
r = requests.get(url,timeout=5)
r.raise_for_status()
r.encoding = code #优化时间
return r.text
except:
return ""
def getStockList(lst, stockURL):
html = getHTMLText(stockURL, "GB2312")
soup = BeautifulSoup(html, 'lxml') #优化解析器需安装lxml
a = soup.find_all('a')#找到全部a标签返回列表
for i in a:
try:
href = i.attrs['href']#标签元素i,属性href对应的键值
lst.append(re.findall(r"[s][hz]\d{6}", href)[0])#加0,返回列表中第0个字符串,并再添加到lst列表尾
except:
continue
def getStockInfo(lst, stockURL, fpath):
count = 0
for stock in lst:
url = stockURL + stock + ".html"
html = getHTMLText(url)
try:
if html=="":
continue
infoDict = {}
soup = BeautifulSoup(html, 'lxml')#优化解析器需安装lxml
stockInfo = soup.find('div',attrs={'class':'stock-bets'})#返回标签tag
name = stockInfo.find_all(attrs={'class':'bets-name'})[0]#搜索包含特殊属性的tag,加0表示取列表中第0个tag元素
infoDict.update({'股票名称': name.text.split()[0]})#text返回标签之间的文本(包括子标签内的),split分隔后取第0个
keyList = stockInfo.find_all('dt')#返回dt标签,列表
valueList = stockInfo.find_all('dd')#返回dd标签,列表
for i in range(len(keyList)):#将dt,dd键值对存到字典
key = keyList[i].text
val = valueList[i].text
infoDict[key] = val
with open(fpath, 'a', encoding='utf-8') as f:#a表示追加写
f.write( str(infoDict) + '\n' )#str把字典变成字符串,写入文件一行
count = count + 1
print("\r当前进度:{:.2f}%耗时:{:.0f}s".format(count*100/len(lst),gettime()),end="")
except:
count = count + 1
print("\r当前进度:{:.2f}%耗时:{:.0f}s".format(count*100/len(lst),gettime()),end="")
continue
def main():
stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
stock_info_url = 'http://gupiao.baidu.com/stock/'
output_file = 'D:/BaiduStockInfo.txt'
slist=[]
getStockList(slist, stock_list_url)
getStockInfo(slist, stock_info_url, output_file)
main()
采用requests‐bs4‐re路线实现了股票信息爬取和存储
实现了展示爬取进程的动态滚动条
# -*- coding: utf-8 -*-
import scrapy
import re
import random
class StocksSpider(scrapy.Spider):
name = "stocks"
start_urls = ['http://quote.eastmoney.com/stocklist.html']
user_agent_list = [\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",\
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",\
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",\
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",\
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",\
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",\
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",\
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",\
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",\
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",\
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",\
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",\
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",\
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
def parse(self, response):
ua = random.choice(self.user_agent_list)#随机抽取User-Agent
headers = {
'Accept-Encoding':'gzip, deflate, sdch, br',
'Accept-Language':'zh-CN,zh;q=0.8',
'Connection':'keep-alive',
'Referer':'https://gupiao.baidu.com/',
'User-Agent':ua
}#构造请求头
for href in response.css('a::attr(href)').extract():
try:
stock = re.findall(r"[s][hz]\d{6}", href)[0]
url = 'http://gupiao.baidu.com/stock/' + stock + '.html'
yield scrapy.Request(url, callback=self.parse_stock,headers=headers)
except:
continue
def parse_stock(self, response):
infoDict = {}
stockInfo = response.css('.stock-bets')
name = stockInfo.css('.bets-name').extract()[0]
keyList = stockInfo.css('dt').extract()
valueList = stockInfo.css('dd').extract()
for i in range(len(keyList)):
key = re.findall(r'>.*', keyList[i])[0][1:-5]#返回第0个匹配的字符串项,再返回该字符串的第1到倒数第5个不包含5的字符
try:
val = re.findall(r'\d+\.?.*', valueList[i])[0][0:-5]
except:
val = '--'
infoDict[key]=val
infoDict.update(
{'股票名称': re.findall('\s.*\(',name)[0].split()[0] + \
re.findall('\>.*\<', name)[0][1:-1]})
yield infoDict
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class BaidustocksPipeline(object):
def process_item(self, item, spider):
return item
class BaidustocksInfoPipeline(object):
def open_spider(self, spider):
self.f = open('BaiduStockInfo.txt', 'w')
def close_spider(self, spider):
self.f.close()
def process_item(self, item, spider):
try:
line = str(dict(item)) + '\n'
self.f.write(line)
except:
pass
return item
# -*- coding: utf-8 -*-
# Scrapy settings for BaiduStocks project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'BaiduStocks'
SPIDER_MODULES = ['BaiduStocks.spiders']
NEWSPIDER_MODULE = 'BaiduStocks.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'BaiduStocks (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'BaiduStocks.middlewares.BaidustocksSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'BaiduStocks.middlewares.BaidustocksDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
总结:增加了防ban机制的agent随机调用函数,headers构造,才可成功获取数据。