1. 简介
近期因为新型肺炎疫情的缘故, 呆在家里想着学习一点Python数据采集与分析相关的内容,于是决定从股票数据入手,做一些简单的数据采集与分析的实战工作。在网上看了一点资料,决定使用Tushare库来完成数据采集,并通过pyecharts进行简单的数据统计和分析。
2. 数据采集
Tushare是一个用于金融数据获取与量化交易方面的Python数据内容平台,其提供的数据内容包含股票、基金、期货、债券、外汇、行业大数据,同时包括了数字货币行情等区块链数据的全数据品类的金融大数据平台,为各类金融投资和研究人员提供适用的数据和工具。
大家对Tushare感兴趣的话,可以前往Tushare官网了解相关的文档。
吐槽一下:Tushare Pro版本虽增加了许多数据接口, 但是需要一定的积分, 这个就有点让人难受, 很多功能没有积分就用不了, 还不如以前的版本好用,至少很多数据不需要积分就可以直接获取。
准备工作:
创建stockdb数据库
创建tb_stock_info数据表(数据表列名与Tushare的股票列表输出参数一致)
下面是使用Tushare进行股票基础数据采集的源代码, 使用Tushare接口时需要使用Token,大家可以自行在Tushare网站上注册:
# -*- coding: utf-8 -*-
"""
Created on 2020年2月4日
@author: garyond
"""
# 导入模块
from datetime import datetime, timedelta, time # 时间处理函数
import pymysql # mysql操作库
import pandas as pd
import numpy as np
import tushare as ts
# 全局对象
# 本地
mysqlSetting_local = {
'host': "localhost",
'port': 3306,
'user': "root",
'passwd': "",
'db': "stockdb",
'charset': 'utf8'
}
# 执行sql语句
def execSql(mysqlSetting, sql):
# 打开数据库连接
db = pymysql.connect(host=mysqlSetting['host'], port=mysqlSetting['port'], user=mysqlSetting['user'], \
passwd=mysqlSetting['passwd'], db=mysqlSetting['db'], charset=mysqlSetting['charset'])
# 使用cursor()方法获取操作游标
cursor = db.cursor()
cursor.execute(sql)
ret = cursor.fetchall()
db.commit()
# 关闭数据库连接
db.close()
return ret
# DataFrame对象落地数据库
def saveStockData(mysqlSetting, df, tableName):
# 打开数据库连接
db = pymysql.connect(host=mysqlSetting['host'], port=mysqlSetting['port'], user=mysqlSetting['user'],\
passwd=mysqlSetting['passwd'], db=mysqlSetting['db'], charset=mysqlSetting['charset'])
# 使用cursor()方法获取操作游标
cursor = db.cursor()
# 添加操作的sql语句
sqlInsert = "REPLACE INTO " + tableName + " ("
# 获取Columns信息
for _index, column in enumerate(df.columns):
if _index < len(df.columns) - 1:
sqlInsert = sqlInsert + column + ','
else:
sqlInsert = sqlInsert + column + ') VALUES '
# 迭代DataFrame数据
for index, row in df.iterrows():
sqlInsertNew = sqlInsert + '('
for i, column in enumerate(df.columns):
if i < len(df.columns) - 1:
if pd.isnull(row[column]) or row[column] == 0:
sqlInsertNew = sqlInsertNew + 'NULL' + ','
else:
sqlInsertNew = sqlInsertNew + '\'' + str(row[column]) + '\'' + ','
# sqlInsertNew=sqlInsertNew+'\''+str(row[column])+'\''+','
else:
if pd.isnull(row[column]) or row[column] == 0:
sqlInsertNew = sqlInsertNew + 'NULL'
else:
sqlInsertNew = sqlInsertNew + '\'' + str(row[column]) + '\''
sqlInsertNew = sqlInsertNew + ');'
print(sqlInsertNew)
cursor.execute(sqlInsertNew)
db.commit()
# 关闭数据库连接
db.close()
if __name__ == '__main__':
ts.set_token('712ce0fbfe30aa3e436a961a33360520182e36ebc9a6fb0e13fc5b12')
pro = ts.pro_api()
# 获取股票数据
data = pro.stock_basic(exchange='', list_status='', fields='ts_code,symbol,name,area,industry,fullname,market,exchange,curr_type,list_status,list_date,is_hs')
# 插入数据库
saveStockData(mysqlSetting_local, data, "tb_stock_info")
3. 基础数据分析
注:以下代码使用Jupyter Notebook进行管理与记录
1. 沪深股市上市司地域分布情况
from pyecharts import options as opts
from pyecharts.charts import Map
import pandas as pd
# 定义Map地图
def map_base() -> Map:
conn = pymysql.connect(host='127.0.0.1', port=3306,\
user='root',password='', \
db='stockdb',charset='utf8', \
use_unicode=True)
sql = "select area,count(symbol) as value from tb_stock_info where exchange = 'SSE' and area is not null group by area order by count(symbol) desc"
df = pd.read_sql(sql, con=conn)
sql2 = "select area,count(symbol) as value from tb_stock_info where exchange = 'SZSE' and area is not null group by area order by count(symbol) desc"
df2 = pd.read_sql(sql2, con=conn)
c = (
Map()
.add("沪市上市公司数量", df.values, "china")
.add("深市上市公司数量", df2.values, "china")
.set_global_opts(title_opts=opts.TitleOpts(title="沪深股市地域分布图"))
)
conn.close()
return c
# 地图渲染到Jupyter NoteBook中
map_base().render_notebook()
代码执行如下图所示:
沪深股市地域分布图
2. 沪深股通标的物分布情况
from pyecharts import options as opts
from pyecharts.charts import Pie
import pandas as pd
# 生成饼图
def stat_stock_subject() -> Pie:
conn = pymysql.connect(host='127.0.0.1', port=3306,\
user='root',password='', \
db='stockdb',charset='utf8', \
use_unicode=True)
sql = "select " \
"case is_hs WHEN 'H' then '沪股通' when 'S' then '深股通' ELSE '无' END as subject," \
"count(symbol) as value " \
"from tb_stock_info " \
"where exchange in ('SZSE','SSE') " \
"group by (case is_hs WHEN 'H' then '沪股通' when 'S' then '深股通' ELSE '无' END) " \
"order by count(symbol) desc"
df = pd.read_sql(sql, con=conn)
c = (
Pie()
.add("标的物数量", df.values)
.set_global_opts(title_opts=opts.TitleOpts(title="沪深股通标的物分布情况"))
.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
conn.close()
return c
# 饼图渲染到Jupyter NoteBook中
stat_stock_subject().render_notebook()
代码执行如下图所示:
沪深股通标的物分布情况
3. 沪深股市上市年份分布情况
from pyecharts import options as opts
from pyecharts.charts import Pie
import pandas as pd
sql = "select '小于5年' as yearRange, sum(z.value) as stockNum from (" \
" select (TIMESTAMPDIFF(MONTH,str_to_date(list_date,'%Y%m%d'), CURRENT_DATE) / 12) as listYear, COUNT(symbol) as value" \
" from tb_stock_info" \
" where list_date is not null and list_status = 'L' and exchange in ('SZSE', 'SSE')" \
" group by (TIMESTAMPDIFF(MONTH,str_to_date(list_date,'%Y%m%d'), CURRENT_DATE) / 12) " \
" having listYear between 0 and 5" \
") z" \
" union " \
"select '5 - 10年' as yearRange, sum(a.value) as stockNum from (" \
" select (TIMESTAMPDIFF(MONTH,str_to_date(list_date,'%Y%m%d'), CURRENT_DATE) / 12) as listYear, COUNT(symbol) as value" \
" from tb_stock_info" \
" where list_date is not null and list_status = 'L' and exchange in ('SZSE', 'SSE')" \
" group by (TIMESTAMPDIFF(MONTH,str_to_date(list_date,'%Y%m%d'), CURRENT_DATE) / 12) " \
" having listYear between 5 and 10" \
") a" \
" union " \
"select '10 - 20年' as yearRange, sum(b.value) as stockNum from (" \
" select (TIMESTAMPDIFF(MONTH,str_to_date(list_date,'%Y%m%d'), CURRENT_DATE) / 12) as listYear, COUNT(symbol) as value" \
" from tb_stock_info" \
" where list_date is not null and list_status = 'L' and exchange in ('SZSE', 'SSE')" \
" group by (TIMESTAMPDIFF(MONTH,str_to_date(list_date,'%Y%m%d'), CURRENT_DATE) / 12) " \
" having listYear between 10 and 20" \
") b" \
" union " \
"select '大于20年' as yearRange, sum(c.value) as stockNum from (" \
" select (TIMESTAMPDIFF(MONTH,str_to_date(list_date,'%Y%m%d'), CURRENT_DATE) / 12) as listYear, COUNT(symbol) as value" \
" from tb_stock_info" \
" where list_date is not null and list_status = 'L' and exchange in ('SZSE', 'SSE')" \
" group by (TIMESTAMPDIFF(MONTH,str_to_date(list_date,'%Y%m%d'), CURRENT_DATE) / 12) " \
" having listYear > 20" \
") c"
# 生成饼图数据
def stat_stock_year(sql) -> Pie:
conn = pymysql.connect(host='127.0.0.1', port=3306,\
user='root',password='', \
db='stockdb',charset='utf8', \
use_unicode=True)
df = pd.read_sql(sql, con=conn)
c = (
Pie()
.add("股票数", df.values)
.set_global_opts(title_opts=opts.TitleOpts(title="沪深股市上市年份分布情况"))
.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
conn.close()
return c
# 进行图渲染
stat_stock_year(sql).render_notebook()
代码执行如下图所示:
沪深股市上市年份分布情况
以上只是一些个人基础的实践, 目的是用于自己学习Python相关的数据采集与分析, 固有考虑不周的地方, 后期将会进一步的完善。