查看数据偏离程度
import numpy as np
import pylab
import scipy.stats as stats
# Sample table; only column 1 is inspected below.
# A plain 2-D ndarray is used instead of np.mat (NumPy discourages the matrix
# class), which lets the column be sliced out directly instead of being
# collected row by row.
data = np.array([[1, 200, 105, 3, False], [2, 165, 80, 2, False],
                 [3, 184.5, 120, 2, False], [4, 116, 70.8, 1, False], [5, 270, 150, 4, True]])
col1 = data[:, 1].tolist()
for value in col1:
    print(value)
# Probability (Q-Q) plot of the column against a normal distribution:
# the further the points stray from the straight line, the larger the deviation.
stats.probplot(col1, plot=pylab)
pylab.show()
余弦相似度:
https://baike.baidu.com/item/%E4%BD%99%E5%BC%A6%E7%9B%B8%E4%BC%BC%E5%BA%A6/17509249
箱线图:
https://baijiahao.baidu.com/s?id=1591167651227320027&wfr=spider&for=pc
最小二乘法
import numpy as np
from matplotlib import pyplot as plt
# Least squares as a projection: find the point P on the line spanned by A
# that is closest to C, then draw the geometry.
A = np.array([[5], [4]])
C = np.array([[4], [6]])
# Normal equations: l = (A^T A)^-1 A^T C, and P = A l is the projection of C.
B = A.T.dot(C)
AA = np.linalg.inv(A.T.dot(A))
l = AA.dot(B)
P = A.dot(l)
# Points on the line through the origin in direction A (one column per sample).
x = np.linspace(-2, 2, 10).reshape(1, 10)
xx = A.dot(x)
# Flatten the 2x1 column vectors so that plain scalars are handed to
# matplotlib; passing 1-element arrays where a scalar is expected relies on
# an implicit array-to-scalar conversion that recent NumPy deprecates.
a_pt, c_pt, p_pt = A.ravel(), C.ravel(), P.ravel()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xx[0, :], xx[1, :])
ax.plot(a_pt[0], a_pt[1], 'ko')
ax.plot([c_pt[0], p_pt[0]], [c_pt[1], p_pt[1]], 'r-o')  # residual segment C -> P
ax.plot([0, c_pt[0]], [0, c_pt[1]], 'm-o')  # vector O -> C
ax.axvline(x=0, color='black')
ax.axhline(y=0, color='black')
margin = 0.1  # offset so the labels do not sit on top of the markers
ax.text(a_pt[0] + margin, a_pt[1] + margin, r"A", fontsize=20)
ax.text(c_pt[0] + margin, c_pt[1] + margin, r"C", fontsize=20)
ax.text(p_pt[0] + margin, p_pt[1] + margin, r"P", fontsize=20)
ax.text(0 + margin, 0 + margin, r"O", fontsize=20)
ax.text(0 + margin, 4 + margin, r"y", fontsize=20)
ax.text(4 + margin, 0 + margin, r"x", fontsize=20)
plt.xticks(np.arange(-2, 3))
plt.yticks(np.arange(-2, 3))
ax.axis('equal')
plt.show()
最小二乘法2
def fit_linear_sgd(x, y, alpha=0.02, epsilon=0.002, max_iter=1000, on_iteration=None):
    """Fit ``y ~ sum(theta[j] * x[i][j])`` by per-sample gradient descent.

    Every pass visits the samples in order and updates all coefficients from
    the residual of the current sample. After each pass the half squared
    error over all samples is computed; training stops when that error
    changes by less than ``epsilon`` between two passes, or after
    ``max_iter`` passes, whichever comes first.

    Args:
        x: Sequence of equally long feature tuples.
        y: Sequence of targets, same length as ``x``.
        alpha: Learning rate.
        epsilon: Convergence threshold on the change of the error.
        max_iter: Upper bound on the number of passes (guards against a
            non-converging run looping forever).
        on_iteration: Optional callback ``f(theta, error)`` invoked after
            every pass that did not meet the convergence test.

    Returns:
        ``(theta, n_iter)``: the coefficient tuple and the passes performed.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length.
    """
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    theta = [0.0] * (len(x[0]) if x else 0)
    prev_error = 0.0
    n_iter = 0
    while n_iter < max_iter:
        n_iter += 1
        for features, target in zip(x, y):
            residual = sum(t * f for t, f in zip(theta, features)) - target
            # All coefficients are updated from the same residual.
            theta = [t - alpha * residual * f for t, f in zip(theta, features)]
        # The error uses the same hypothesis as the update step, i.e. every
        # coefficient (including the first) is multiplied by its feature.
        error = sum(
            (target - sum(t * f for t, f in zip(theta, features))) ** 2 / 2
            for features, target in zip(x, y)
        )
        if abs(error - prev_error) < epsilon:
            break
        prev_error = error
        if on_iteration is not None:
            on_iteration(tuple(theta), error)
    return tuple(theta), n_iter


def _report(theta, error):
    """Print the coefficients and the error after one pass."""
    print('theta0 : %f, theta1 : %f, theta2 : %f, error1 : %f' % (theta[0], theta[1], theta[2], error))


x = [(2, 0, 3), (1, 0, 3), (1, 1, 3), (1, 4, 2), (1, 2, 4)]
y = [5, 6, 8, 10, 11]
epsilon = 0.002  # convergence threshold on the change of the error
alpha = 0.02  # learning rate
max_itor = 1000  # hard cap on the number of passes
(theta0, theta1, theta2), cnt = fit_linear_sgd(x, y, alpha, epsilon, max_itor, on_iteration=_report)
print('Done: theta0 : %f, theta1 : %f, theta2 : %f' % (theta0, theta1, theta2))
print('迭代次数: %d' % cnt)
上面为什么要除以2?答:对平方误差求导时会多出一个系数 2,预先除以 2 正好把它约掉,使梯度化简为 (h(x) - y)·x,与上面参数更新的写法一致;这个常数因子不会改变最小值所在的位置。
最小二乘法3
https://www.derivative-calculator.net/
拟合
from gm.api import *
import datetime
import numpy as np
import statsmodels.api as sm
from matplotlib import pyplot as plt
import os

# NOTE(security): prefer supplying the API token through the GM_TOKEN
# environment variable. The literal below is kept only as a fallback so
# existing setups keep working; it is committed in plain text and should be
# rotated and removed.
set_token(os.environ.get("GM_TOKEN", "e8978d765c4822e5a85fcaa73e044065cf17b58b"))
# Today's date (YYYY-MM-DD) is the end of the requested history window.
day_time = datetime.datetime.now().strftime('%Y-%m-%d')
symbol = "SZSE.000651"
# Last 7 daily bars, open and close only, as a DataFrame.
data = history_n(symbol, frequency="1d", end_time=day_time, count=7, fields="close,open", df=True)
close = data["close"].values
open0 = data["open"].values
# Prepend a constant column so OLS also estimates an intercept.
# (Named so that the builtin open() is not shadowed.)
open_with_const = sm.add_constant(open0)
print('open:', open_with_const)
print('close:', close)
model = sm.OLS(close, open_with_const)
results = model.fit()
# params = [intercept, slope], matching the column order of open_with_const.
bias, weight = results.params
print(bias, " ", weight)
plt.plot(open0, close)
# Draw the fitted line across the observed range of opening prices rather
# than a hard-coded price interval, so it stays on screen for any data.
x = np.linspace(open0.min(), open0.max(), 5)
plt.plot(x, x * weight + bias)
plt.show()
指定可变长度参数
*args 表示将传入的非关键字(位置)参数收集为一个元组
**kwargs 表示将传入的关键字参数收集为一个字典
for 引用index
def print_args(*args, **kwargs):
    """Print every positional argument with its index, then every keyword argument.

    ``*args`` collects the positional arguments into a tuple and ``**kwargs``
    collects the keyword arguments into a dict; ``enumerate`` supplies the
    running index while iterating.
    """
    for i, element in enumerate(args):
        print("args %d-->%s" % (i, str(element)))
    for key, value in kwargs.items():
        print("kwargs %s-->%s" % (key, str(value)))


print_args('a', 'b', key='value')
zip
names = ['raymond', 'rachel', 'matthew']
colors = ['red', 'green', 'blue', 'yellow']
print(zip(names, colors))  # zip() returns a lazy iterator object, not a list
# list() materialises it: [('raymond', 'red'), ('rachel', 'green'), ('matthew', 'blue')]
# zip stops at the shorter input, so 'yellow' is dropped.
print(list(zip(names, colors)))
# Index-based pairing, shown for contrast: the common length has to be
# computed by hand.
n = min(len(names), len(colors))
for i in range(n):
    print(names[i], '--->', colors[i])
# Recommended, more concise form: iterate over the pairs directly.
for name, color in zip(names, colors):
    print(name, '--->', color)
根据多线程和多进程分别在CPU密集型和I/O密集型任务的执行效果可知,由于python GIL限制,多线程更适合I/O密集型应用。而对于CPU密集型的应用,为了实现更好的并行性,可使用多进程方式让CPU的其他内核加入执行。
# CPU-bound task
def count(n):
    """Busy-loop that decrements ``n`` until it is no longer positive.

    Does nothing for ``n <= 0``. The loop performs no I/O, which makes it a
    pure CPU workload. Returns ``None``.
    """
    while n > 0:
        n -= 1
# I/O-bound task
def count():
    """Stand-in for a blocking I/O wait: sleep for 10 ms and return ``None``."""
    # Imported locally so the snippet does not depend on an import that only
    # appears further down in the file.
    import time

    time.sleep(0.01)
广播
# NumPy broadcasting with a scalar: the scalar 5 is added to every element.
# NOTE(review): array_4x3_a is not defined in this snippet; presumably a
# 4x3 ndarray created elsewhere. Confirm before running.
print(array_4x3_a+5)
range
range(4)是简写,等价于range(0, 4, 1)
np.arange(4) # unlike the built-in range(), np.arange also accepts a fractional step
range和arange性能对比
import functools,time
# Decorator factory (three nesting levels) that measures execution time.
def timeit_test(number=3, repeat=3):
    """Build a decorator that times the decorated function.

    Args:
        number: How many times the function is called inside one timed round.
        repeat: How many timed rounds are run; one line is printed per round.

    Returns:
        A decorator. The decorated function prints the elapsed wall-clock
        time of every round and returns the result of the last call
        (``None`` if the function was never called because ``number`` or
        ``repeat`` is 0).
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = None
            for i in range(repeat):
                start = time.perf_counter()
                for _ in range(number):
                    # Keep the result so that decorating a function does not
                    # silently turn its return value into None.
                    result = func(*args, **kwargs)
                elapsed = time.perf_counter() - start
                print('Time of {} used: {} '.format(i, elapsed))
            return result
        return wrapper
    return decorator
# Benchmark NumPy arrays against the equivalent Python lists.
# Toggle: True times plain construction of one million elements, False times
# construction plus an element-wise multiplication.
BENCHMARK_CONSTRUCTION_ONLY = True

if BENCHMARK_CONSTRUCTION_ONLY:
    @timeit_test(number=1, repeat=1)
    def list_test():
        """Build a one-million-element Python list."""
        my_list = list(range(1000000))

    @timeit_test(number=1, repeat=1)
    def ndarray_test():
        """Build a one-million-element ndarray."""
        my_arr = np.arange(1000000)

    list_test()  # Time of 0 used: 0.04712673199999998
    ndarray_test()  # Time of 0 used: 0.0014547089999999985
else:
    @timeit_test(number=1, repeat=1)
    def list_test():
        """Build a list by appending one million doubled values in a loop."""
        my_list = []
        for num in range(1000000):
            my_list.append(num * 2.0)

    @timeit_test(number=1, repeat=1)
    def ndarray_test():
        """Build an ndarray and double it with one vectorised multiply."""
        my_arr = np.arange(1000000)
        my_arr = my_arr * 2.0

    list_test()  # Time of 0 used: 0.15243656000000003
    ndarray_test()  # Time of 0 used: 0.009769811999999989
print("*****example-3.14*****")
# np.ones(shape, dtype=None, order='C'): array of the given shape filled with 1.0
array_one = np.ones((2, 4))
print(array_one)
# Expected output:
# [[1. 1. 1. 1.]
#  [1. 1. 1. 1.]]
print("*********************\n")
print("*****example-3.15*****")
# np.full(shape, fill_value, dtype=None, order='C'): array filled with fill_value
array_full = np.full((2, 4), 10)
print(array_full)
# Expected output:
# [[10 10 10 10]
#  [10 10 10 10]]

# np.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None):
# evenly spaced values. start/stop bound the interval, num is the number of
# samples, endpoint decides whether stop itself is included.
array_linspace = np.linspace(0, 5, num=10, endpoint=False)
print(array_linspace)
np.dot(): 两个矩阵的点积
np.linalg.inv(): 计算矩阵的逆矩阵(矩阵乘法意义下的逆)
np.linalg.solve(): 求解线性矩阵方程
dataframe
df可以看成是共享同一个index的Series的集合。
# Four consecutive dates used as labels for both Series below.
dates = ['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14']

# A Series built from a list may mix value types; the dtype is then object.
s_list = pd.Series([-1.55666192, 0.127451231, "str-AA", -1.37775038], index=dates)
print(s_list)
# Expected output:
# 2019-01-11    -1.55666
# 2019-01-12    0.127451
# 2019-01-13      str-AA
# 2019-01-14    -1.37775
# dtype: object

# A Series built from a dict: the keys become the index.
s_dict = pd.Series(dict(zip(dates, [0., 1., 2., 3.])))
print(s_dict)
# Expected output:
# 2019-01-11    0.0
# 2019-01-12    1.0
# 2019-01-13    2.0
# 2019-01-14    3.0
# dtype: float64
series访问
# Series used by the access examples below.
series_access = pd.Series(
    data=[10.23, 11.24, 12.25, 13.26],
    index=['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'],
)
print(series_access)
# Expected output:
# 2019-01-11    10.23
# 2019-01-12    11.24
# 2019-01-13    12.25
# 2019-01-14    13.26
# dtype: float64

# All values as an ndarray.
print(series_access.values)
# [10.23 11.24 12.25 13.26]

# All index labels.
print(series_access.index)
# Index(['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'], dtype='object')

# One element, selected by its label.
print(series_access['2019-01-11'])
# 10.23

# Several elements, selected by a list of labels.
wanted_labels = ['2019-01-11', '2019-01-13']
print(series_access[wanted_labels])
# Expected output:
# 2019-01-11    10.23
# 2019-01-13    12.25
# dtype: float64

# The first two elements, selected by a positional slice.
print(series_access[:2])
# Expected output:
# 2019-01-11    10.23
# 2019-01-12    11.24
# dtype: float64
df生成
# Creating DataFrames

# From a dict of equally long lists: every key becomes a column.
df_list_dict = pd.DataFrame({'Close': [1., 2., 3., 5], 'Open': [1., 2., 3., 4.]},
                            index=['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'])
print(df_list_dict)  # a table of 4 rows x 2 columns
# Expected output:
#             Close  Open
# 2019-01-11    1.0   1.0
# 2019-01-12    2.0   2.0
# 2019-01-13    3.0   3.0
# 2019-01-14    5.0   4.0

# From a nested list: every inner list becomes a row.
df_list_list = pd.DataFrame([[1., 2., 3., 5], [1., 2., 3., 4.]],
                            index=['2019-01-11', '2019-01-12'],
                            columns=['Close', 'Open', 'Low', 'High'])
print(df_list_list)
# Expected output:
#             Close  Open  Low  High
# 2019-01-11    1.0   2.0  3.0   5.0
# 2019-01-12    1.0   2.0  3.0   4.0

# From a 1-D structured ndarray: the field names become the columns.
# Fields: 32-bit int, 32-bit float and a 10-byte string. 'S10' is used
# because the equivalent 'a10' spelling is deprecated as of NumPy 2.0.
ndarray_data = np.zeros(2, dtype=[('Close', 'i4'), ('Open', 'f4'), ('Low', 'S10')])
print(ndarray_data)
# [(0, 0., b'') (0, 0., b'')]
ndarray_data[:] = [(1, 2., '11.2'), (2, 3., "12.3")]
# The columns default to the field names; passing columns=... instead fixes
# both the selection and the order of the columns.
df_ndarray = pd.DataFrame(data=ndarray_data, index=['2019-01-11', '2019-01-12'])
print(df_ndarray)
# Expected output:
#             Close  Open      Low
# 2019-01-11      1   2.0  b'11.2'
# 2019-01-12      2   3.0  b'12.3'

# From a dict of Series: the indexes are aligned and gaps become NaN.
series_data = {'Close': pd.Series([1., 2., 3.], index=['2019-01-11', '2019-01-12', '2019-01-13']),
               'Open': pd.Series([1., 2., 3., 4.], index=['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'])}
df_series = pd.DataFrame(series_data)
print(df_series)
# Expected output:
#             Close  Open
# 2019-01-11    1.0   1.0
# 2019-01-12    2.0   2.0
# 2019-01-13    3.0   3.0
# 2019-01-14    NaN   4.0

# From a list of dicts: every dict is a row, missing keys become NaN.
# Without index=..., pandas numbers the rows automatically.
df_dict_list = pd.DataFrame([{'Close': 1, 'Open': 2}, {'Close': 5, 'Open': 10, 'High': 20}],
                            index=['2019-01-11', '2019-01-12'])
print(df_dict_list)
# Expected output (recent pandas keeps the keys in order of first
# appearance; older releases sorted the columns alphabetically):
#             Close  Open  High
# 2019-01-11      1     2   NaN
# 2019-01-12      5    10  20.0
df 访问
# DataFrame used by the access examples: two float columns sharing one
# four-day index.
access_index = ['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14']
series_data = {
    'Close': pd.Series([10.51, 10.52, 10.53, 10.54], index=access_index),
    'Open': pd.Series([12.31, 12.32, 12.33, 12.34], index=access_index),
}
df_access = pd.DataFrame(series_data)
print(df_access)
# Expected output:
#             Close   Open
# 2019-01-11  10.51  12.31
# 2019-01-12  10.52  12.32
# 2019-01-13  10.53  12.33
# 2019-01-14  10.54  12.34
# Accessing a DataFrame: labels, values, single columns and rows.
# The bare triple-quoted strings below record the expected printed output.
print("***********************访问全部元素 某行/列元素*******************")
# All row labels (the index)
print(df_access.index)
# Index(['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'], dtype='object')
# All column labels
print(df_access.columns)
# Index(['Close', 'Open'], dtype='object')
# Both axes at once: [row labels, column labels]
print(df_access.axes)
# [Index(['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'], dtype='object'), Index(['Close', 'Open'], dtype='object')]
# All cell values as a 2-D ndarray
print(df_access.values)
"""
[[10.51 12.31]
[10.52 12.32]
[10.53 12.33]
[10.54 12.34]]
"""
# One column, by item access or by attribute access (both give the same Series)
print(df_access['Open'])
print(df_access.Open)
"""
2019-01-11 12.31
2019-01-12 12.32
2019-01-13 12.33
2019-01-14 12.34
Name: Open, dtype: float64
"""
print(type(df_access['Open'])) # type of a single column
# <class 'pandas.core.series.Series'>
# One row: slicing with [] selects rows by position
print(df_access[0:1])
"""
Close Open
2019-01-11 10.51 12.31
"""
print(type(df_access[0:1])) # type of a row slice
# <class 'pandas.core.frame.DataFrame'>
# Label-based (.loc) and position-based (.iloc) selection.
# The banner only names iloc, but the first three examples use .loc.
print("***************************DataFrame.iloc***************************")
# .loc with a list of row labels and a list of column labels:
# row '2019-01-11', columns 'Close' and 'Open'
print(df_access.loc[['2019-01-11',],['Close','Open']])
"""
Close Open
2019-01-11 10.51 12.31
"""
# All rows, columns 'Close' and 'Open'
print(df_access.loc[:,['Close','Open']])
"""
Close Open
2019-01-11 10.51 12.31
2019-01-12 10.52 12.32
2019-01-13 10.53 12.33
2019-01-14 10.54 12.34
"""
# A single row label returns that row as a Series
print(df_access.loc['2019-01-11'])
"""
Close 10.51
Open 12.31
Name: 2019-01-11, dtype: float64
"""
# .iloc with slices: first two rows, first column
print(df_access.iloc[0:2,0:1])
"""
Close
2019-01-11 10.51
2019-01-12 10.52
"""
# First two rows, all columns
print(df_access.iloc[0:2])
"""
Close Open
2019-01-11 10.51 12.31
2019-01-12 10.52 12.32
"""
# Besides ranges, arbitrary positions can be listed:
# rows 0 and 2, columns 0 and 1
print(df_access.iloc[[0,2],[0,1]])
"""
Close Open
2019-01-11 10.51 12.31
2019-01-13 10.53 12.33
"""
# Mixing positions and labels (elements 0 and 2 of column 'Open') used to be
# written with .ix, which recent pandas no longer provides; the line is kept
# commented out for reference and the statements below show the
# .loc / .iloc replacements.
#print(df_access.ix[[0, 2], ['Open']])
"""
Open
2019-01-11 12.31
2019-01-13 12.33
"""
# Replacement 1: translate the positions into labels, then use .loc
print(df_access.index[[0, 2]])
# Index(['2019-01-11', '2019-01-13'], dtype='object')
print(df_access.loc[df_access.index[[0, 2]], ['Open']])
"""
Open
2019-01-11 12.31
2019-01-13 12.33
"""
# Replacement 2: translate the labels into positions, then use .iloc
print(df_access.columns.get_indexer(['Open'])) # [1]
print(df_access.columns.get_loc('Open')) # 1
print(df_access.iloc[[0, 2], df_access.columns.get_indexer(['Open'])])
"""
Open
2019-01-11 12.31
2019-01-13 12.33
"""
# Position of a row label
print(df_access.index.get_loc('2019-01-12')) # 1
# Selecting with a boolean condition.
print("***************************条件表达式访问元素***************************")
# Element-wise comparison produces a boolean Series (the mask)
print(df_access.Open > df_access.Open.mean())
"""
2019-01-11 False
2019-01-12 False
2019-01-13 True
2019-01-14 True
Name: Open, dtype: bool
"""
# Indexing with the mask keeps only the rows where it is True
print(df_access[df_access.Open > df_access.Open.mean()])
"""
Close Open
2019-01-13 10.53 12.33
2019-01-14 10.54 12.34
"""
# .loc accepts the mask for the rows plus a column label
print(df_access.loc[df_access.Open > df_access.Open.mean(),'Close'])
"""
2019-01-13 10.53
2019-01-14 10.54
Name: Close, dtype: float64
"""
降升采样
# NOTE(review): ts_d is not defined in this snippet; presumably a
# daily-frequency time series created elsewhere. Confirm before running.
# Downsampling: sum the values in 5-day bins.
print(ts_d.resample('5D', closed='left', label='left').sum()) # bins closed on the left, open on the right; labelled by the left edge (original note: 1 - 5)
print(ts_d.resample('5D', closed='right', label='right').sum()) # bins open on the left, closed on the right; labelled by the right edge (original note: 2 - 6)
# Upsampling to 12-hour steps: asfreq() only re-indexes, it does not fill
# the newly created slots.
# NOTE(review): recent pandas releases deprecate the upper-case 'H' alias in
# favour of 'h'; confirm against the installed version.
ts_12h_asfreq = ts_d.resample('12H').asfreq()
print(ts_12h_asfreq)
# Same upsampling, but new slots are forward-filled from the previous value.
ts_12h_ffill = ts_d.resample('12H').ffill()
print(ts_12h_ffill)
roll循环右移
print("########## deal with data #####################################################")
# A fixed seed makes every run draw the same random numbers, which eases debugging.
np.random.seed(1)
# close: 1000 samples from a normal distribution with mean 10 and std 1.
close_data = np.random.normal(10.0, 1.0, 1000)
print(f"close_data:\n {close_data[:10]}")  # first 10 values
# Expected output:
# close_data:
#  [11.62434536  9.38824359  9.47182825  8.92703138 10.86540763  7.6984613
#  11.74481176  9.2387931  10.3190391   9.75062962]

# open: close rotated right by one position, i.e. each open equals the
# previous close and the first open wraps around to the last close.
open_data = np.roll(close_data, shift=1)
print(f"open_data:\n {open_data[:10]}")  # first 10 values
# Expected output:
# open_data:
#  [ 9.81304498 11.62434536  9.38824359  9.47182825  8.92703138 10.86540763
#   7.6984613  11.74481176  9.2387931  10.3190391 ]

# high: element-wise, whichever of open and close is larger.
open_is_higher = open_data > close_data
high_data = np.where(open_is_higher, open_data, close_data)
print(f"high_data:\n {high_data[:10]}")  # first 10 values
date/period range
# 1000 consecutive calendar days starting at 2010-01-01, once as timestamps
# (DatetimeIndex) and once as periods (PeriodIndex).
date_index = pd.date_range(start='2010-01-01', periods=1000, freq='D')
period_index = pd.period_range(start='2010-01-01', periods=1000, freq='D')
np.where
# np.where(condition, a, b) picks from a where the condition holds and from b
# elsewhere, element by element.
open_is_higher = open_data > close_data
open_not_higher = open_data <= close_data
high_data = np.where(open_is_higher, open_data, close_data)  # larger of open / close
low_data = np.where(open_not_higher, open_data, close_data)  # smaller of open / close
apply map
# NOTE(review): df_stock is not defined in this snippet; presumably a numeric
# DataFrame built elsewhere. Confirm before running.
# NOTE(review): newer pandas deprecates DataFrame.applymap in favour of
# DataFrame.map; confirm against the installed version.
df_stock_object = df_stock.applymap(lambda x:'%0.2f'%x)# format every cell as a string with 2 decimals
df_stock = df_stock.round(2)# round to 2 decimals
dropna/fillna
# Drop every row that contains at least one NaN (modifies df_stock in place).
df_stock.dropna(axis=0, how='any', inplace=True)
# Alternative to dropping: back-fill each NaN from the next valid row.
# DataFrame.bfill() is used because the `method` argument of fillna() is
# deprecated in recent pandas.
df_fillna = df_stock.bfill(axis=0)
plot
import matplotlib.pyplot as plt
# Visualise the closing price between 2010-01-01 and 2012-01-01.
close_window = df_stock.loc['2010-01-01':'2012-01-01', ['close']]
# The object returned by .plot() is what the labels, title and legend are set on.
df_visual = close_window.plot(linewidth=1, figsize=(8, 6))
df_visual.set_title('From 2010-01-01 to 2012-01-01')
df_visual.set_xlabel('Time')
df_visual.set_ylabel('Close price')
df_visual.legend()
plt.show()
df合并处理
concat: 沿着一条轴(行方向/列方向)将多个对象拼接到一起
merge: 根据一个或者多个键将两个df对象横向合并
join: 根据行索引为连接键将两个df对象横向合并
df遍历
# Row-by-row traversal with a plain for loop and positional indexing.
def forin_looping(df):
    """Return a copy of ``df`` with a ``pct_change`` column, filled row by row.

    For every row, ``pct_change = (high - low) / open``. The loop is
    deliberately explicit: it is the slow baseline that the apply() and
    vectorised variants are compared against.

    Args:
        df: DataFrame with ``high``, ``low`` and ``open`` columns.

    Returns:
        A new DataFrame; the input is not modified.
    """
    # Start from a float column: the results are floats, and writing floats
    # into an integer column relies on an implicit upcast that recent pandas
    # deprecates.
    df = df.assign(pct_change=0.0)
    target_col = df.columns.get_loc('pct_change')  # loop-invariant lookup
    for i in range(df.shape[0]):
        row = df.iloc[i]
        df.iloc[i, target_col] = (row['high'] - row['low']) / row['open']
    return df
# Row-by-row traversal with DataFrame.iterrows().
def iterrows_loopiter(df):
    """Return a copy of ``df`` with a ``pct_change`` column, filled via iterrows().

    For every row, ``pct_change = (high - low) / open``; the value is written
    back through the row's index label.

    Args:
        df: DataFrame with ``high``, ``low`` and ``open`` columns.

    Returns:
        A new DataFrame; the input is not modified.
    """
    # Start from a float column: the results are floats, and writing floats
    # into an integer column relies on an implicit upcast that recent pandas
    # deprecates.
    df = df.assign(pct_change=0.0)
    for index, row in df.iterrows():
        df.loc[index, 'pct_change'] = (row['high'] - row['low']) / row['open']
    return df
# NOTE(review): df_concat and the *_test callables below are not defined in
# this snippet; presumably a DataFrame with high/low/open columns and timed
# wrappers around each variant. Confirm before running.
# apply(): the lambda is called once per row (axis=1)
df_concat['pct_change'] = df_concat.apply(lambda row: ((row['high']-row['low'])/row['open']), axis = 1)
# Vectorised arithmetic on pandas Series
df_concat['pct_change'] = (df_concat['high']-df_concat['low'])/df_concat['open']
# Vectorised arithmetic on the underlying NumPy arrays
df_concat['pct_change'] = (df_concat['high'].values-df_concat['low'].values)/df_concat['open'].values
# Recorded timings, slowest to fastest: for loop, iterrows, apply,
# Series arithmetic, ndarray arithmetic.
forin_test() # Time of 0 used: 8.462902736
iterloop_test() # Time of 0 used: 4.0023713690000005
apply_test() # Time of 0 used: 0.25229068800000043
series_test() # Time of 0 used: 0.0036549980000000204
ndarray_test() # Time of 0 used: 0.0018982859999994162
【pandas】[2] 移动窗口rolling的理解
https://blog.csdn.net/xiezhen_zheng/article/details/82319183