Tests whether a sample follows a normal distribution (D'Agostino-Pearson normality test)
import numpy as np
import pandas as pd
import scipy.stats as ss
# Generate normally distributed observations
norm_data = ss.norm.rvs(loc = 0,scale = 1,size = int(10e6)) # loc is the mean, scale the standard deviation, size the number of samples (may be a tuple)
ss.normaltest(norm_data)
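Each of these scipy tests returns a (statistic, p-value) pair; a minimal sketch of reading the result, assuming the conventional 0.05 significance level (the threshold is a convention, not part of the original):
stat, p = ss.normaltest(norm_data)
if p < 0.05: # assumed significance level
    print("reject normality")
else:
    print("normality cannot be rejected")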
Commonly used to test whether two categorical variables are associated (chi-square test of independence on a contingency table)
ss.chi2_contingency([[15,95],[85,5]])
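chi2_contingency returns four values: the chi-square statistic, the p-value, the degrees of freedom, and the table of expected frequencies; a minimal sketch of unpacking them:
chi2, p, dof, expected = ss.chi2_contingency([[15,95],[85,5]])
print(chi2, p, dof)
print(expected) # expected counts under the independence hypothesis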
Commonly used to test whether two sample means differ (independent two-sample t-test); the two samples need not have the same size
ss.ttest_ind(ss.norm.rvs(size = 500),ss.norm.rvs(size = 1000))
Commonly used to test whether the means of several groups of samples differ (one-way ANOVA)
ss.f_oneway(ss.norm.rvs(size = 5000),ss.norm.rvs(size = 10000),ss.norm.rvs(size = 5000))
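All groups above are drawn from the same distribution, so the test should usually fail to reject; a minimal sketch with one genuinely shifted group (the 0.5 shift is an assumed illustration), where the p-value should come out small:
g1 = ss.norm.rvs(size = 1000)
g2 = ss.norm.rvs(loc = 0.5,size = 1000) # mean shifted by 0.5
stat, p = ss.f_oneway(g1,g2)
print(stat, p) # p should fall far below 0.05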
A QQ plot gives a visual check of normality by comparing sample quantiles against theoretical normal quantiles
from statsmodels.graphics.api import qqplot
import matplotlib.pyplot as plt
qqplot(ss.norm.rvs(size = 50))
plt.close()
# plt.show()
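qqplot also accepts a line argument for a reference line; a minimal sketch using the documented line = "s" option (a standardized line):
qqplot(ss.norm.rvs(size = 50),line = "s") # points hugging the line suggest normality
plt.show()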
# Correlation coefficients between two series
s1 = pd.Series(np.random.randn(10))
s2 = pd.Series(np.random.randn(10))
s1.corr(s2,method = "spearman")
df = pd.DataFrame(np.array([s1,s2]).T)
df.corr() # pairwise correlation matrix, Pearson by default
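Pearson measures linear association while Spearman is rank-based; a minimal sketch contrasting the two on a monotonic but non-linear relationship (the cubic series is an assumed illustration):
a = pd.Series(np.arange(1,11,dtype = float))
b = a**3 # monotonic but non-linear
print(a.corr(b)) # Pearson is below 1
print(a.corr(b,method = "spearman")) # exactly 1.0, since the ranks coincide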
# Simple (one-variable) linear regression
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split # sklearn.cross_validation was removed from modern scikit-learn
x = np.arange(50).astype(float).reshape(-1,1) # np.float is deprecated; use the builtin float
y = 3*x + 2 + 5*np.random.random((50,1)) # y = 3x + 2 plus uniform noise
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size = 0.8)
lr = LR()
lr.fit(x_train,y_train) # fit the linear model
y_pre = lr.predict(x) # predict with the fitted model
plt.scatter(x_train,y_train,color = "b")
plt.scatter(x_test,y_test,color = "y")
plt.plot(x,y_pre,color = "r")
plt.close()
lr.coef_ # slope
lr.intercept_ # intercept
lr.score(x_test,y_test) # coefficient of determination (R^2)
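Beyond R^2, the held-out error can be checked with sklearn.metrics; a minimal sketch using mean squared error:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test,lr.predict(x_test))) # MSE on the test split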
sklearn's built-in PCA is implemented via singular value decomposition (SVD)
from sklearn.decomposition import PCA
decom = PCA(n_components = 1)
data = np.random.random((50,2))
decom.fit(data)
decom.explained_variance_ratio_ # proportion of the variance retained after the reduction
decom.fit_transform(data) # the dimensionality-reduced data
def myPCA(data,n_components = 2):
    from scipy import linalg # linear algebra
    data_cov = np.cov(data,rowvar = False) # covariance matrix of the columns
    data_mean = np.mean(data,axis = 0)
    data_temp = data - data_mean # center the data
    eig_value,eig_vector = linalg.eig(data_cov) # "eigen" means characteristic/inherent; linalg.eig computes eigenvalues and eigenvectors (np.mat is deprecated, so the plain array is passed directly)
    eig_value_index = np.argsort(eig_value)[:-(n_components+1):-1] # indices of the n_components largest eigenvalues
    eig_vector = eig_vector[:,eig_value_index]
    data_decom = np.dot(data_temp,eig_vector) # np.dot and np.matmul both perform matrix multiplication
    return data_decom,eig_value
data = np.array([[2.5,0.5,2.2,1.9,3.1,2.3,2,1,1.5,1.1],[2.4,0.7,2.9,2.2,3,2.7,1.6,1.1,1.6,0.9]]).T
myPCA(data,n_components = 1)
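As a sanity check, the hand-rolled version can be compared against sklearn's PCA (imported above) on the same data; the projections should agree up to sign, since eigenvectors are only defined up to sign:
pca = PCA(n_components = 1)
print(pca.fit_transform(data)[:5]) # may differ from myPCA only in sign
print(np.real(myPCA(data,n_components = 1)[0][:5])) # linalg.eig returns a complex dtype whose imaginary parts are zero here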
import seaborn as sns
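The barplot calls below assume a DataFrame with columns a, b and c, which the df built earlier does not have; a minimal sketch of a suitable frame (the column contents are assumed illustrations):
df = pd.DataFrame({"a":np.random.randint(0,10,20),
                   "b":np.random.randn(20),
                   "c":np.random.choice(["g1","g2"],20)})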
sns.barplot(data = df,x = "a",y = "b",hue = "c")
sns.barplot(x = list(range(len(df['a']))),y = df['a'].sort_values()) # newer seaborn requires keyword arguments here
# Define the sum-of-squared-probabilities function (used inside the Gini computation):
def getProbSS(s):
    import pandas as pd
    import numpy as np
    if not isinstance(s,pd.Series):
        s = pd.Series(s)
    return sum((s.groupby(s).count().values/float(len(s)))**2) # the removed top-level pd.groupby is replaced by Series.groupby
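A quick check: for a sample with class probabilities 2/3 and 1/3 the result should be (2/3)**2 + (1/3)**2 = 5/9:
print(getProbSS(["a","a","b"])) # ≈ 0.556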
# Define the Gini impurity function
def getGiNi(s1,s2):
    """
    s1 defines the grouping (the splitting feature); the return value is the
    Gini impurity of s2 conditioned on s1.
    """
    import pandas as pd
    import numpy as np
    dict_temp = {}
    for i in range(len(s1)):
        dict_temp[s1[i]] = dict_temp.get(s1[i],[]) + [s2[i]]
    # each group is weighted by its share of the sample, len(value)/len(s1)
    return 1 - sum([getProbSS(value)*float(len(value))/float(len(s1)) for value in dict_temp.values()])
s1 = ["x1","x1","x2","x2","x2","x2"]
s2 = ["y1","y1","y1","y2","y2","y2"]
getGiNi(s1,s2) # ≈ 0.25 for this example
# __________ Correlation measures for discrete data
s1 = ["x1","x1","x2","x2","x2","x2"]
s2 = ["y1","y1","y1","y2","y2","y2"]
# Define the entropy function
def getEntropy(s):
    """
    Entropy measures uncertainty: as the entropy approaches 0,
    the uncertainty becomes small.
    """
    import pandas as pd
    import numpy as np
    if not isinstance(s,pd.Series):
        s = pd.Series(s)
    prob_dist = s.groupby(s).count().values/float(len(s))
    return -(prob_dist*np.log2(prob_dist)).sum()
# Define the conditional entropy function
def getCondEntropy(s1,s2):
    """
    Compute the entropy of s2 within each group of s1,
    i.e. the conditional entropy H(s2|s1).
    """
    import pandas as pd
    import numpy as np
    if not isinstance(s1,pd.Series):
        s1 = pd.Series(s1)
    if not isinstance(s2,pd.Series):
        s2 = pd.Series(s2)
    dict_temp = {}
    for i in np.arange(len(s1)):
        dict_temp[s1[i]] = dict_temp.get(s1[i],[]) + [s2[i]]
    # weight each group's entropy by its share of the sample
    return sum([getEntropy(value)*float(len(value))/float(len(s1)) for value in dict_temp.values()])
# Define mutual information, i.e. the entropy gain (information gain)
def getEntropyGain(s1,s2):
    """
    Entropy gain of s2 given the grouping s1: H(s2) - H(s2|s1).
    """
    return getEntropy(s2) - getCondEntropy(s1,s2)
# Define the entropy gain ratio
def getEntropyGainRatio(s1,s2):
    return getEntropyGain(s1,s2)/getEntropy(s2)
# Define the entropy-based correlation function
def getDiscreteRelation(s1,s2):
    """
    Correlation coefficient for discrete variables: the entropy gain
    normalized by sqrt(H(s1)*H(s2)).
    """
    return getEntropyGain(s1,s2)/(getEntropy(s1)*getEntropy(s2))**0.5
getDiscreteRelation(s1,s2)
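For this s1/s2 example the quantities can be checked by hand: H(s2) = 1 bit, H(s2|s1) = (2/6)*0 + (4/6)*H(1/4,3/4) ≈ 0.541, so the gain is ≈ 0.459 and the normalized relation ≈ 0.479:
print(getEntropy(s2)) # 1.0
print(getCondEntropy(s1,s2)) # ≈ 0.541
print(getEntropyGain(s1,s2)) # ≈ 0.459
print(getEntropyGainRatio(s1,s2)) # ≈ 0.459, since H(s2) is exactly 1 bit here
print(getDiscreteRelation(s1,s2)) # ≈ 0.479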
import datetime
from factor_analyzer import FactorAnalyzer
# CyrusMetrics, PlotTool, StandardTool, save_to_excel and save_var below are the
# author's external helper utilities, assumed to be defined elsewhere in the codebase
class CyrusFactorAnalysis():
    def __init__(self,logger=None):
        self.logger = logger
        self.metric_tool = CyrusMetrics(logger=self.logger)
        self.plot_tool = PlotTool(self.logger)
    def select_factor_nums(self,data):
        self.standard_tool = StandardTool(data)
        std_data = self.standard_tool.transform_x(data)
        self.factor_tool = FactorAnalyzer(n_factors=data.shape[1], rotation="promax")
        self.factor_tool.fit(std_data) # the analyzer must be fitted before the variance can be queried
        var = self.factor_tool.get_factor_variance()
        save_to_excel() # presumably meant to persist var via the author's helper
    def run_factor_analysis(self,data,n_factor=2):
        self.standard_tool = StandardTool(data)
        std_data = self.standard_tool.transform_x(data)
        self.factor_tool = FactorAnalyzer(n_factors=n_factor, rotation="promax")
        process_data = self.factor_tool.fit_transform(std_data)
        factor_data = self.factor_tool.loadings_
        weights = self.factor_tool.weights_
        var = self.factor_tool.get_factor_variance()
        save_to_excel([(pd.DataFrame(factor_data),"loading matrix"),
                       (pd.DataFrame(process_data),"factor scores"),
                       (pd.DataFrame(weights),"factor weights"),
                       (pd.DataFrame(var),"variance explained")],
                      path="FactorAnalysisResult_{}".format(datetime.datetime.now().strftime("%Y-%m-%d")))
    def transform(self,data):
        std_data = self.standard_tool.transform_x(data)
        factor_data = self.factor_tool.transform(std_data)
        return factor_data
    def save_model(self):
        save_var(self.factor_tool,path="FactorAnalysisModel_{}".format(datetime.datetime.now().strftime("%Y-%m-%d")))
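A minimal usage sketch, assuming the external helpers above are available and the input is a numeric DataFrame (both assumptions, not shown in the original):
fa = CyrusFactorAnalysis() # hypothetical usage; requires the author's helpers to be defined
fa.run_factor_analysis(pd.DataFrame(np.random.random((100,6))),n_factor=2)
reduced = fa.transform(pd.DataFrame(np.random.random((10,6))))
fa.save_model()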
# by CyrusMay 2022-04-05