Operations I keep running into during day-to-day data processing and then forgetting, collected here so I can look them up directly later.
sklearn official documentation
Pearson correlation coefficient: compares the correlation between a feature and the target. (It measures the linear correlation between two variables X and Y; the value lies between -1 and 1.)
import numpy as np
pccs = np.corrcoef(x, y)  # returns the 2x2 correlation matrix; the coefficient itself is pccs[0, 1]
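A minimal sketch with made-up values (x and y below are illustrative), showing how to pull the single coefficient out of the matrix np.corrcoef returns:
import numpy as np
x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.1, 3.9, 6.2, 8.1])
print(np.corrcoef(x, y)[0, 1])  # close to 1.0, since the two series are almost perfectly linearly related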
Append a row or column to a numpy array
import numpy as np
x = np.array([[1, 2], [3, 4]])  # note the nested brackets for a 2-D array
# row
row = np.array([5, 6])
x = np.row_stack((x, row))  # equivalent to np.vstack
# column
column = np.array([3, 5, 7])  # x now has 3 rows, so the column needs 3 values
y = np.column_stack((x, column))
Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
tmp = []
tmp.append(df.iloc[:, -1])   # column to compare against
tmp.append(df.iloc[:, 4])    # wdir
tmp.append(df.iloc[:, -6])   # ndir
sim = cosine_similarity(tmp)  # pairwise similarity matrix of the three columns
print("wdir:%.3f ndir:%.3f" % (sim[0][1], sim[0][2]))
Get the indices of the N largest values in an array
# Get the indices of the N largest values in a numpy array using numpy's argsort
import numpy as np
arr = np.array([1, 3, 2, 4, 5])
arr.argsort()[-3:][::-1]
# Result:
# Out[3]: array([4, 3, 1])
Feature importance ranking with a random forest
# import libraries
from sklearn.ensemble import RandomForestClassifier
import numpy as np
forest = RandomForestClassifier(n_estimators=50)
forest.fit(x_train, y_train)  # x_train, y_train are your own data
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices sorted by descending importance
print(indices)  # print the ranking
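If x_train is a pandas DataFrame, the ranking is easier to read when paired with column names; a small sketch continuing from the snippet above:
for idx in indices:
    print("%s: %.4f" % (x_train.columns[idx], importances[idx]))  # feature name and its importance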
A handy use of np.argmax on multi-class softmax output
res = np.argmax(onehot, axis=-1)  # convert softmax probabilities (or one-hot rows) to class indices
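A minimal sketch with a made-up 3-class softmax output (the values are illustrative):
import numpy as np
probs = np.array([[0.1, 0.7, 0.2],
                  [0.8, 0.1, 0.1]])  # two samples, three classes
print(np.argmax(probs, axis=-1))     # array([1, 0]): the predicted class of each sample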
Splitting a dataset
# randomly split the data (80/20)
seed = 1234
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                   random_state=seed, shuffle=True, stratify=y)  # shuffle defaults to True
PCA
from sklearn.decomposition import PCA
t_feature_num = 3
pca = PCA(n_components=t_feature_num)  # keep 3 principal components
df = pca.fit_transform(df)  # note: the result is a numpy array, not a DataFrame
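To check how much variance the kept components retain (continuing from the snippet above):
print(pca.explained_variance_ratio_)        # variance ratio of each component
print(pca.explained_variance_ratio_.sum())  # total variance retained by the 3 components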
Shuffle the rows of a DataFrame
from sklearn.utils import shuffle
df = shuffle(df)  # returns a shuffled copy; pass random_state=... for reproducibility
Normalization / standardization
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# instantiate a scaler
transfer = MinMaxScaler()
# call fit_transform
data_new = transfer.fit_transform(df)
# to undo the scaling and recover the original values:
# data_old = transfer.inverse_transform(data_new)
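StandardScaler (imported above) is used the same way when zero-mean / unit-variance standardization is wanted instead of min-max scaling; a minimal sketch continuing from the snippet above:
transfer = StandardScaler()
data_std = transfer.fit_transform(df)  # each column now has mean 0 and standard deviation 1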
Accuracy, Precision, Recall, F1, ROC AUC
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score
print("acc:",accuracy_score(y_test, y_predict))
print("pre:",precision_score(y_test, y_predict))#多分类加 average='micro'
print("recall:",recall_score(y_test, y_predict))
print("f1:",f1_score(y_test, y_predict))
print("roc:",roc_auc_score(y_test, y_predict))
Using MSE, MAE, RMSE, R2
# regression metrics: MSE, RMSE, MAE, R2
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
print("mean_absolute_error:", mean_absolute_error(y_test, y_predict))
print("mean_squared_error:", mean_squared_error(y_test, y_predict))
print("rmse:", sqrt(mean_squared_error(y_test, y_predict)))
print("r2 score:", r2_score(y_test, y_predict))
List all model evaluation metrics available in sklearn
(These names can be passed as the scoring parameter of RandomizedSearchCV or GridSearchCV.)
import sklearn.metrics
sklearn.metrics.SCORERS.keys()  # newer sklearn versions removed SCORERS; use sklearn.metrics.get_scorer_names() instead
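A minimal sketch (the model and parameter grid here are made up) showing one of these names used as scoring:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [50, 100]}  # hypothetical grid
search = GridSearchCV(RandomForestClassifier(), param_grid, scoring="f1", cv=5)  # any name from the list above
# search.fit(x_train, y_train)  # x_train, y_train are your own data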