This series of articles distills the most essential points of *Introduction to Machine Learning with Python* (《Python机器学习基础教程》).
1. One-Hot encoding (dummy variables)
import pandas as pd
print(data.gender.value_counts())
# check the column's contents before encoding
data_dummies = pd.get_dummies(data)
features = data_dummies.loc[:, 'feature_1':'feature_k']
X = features.values
y = data_dummies['target_1'].values
# in binary classification the target is also expanded into two columns;
# taking just one of them yields exactly the 0/1 variable we need
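A minimal, self-contained sketch (with a made-up toy DataFrame) of what get_dummies produces:
import pandas as pd
# hypothetical toy data: one categorical and one numeric column
demo = pd.DataFrame({'gender': ['male', 'female', 'female'],
                     'age': [23, 31, 45]})
print(pd.get_dummies(demo))
# the numeric column 'age' passes through unchanged, while 'gender' is
# expanded into the indicator columns gender_female and gender_male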
2. Categorical variables encoded as integers
Either use OneHotEncoder, or convert the column to strings so that get_dummies treats it as categorical:
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)
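A quick check of the difference, using the kind of toy frame the line above assumes:
import pandas as pd
demo_df = pd.DataFrame({'Integer Feature': [0, 1, 2, 1],
                        'Categorical Feature': ['socks', 'fox', 'socks', 'box']})
print(pd.get_dummies(demo_df).columns.tolist())
# the integer column is left as-is by default
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)
print(pd.get_dummies(demo_df).columns.tolist())
# after the cast, the integers 0/1/2 get their own indicator columns too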
1. Binning continuous features: the basic operation
import numpy as np
bins = np.linspace(-3, 3, 11)
# 11 evenly spaced boundaries define 10 equal bins between -3 and 3
which_bin = np.digitize(X, bins=bins)
# record which bin each data point falls into
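X_binned, used in the snippets that follow, is the one-hot encoding of these bin indices; a runnable sketch with made-up data:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=(100, 1))         # toy single feature (assumption)
bins = np.linspace(-3, 3, 11)
which_bin = np.digitize(X, bins=bins)
encoder = OneHotEncoder(sparse_output=False)  # use sparse=False on scikit-learn < 1.2
X_binned = encoder.fit_transform(which_bin)   # one 0/1 column per bin
print(X_binned.shape)                         # (100, 10)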
2. Properties
Within a single bin, any model predicts the same value for every point (checked in the sketch below).
Binned features can improve the performance of linear models, but they usually lower the accuracy of decision-tree models, since trees can learn equivalent splits on their own.
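A quick check of the constant-per-bin claim, reusing X and X_binned from the sketch above (the target y is made up here):
from sklearn.linear_model import LinearRegression
y = np.sin(X).ravel() + rng.normal(scale=0.3, size=len(X))  # toy target (assumption)
reg = LinearRegression().fit(X_binned, y)
pred = reg.predict(X_binned)
# with only bin-membership features there are at most 10 distinct predictions,
# one per bin -- every point inside a bin gets the same value
print(np.unique(pred.round(8)).size)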
1. Expanding continuous features with binning
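The snippets below also use a dense evaluation grid line (for plotting predictions); a sketch of that missing setup, assuming bins and encoder from the binning example:
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)  # plotting grid
line_binned = encoder.transform(np.digitize(line, bins=bins))   # binned the same way as X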
# adding back the original feature gives one shared slope across all bins (plus a per-bin offset)
X_combined = np.hstack([X, X_binned])
reg = LinearRegression().fit(X_combined, y)
line_combined = np.hstack([line, line_binned])
reg.predict(line_combined)
# product (interaction) features instead give each bin its own slope
X_product = np.hstack([X_binned, X * X_binned])
reg = LinearRegression().fit(X_product, y)
line_product = np.hstack([line_binned, line * line_binned])
reg.predict(line_product)
2. Polynomials of the original feature
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=10, include_bias=False)
# polynomials up to x**10; the default include_bias=True would add an extra constant feature that is always 1
poly.fit(X)
X_poly = poly.transform(X)
poly.get_feature_names_out()
# shows the semantics of the generated features (get_feature_names() on older scikit-learn versions)
reg = LinearRegression().fit(X_poly, y)
line_poly = poly.transform(line)
reg.predict(line_poly)
Univariate nonlinear transformations: log and exp can adjust the relative scale of the data, which can improve learning for linear models or neural networks.
X_train_log = np.log(X_train)
X_train_exp = np.exp(X_train)
When to use: when the relationship between the features and the target is nonlinear (typically for skewed, count-like features).
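A toy sketch of the log transform (the Poisson count data is made up); log(X + 1) is the usual variant when zero counts are present:
import numpy as np
rng = np.random.RandomState(0)
X_train = rng.poisson(lam=3, size=(100, 3))  # toy count features (assumption)
X_train_log = np.log(X_train + 1)            # +1 keeps the log defined at zero counts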
1. Overview (feature selection)
Why select features: too many features make the model more complex, which increases the risk of overfitting.
Basic strategies for judging how useful each feature is: univariate statistics, model-based selection, and iterative selection (all three are supervised methods, so they need the target labels).
2. Univariate statistics
from sklearn.feature_selection import SelectPercentile
import matplotlib.pyplot as plt
select = SelectPercentile(percentile=50)
# keep the top 50% of the features
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)
mask = select.get_support()
# get_support shows which features were selected
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# visualize the mask: black is True (kept), white is False (dropped)
plt.xlabel("Feature index")
plt.yticks(())
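Comparing performance with and without the selection makes the effect visible; a sketch assuming X_test/y_test exist alongside the training split:
from sklearn.linear_model import LogisticRegression
X_test_selected = select.transform(X_test)
lr = LogisticRegression(max_iter=1000)  # max_iter raised to avoid convergence warnings
lr.fit(X_train, y_train)
print("Score with all features: {:.3f}".format(lr.score(X_test, y_test)))
lr.fit(X_train_selected, y_train)
print("Score with only selected features: {:.3f}".format(lr.score(X_test_selected, y_test)))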
3. Model-based feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold="median")
# threshold="median" keeps half of the features
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
X_test_l1 = select.transform(X_test)
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print("Test score: {:.3f}".format(score))
4. Iterative feature selection (recursive feature elimination, RFE)
from sklearn.feature_selection import RFE
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=40)
# RFE repeatedly drops the least important feature until 40 remain
select.fit(X_train, y_train)
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
score = LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
print("Test score: {:.3f}".format(score))
1. Overview (cross-validation)
Cross-validation: a statistical method for evaluating generalization performance; the dataset is split repeatedly and a model is trained for each split.
Advantage: the multiple splits show how sensitive the model is to the exact choice of training set.
Drawback: increased computational cost.
Purpose: to evaluate how well a given algorithm generalizes when trained on a particular dataset; it is not a method for building a model that can be applied to new data.
2. Basic usage (k-fold cross-validation)
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
iris = load_iris()
logreg = LogisticRegression()
scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
# cv sets the number of cross-validation folds
print("Cross-validation scores: {}".format(scores))
3. Stratified k-fold cross-validation and other strategies (in stratified k-fold, each fold has the same class proportions as the whole dataset; it is the default for classifiers)
from sklearn.model_selection import KFold
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
# shuffle=True shuffles the data first (a substitute for stratification here; fixing random_state makes the shuffle reproducible)
print("Cross-validation scores:\n{}".format(cross_val_score(logreg, iris.data, iris.target, cv=kfold)))
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)
# leave-one-out uses a single data point as the test set in every split;
# very time-consuming, so it is best suited to small datasets
print("Number of cv iterations: ", len(scores))
print("Mean accuracy: {:.2f}".format(scores.mean()))
from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
# each split samples train_size points for training and test_size points for testing,
# repeated n_splits times; choosing train_size + test_size < 1 subsamples the data on every iteration
scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print("Cross-validation scores:\n{}".format(scores))
from sklearn.model_selection import GroupKFold
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=12, random_state=0)  # toy data matching the 12 group labels
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
scores = cross_val_score(logreg, X, y, groups=groups, cv=GroupKFold(n_splits=3))
# in every split, each group lands entirely in either the training set or the test set
print("Cross-validation scores:\n{}".format(scores))
1. Overview (grid search)
Goal: find the parameter values of a model that give the best generalization performance.
A naive hand-rolled implementation:
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        # caution: keeping the best score here would select parameters on the test set,
        # which leaks information -- hence the validation set introduced next
2. Reducing the risk of overfitting the parameters: a validation set
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)
# inside the search loop, fit each candidate on X_train and score it on the validation set
score = svm.score(X_valid, y_valid)
# then rebuild the best model on the combined training+validation data and evaluate it once on the test set
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
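A fuller sketch of the selection loop these fragments belong to (SVC and the splits as above):
from sklearn.svm import SVC
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C).fit(X_train, y_train)
        score = svm.score(X_valid, y_valid)  # select on the validation set
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
svm = SVC(**best_parameters).fit(X_trainval, y_trainval)  # refit on train+valid
test_score = svm.score(X_test, y_test)                    # one final, untouched estimate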
3. Grid search with cross-validation
# instead of a single validation split, cross-validate each parameter combination
scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
score = np.mean(scores)
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
# the parameters to search over are specified as a dict
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)
# a train/test split is still required: GridSearchCV cross-validates only on the data it is given
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
# the test set was not used to select the parameters: best_score_ is the cross-validation
# accuracy on the training set, which differs from the score computed earlier on the full dataset
results = pd.DataFrame(grid_search.cv_results_)
# the results table is a good candidate for a heat-map visualization
param_grid = [{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100],
               'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
              {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
# a list of dicts searches over conditional (non-rectangular) grids: gamma only matters for the rbf kernel
1. Regression metrics: the default R^2 (returned by the score method of every regressor) is usually sufficient.
2. Imbalanced datasets: one class accounts for the vast majority of samples, which makes plain accuracy misleading.
3. Metrics for binary classification
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test, pred_logreg)
# confusion is a 2x2 array: rows are the true classes, columns are the predicted classes
from sklearn.metrics import classification_report
print(classification_report(y_test, svc.predict(X_test)))
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test, svc.decision_function(X_test))
plt.plot(precision, recall, label="precision recall curve")
from sklearn.metrics import average_precision_score
ap_svc = average_precision_score(y_test, svc.decision_function(X_test))
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))
plt.plot(fpr, tpr, label="ROC Curve")
from sklearn.metrics import roc_auc_score
svc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
4. Metrics for multiclass classification
f1_score(y_test, pred, average="micro")
f1_score(y_test, pred, average="macro")
5. Using evaluation metrics in model selection
roc_auc = cross_val_score(SVC(), X, y == 1, scoring="roc_auc")
# the default scoring for classification is "accuracy"; scoring="roc_auc" switches to AUC
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
# the same scoring argument works for GridSearchCV
Data splits: training set for fitting models; validation set for selecting models and parameters; test set for the final evaluation.
Keep the end goal of the machine-learning task in mind: the metric used for model evaluation and selection should be a good proxy for how the model will actually be used.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])
# two named steps: "scaler" is a MinMaxScaler instance and "svm" an SVC instance
pipe.fit(X_train, y_train)
# fitting the pipeline fits each step in turn
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))
# the pipeline's own score method evaluates the whole chain
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
# parameter grid for a pipeline: step name + double underscore + parameter name
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
# MinMaxScaler is now refit on only the training part of each split, so no information
# from the validation folds or the test set leaks into the parameter search
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))
1. Creating pipelines conveniently with make_pipeline
from sklearn.pipeline import make_pipeline
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))
Difference: the step names are assigned automatically (the lowercased class names) and can be inspected via the steps attribute.
2. Accessing step attributes with named_steps
components = pipe.named_steps["pca"].components_
# assumes the pipeline contains a step named "pca", as in the sketch below
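A self-contained sketch (the "pca" step name comes from make_pipeline lowercasing the class name):
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
pipe = make_pipeline(StandardScaler(), PCA(n_components=2))
pipe.fit(load_iris().data)
components = pipe.named_steps["pca"].components_
print(components.shape)  # (2, n_features) -- one row per principal component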
3. Accessing attributes inside a grid-searched pipeline
print("Best estimator:\n{}".format(grid.best_estimator_))
# access the best model found by GridSearchCV, here a pipeline ending in a LogisticRegression step
print("Logistic regression coefficients:\n{}".format(grid.best_estimator_.named_steps["logisticregression"].coef_))
# the coefficients are the weights associated with each input feature
1. Goal: use the outcome of the supervised task to tune the preprocessing parameters.
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())
# scaling, polynomial feature extraction and ridge regression in a single pipeline
param_grid = {'polynomialfeatures__degree': [1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
# the grid search chooses the polynomial degree together with the regularization strength
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
2. Tip: visualize the cross-validation results as a heat map
plt.matshow(grid.cv_results_['mean_test_score'].reshape(3, -1), vmin=0, cmap='viridis')
# one row per polynomial degree, one column per alpha value
plt.xlabel('ridge__alpha')
plt.ylabel('polynomialfeatures__degree')
plt.xticks(range(len(param_grid['ridge__alpha'])), param_grid['ridge__alpha'])
plt.yticks(range(len(param_grid['polynomialfeatures__degree'])), param_grid['polynomialfeatures__degree'])
plt.colorbar()
3. Inspect the best parameters and the corresponding score
print("Best parameters: {}".format(grid.best_params_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
Use a list of search grids to compare whole models; to skip a step (e.g. a random forest needs no preprocessing), set it to None:
param_grid = [{'classifier': [SVC()], 'preprocessing': [StandardScaler(), None],
               'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
               'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
              {'classifier': [RandomForestClassifier(n_estimators=100)],
               'preprocessing': [None], 'classifier__max_features': [1, 2, 3]}]
With the grid defined this way, the rest of the workflow is unchanged; a sketch of the matching pipeline follows.
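For this grid to match, the pipeline's steps must be named preprocessing and classifier; a sketch under that assumption (X_train, y_train as before):
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())])
grid = GridSearchCV(pipe, param_grid, cv=5)  # param_grid is the list defined above
grid.fit(X_train, y_train)
print("Best params:\n{}".format(grid.best_params_))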