import warnings
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
warnings.filterwarnings('ignore')
This part isn't essential; it just plots the different split-quality criteria for comparison, so feel free to skip it.
# Plot the criteria
plt.figure(figsize=(6, 4))
xx = np.linspace(0, 1, 50)
plt.plot(xx, [2 * x * (1-x) for x in xx], label='gini')
plt.plot(xx, [4 * x * (1-x) for x in xx], label='2*gini')
plt.plot(xx, [-x * np.log2(x) - (1-x) * np.log2(1 - x)
              for x in xx], label='entropy')
plt.plot(xx, [1 - max(x, 1-x) for x in xx], label='misclass')
plt.plot(xx, [2 - 2 * max(x, 1-x) for x in xx], label='2*misclass')
plt.xlabel('p+')
plt.ylabel('criterion')
plt.title('Criteria of quality as a function of p+ (binary classification)')
plt.legend()
The resulting plot is shown below:
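For reference, the quantities being plotted, written as plain functions of p+ (the probability of the positive class), mirror the expressions inline in the plot calls above:

def entropy(p):
    # Shannon entropy of a binary split with positive-class probability p
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

def gini(p):
    # Gini impurity in the binary case: 2 * p * (1 - p)
    return 2 * p * (1 - p)

def misclass(p):
    # misclassification error: 1 - max(p, 1 - p)
    return 1 - max(p, 1 - p)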
from sklearn.tree import DecisionTreeClassifier
# Helper function that builds the mesh grid used to visualize the decision boundary
def get_grid(data):
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    # These two steps bound the extent of the grid
    return np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
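The snippet below assumes train_data and train_labels already exist (presumably built earlier in the original post). For a self-contained run, a toy two-class dataset such as the following would work; the blob locations here are hypothetical, not from the original:

np.random.seed(17)
# two Gaussian blobs in 2-D: class 0 around the origin, class 1 shifted to (2, 2)
train_data = np.r_[np.random.normal(size=(100, 2)),
                   np.random.normal(size=(100, 2), loc=2)]
train_labels = np.r_[np.zeros(100), np.ones(100)]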
# max_depth limits the depth of the tree; random_state fixes the random seed;
# for this binary classification task we use entropy (information gain) as the split criterion
# Step 1: define the model
clf_tree = DecisionTreeClassifier(criterion='entropy', max_depth=6,
                                  random_state=17)
# Step 2: train the decision tree
clf_tree.fit(train_data, train_labels)
# In principle we could call predict right away, but let's visualize the decision boundary first
xx, yy = get_grid(train_data)  # xx and yy are the coordinate matrices returned by meshgrid
# Step 3: predict on every grid point
predicted = clf_tree.predict(np.c_[xx.ravel(),
                                   yy.ravel()]).reshape(xx.shape)
# np.c_ concatenates the two arrays column-wise; ravel flattens each to 1-D
# The plotting below is optional
plt.pcolormesh(xx, yy, predicted, cmap='autumn')
plt.scatter(train_data[:, 0], train_data[:, 1], c=train_labels, s=100,
            cmap='autumn', edgecolors='black', linewidth=1.5)
StringIO() opens an in-memory buffer to hold the tree. export_graphviz() exports the tree's GraphViz representation in DOT format and writes it to out_file. graph_from_dot_data() then reads that data back, and Image() displays the tree.
!pip install pydotplus  # install the required module
from ipywidgets import Image
from io import StringIO  # in-memory buffer
import pydotplus
from sklearn.tree import export_graphviz  # exports the tree structure in DOT format
dot_data = StringIO()
export_graphviz(clf_tree, feature_names=['x1', 'x2'],
                out_file=dot_data, filled=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(value=graph.create_png())
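As an aside (not in the original post), recent scikit-learn versions can draw the tree directly, without the pydotplus/GraphViz dependency, via sklearn.tree.plot_tree:

from sklearn.tree import plot_tree

plt.figure(figsize=(12, 8))
plot_tree(clf_tree, feature_names=['x1', 'x2'], filled=True)
plt.show()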
sklearn's decision tree, DecisionTreeClassifier, has three main parameters: max_depth, max_features, and min_samples_leaf.
n_train = 150
n_test = 1000
noise = 0.1
def f(x):
    x = x.ravel()
    return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2)
def generate(n_samples, noise):
    X = np.random.rand(n_samples) * 10 - 5
    X = np.sort(X).ravel()
    y = np.exp(-X ** 2) + 1.5 * np.exp(-(X - 2) ** 2) + \
        np.random.normal(0.0, noise, n_samples)
    X = X.reshape((n_samples, 1))
    return X, y
# You can jump straight to here: the next two lines build the training and test data.
# The exact format depends on the actual task.
X_train, y_train = generate(n_samples=n_train, noise=noise)
X_test, y_test = generate(n_samples=n_test, noise=noise)
# Step 1: define the tree model
from sklearn.tree import DecisionTreeRegressor
reg_tree = DecisionTreeRegressor(max_depth=5, random_state=17)
# Step 2: train
reg_tree.fit(X_train, y_train)
# Step 3: predict
reg_tree_pred = reg_tree.predict(X_test)
# Some visualization below
plt.figure(figsize=(10, 6))  # create the figure (canvas) with the given size
plt.plot(X_test, f(X_test), "b")  # plot the true function; "b" sets the line color to blue
plt.scatter(X_train, y_train, c="b", s=20)  # scatter plot of the training points
plt.plot(X_test, reg_tree_pred, "g", lw=2)
plt.xlim([-5, 5])
plt.title("Decision tree regressor, MSE = %.2f" %
(np.sum((y_test - reg_tree_pred) ** 2) / n_test))
plt.show()
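The MSE in the title is computed by hand; as a cross-check (an optional extra, not in the original), it matches scikit-learn's mean_squared_error:

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, reg_tree_pred)  # same value as the manual sum / n_test
print("MSE = %.2f" % mse)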
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
# Split the data (df is the feature DataFrame and y the target vector prepared earlier)
X_train, X_holdout, y_train, y_holdout = train_test_split(df.values, y, test_size=0.3,
                                                          random_state=17)
# Use both a tree and a KNN model, since we'll compare them later
tree = DecisionTreeClassifier(max_depth=5, random_state=17)
knn = KNeighborsClassifier(n_neighbors=10)
# Train both models
tree.fit(X_train, y_train)
knn.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
# Predict with each model and check accuracy on the holdout set
tree_pred = tree.predict(X_holdout)
accuracy_score(y_holdout, tree_pred)
knn_pred = knn.predict(X_holdout)
accuracy_score(y_holdout, knn_pred)
from sklearn.model_selection import GridSearchCV, cross_val_score
# Pass the parameter ranges as a dict; each value is an iterable of candidate values
tree_params = {'max_depth': range(5, 7),
               'max_features': range(16, 18)}
# GridSearchCV performs the grid search; the fitted tree_grid object supports many operations
tree_grid = GridSearchCV(tree, tree_params,
                         cv=5, n_jobs=-1, verbose=True)
# For example, we can fit it on the training data
tree_grid.fit(X_train, y_train)
# We can also retrieve the best parameters and the mean cross-validation accuracy
print(tree_grid.best_params_)
print(tree_grid.best_score_)
print(accuracy_score(y_holdout, tree_grid.predict(X_holdout)))
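Beyond best_params_ and best_score_, the fitted grid object also exposes the full search results; for example (an optional extra):

# Cross-validation results for every parameter combination, as a DataFrame
cv_results = pd.DataFrame(tree_grid.cv_results_)
print(cv_results[['params', 'mean_test_score', 'std_test_score']])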
dot_data = StringIO()
export_graphviz(tree_grid.best_estimator_, feature_names=df.columns,
                out_file=dot_data, filled=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(value=graph.create_png())
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Use a Pipeline to chain the scaler and the model: each step's output feeds the next,
# so standardization is always applied before KNN (during both fit and predict)
knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('knn', KNeighborsClassifier(n_jobs=-1))])
knn_params = {'knn__n_neighbors': range(6, 8)}
knn_grid = GridSearchCV(knn_pipe, knn_params,
                        cv=5, n_jobs=-1,
                        verbose=True)
knn_grid.fit(X_train, y_train)
knn_grid.best_params_, knn_grid.best_score_
accuracy_score(y_holdout, knn_grid.predict(X_holdout))
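Scaling matters here because KNN is distance-based: without StandardScaler, features on large scales would dominate the distances. The fitted pipeline inside the grid can be inspected step by step via named_steps (a small extra, not from the original post):

# best_estimator_ is the whole fitted pipeline; individual steps are accessible by name
best_pipe = knn_grid.best_estimator_
print(best_pipe.named_steps['knn'].n_neighbors)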
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                random_state=17)
np.mean(cross_val_score(forest, X_train, y_train, cv=5))
forest_params = {'max_depth': range(8, 10),
                 'max_features': range(5, 7)}
forest_grid = GridSearchCV(forest, forest_params,
                           cv=5, n_jobs=-1, verbose=True)
forest_grid.fit(X_train, y_train)
print(forest_grid.best_params_, forest_grid.best_score_)
print(accuracy_score(y_holdout, forest_grid.predict(X_holdout)))
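A fitted random forest also exposes feature importances, often the next thing worth inspecting (an optional extra, not in the original post):

# Feature importances of the best forest, sorted in descending order
importances = pd.Series(forest_grid.best_estimator_.feature_importances_,
                        index=df.columns).sort_values(ascending=False)
print(importances.head())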
Pros and cons of KNN
KNN is simple and has essentially no training phase, but prediction is slow on large datasets, it is sensitive to feature scaling and irrelevant features, and its accuracy degrades in high dimensions.
fig = plt.figure(figsize=(25, 15))
cols = 5
rows = int(np.ceil(data_train.shape[1] / cols))
for i, column in enumerate(data_train.columns):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(column)
    if data_train.dtypes[column] == object:
        data_train[column].value_counts().plot(kind="bar", ax=ax)
    else:
        data_train[column].hist(ax=ax)
        plt.xticks(rotation="vertical")
plt.subplots_adjust(hspace=0.7, wspace=0.2)
data_train.dtypes
The dtypes show that a column which should be int is actually object, so let's fix it:
data_test['Age'] = data_test['Age'].astype(int)
# Select the categorical and numerical feature columns from the dataset
categorical_columns = [c for c in data_train.columns
                       if data_train[c].dtype.name == 'object']
numerical_columns = [c for c in data_train.columns
                     if data_train[c].dtype.name != 'object']
print('categorical_columns:', categorical_columns)
print('numerical_columns:', numerical_columns)
# Fill in missing values (using statistics from the training set for both frames)
for c in categorical_columns:
    data_train[c] = data_train[c].fillna(data_train[c].mode()[0])
    data_test[c] = data_test[c].fillna(data_train[c].mode()[0])
for c in numerical_columns:
    data_train[c] = data_train[c].fillna(data_train[c].median())
    data_test[c] = data_test[c].fillna(data_train[c].median())
data_train = pd.concat([data_train[numerical_columns],
                        pd.get_dummies(data_train[categorical_columns])], axis=1)
data_test = pd.concat([data_test[numerical_columns],
                       pd.get_dummies(data_test[categorical_columns])], axis=1)
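One caveat worth adding here (not from the original post): calling get_dummies separately on train and test can yield different column sets whenever a category is absent from one of the splits. A common fix is to align the test frame to the training columns:

# Add any dummy columns missing from the test set as zeros, drop extras,
# and keep the training column order
data_test = data_test.reindex(columns=data_train.columns, fill_value=0)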
That's it for the summary. I hope it's useful to you; writing it up was good progress for me too!