| 模型 | 核心特点 | 适用场景 | sklearn 接口 |
| --- | --- | --- | --- |
| 支持向量机 (SVM) | 通过核函数处理非线性分类(如RBF核) | 小样本非线性数据 | svm.SVC |
| K近邻 (K-Nearest Neighbors) | 基于局部相似性,无需训练模型 | 小数据量,特征空间均匀 | neighbors.KNeighborsClassifier |
| 决策树 (Decision Tree) | 可解释性强,自动处理非线性关系 | 需可视化决策规则 | tree.DecisionTreeClassifier |
| 随机森林 (Random Forest) | 集成多棵决策树,降低方差,抗过拟合 | 高维数据、非线性关系 | ensemble.RandomForestClassifier |
| 梯度提升树 (Gradient Boosting) | 逐步修正残差,精度高但训练慢 | 中小规模数据,高精度需求 | ensemble.GradientBoostingClassifier |
| XGBoost/LightGBM/CatBoost | 高效梯度提升框架,支持并行和正则化 | 大规模数据、竞赛场景 | 需安装第三方库 (xgboost , lightgbm 等) |
一.支持向量机
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

# Display options: render Chinese text in matplotlib and show full frames in pandas.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the raw employee-attrition data.
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the salary ("工资") column (string categories -> integers).
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("工资列标签编码后的值:\n", data_1['工资'].unique())

# Persist the encoded table for inspection, but keep working on the in-memory
# frame instead of re-reading the file we just wrote — the original
# write-then-read round-trip was redundant I/O and could alter dtypes.
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = data_1.copy()

# Preprocessing: mean-impute missing values.
# NOTE(review): this mean-imputes every numeric column, including the encoded
# categorical column and the target — harmless only if those have no NaNs; verify.
print("缺失值统计:\n", data.isnull().sum())
data = data.fillna(data.mean())

# Outlier removal via Z-scores over all numeric columns.
# NOTE(review): z-scoring binary/encoded columns (including the target) can mark
# minority-class rows as "outliers" and drop them — confirm this is intended.
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("检测到的异常值行索引:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features ("离职" is the target) and make a 70/30 train/test split.
X = data.drop("离职", axis=1)
y = data["离职"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize features; fit the scaler on the training set only to avoid leakage.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# RBF-kernel SVM, tuned by 5-fold grid search over C and gamma.
model = SVC(kernel='rbf')
cv = KFold(n_splits=5, random_state=10, shuffle=True)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # accuracy as the model-selection metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("最佳参数组合:", grid_search.best_params_)

# Predict with the best estimator and report its cross-validated accuracy.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Test-set evaluation (weighted averages to handle class imbalance).
print("测试集准确率:", accuracy_score(y_test, y_pred))
print("测试集精确率:", precision_score(y_test, y_pred, average='weighted'))
print("测试集召回率:", recall_score(y_test, y_pred, average='weighted'))
print("测试集 F1 分数:", f1_score(y_test, y_pred, average='weighted'))

# Confusion-matrix visualization.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("混淆矩阵")
plt.show()
二.K近邻
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier  # KNN model

# Display options: render Chinese text in matplotlib and show full frames in pandas.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the raw employee-attrition data.
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the salary ("工资") column (string categories -> integers).
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("工资列标签编码后的值:\n", data_1['工资'].unique())

# Persist the encoded table for inspection, but keep working on the in-memory
# frame instead of re-reading the file we just wrote — the original
# write-then-read round-trip was redundant I/O and could alter dtypes.
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = data_1.copy()

# Preprocessing: mean-impute missing values.
# NOTE(review): this mean-imputes every numeric column, including the encoded
# categorical column and the target — harmless only if those have no NaNs; verify.
print("缺失值统计:\n", data.isnull().sum())
data = data.fillna(data.mean())

# Outlier removal via Z-scores over all numeric columns.
# NOTE(review): z-scoring binary/encoded columns (including the target) can mark
# minority-class rows as "outliers" and drop them — confirm this is intended.
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("检测到的异常值行索引:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features ("离职" is the target) and make a 70/30 train/test split.
X = data.drop("离职", axis=1)
y = data["离职"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize features — essential for KNN, which is distance-based.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# KNN classifier, tuned by 5-fold grid search.
model = KNeighborsClassifier()
cv = KFold(n_splits=5, random_state=10, shuffle=True)
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],     # number of neighbors
    'weights': ['uniform', 'distance'],  # neighbor weighting scheme
    'p': [1, 2]                          # distance metric (1: Manhattan, 2: Euclidean)
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # accuracy as the model-selection metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("最佳参数组合:", grid_search.best_params_)

# Predict with the best estimator and report its cross-validated accuracy.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Test-set evaluation (weighted averages to handle class imbalance).
print("测试集准确率:", accuracy_score(y_test, y_pred))
print("测试集精确率:", precision_score(y_test, y_pred, average='weighted'))
print("测试集召回率:", recall_score(y_test, y_pred, average='weighted'))
print("测试集 F1 分数:", f1_score(y_test, y_pred, average='weighted'))

# Confusion-matrix visualization.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("混淆矩阵")
plt.show()
三.决策树
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier  # decision-tree model

# Display options: render Chinese text in matplotlib and show full frames in pandas.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the raw employee-attrition data.
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the salary ("工资") column (string categories -> integers).
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("工资列标签编码后的值:\n", data_1['工资'].unique())

# Persist the encoded table for inspection, but keep working on the in-memory
# frame instead of re-reading the file we just wrote — the original
# write-then-read round-trip was redundant I/O and could alter dtypes.
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = data_1.copy()

# Preprocessing: mean-impute missing values.
# NOTE(review): this mean-imputes every numeric column, including the encoded
# categorical column and the target — harmless only if those have no NaNs; verify.
print("缺失值统计:\n", data.isnull().sum())
data = data.fillna(data.mean())

# Outlier removal via Z-scores over all numeric columns.
# NOTE(review): z-scoring binary/encoded columns (including the target) can mark
# minority-class rows as "outliers" and drop them — confirm this is intended.
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("检测到的异常值行索引:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features ("离职" is the target) and make a 70/30 train/test split.
X = data.drop("离职", axis=1)
y = data["离职"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize features (trees are scale-invariant; kept only for consistency
# with the other scripts in this document).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Decision tree, tuned by 5-fold grid search.
model = DecisionTreeClassifier(random_state=42)
cv = KFold(n_splits=5, random_state=10, shuffle=True)
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],  # maximum tree depth
    'min_samples_split': [2, 5, 10],     # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],       # min samples at a leaf
    'criterion': ['gini', 'entropy']     # split quality criterion
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # accuracy as the model-selection metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("最佳参数组合:", grid_search.best_params_)

# Predict with the best estimator and report its cross-validated accuracy.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Test-set evaluation (weighted averages to handle class imbalance).
print("测试集准确率:", accuracy_score(y_test, y_pred))
print("测试集精确率:", precision_score(y_test, y_pred, average='weighted'))
print("测试集召回率:", recall_score(y_test, y_pred, average='weighted'))
print("测试集 F1 分数:", f1_score(y_test, y_pred, average='weighted'))

# Confusion-matrix visualization.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("混淆矩阵")
plt.show()
四.随机森林
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # random-forest model

# Display options: render Chinese text in matplotlib and show full frames in pandas.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the raw employee-attrition data.
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the salary ("工资") column (string categories -> integers).
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("工资列标签编码后的值:\n", data_1['工资'].unique())

# Persist the encoded table for inspection, but keep working on the in-memory
# frame instead of re-reading the file we just wrote — the original
# write-then-read round-trip was redundant I/O and could alter dtypes.
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = data_1.copy()

# Preprocessing: mean-impute missing values.
# NOTE(review): this mean-imputes every numeric column, including the encoded
# categorical column and the target — harmless only if those have no NaNs; verify.
print("缺失值统计:\n", data.isnull().sum())
data = data.fillna(data.mean())

# Outlier removal via Z-scores over all numeric columns.
# NOTE(review): z-scoring binary/encoded columns (including the target) can mark
# minority-class rows as "outliers" and drop them — confirm this is intended.
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("检测到的异常值行索引:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features ("离职" is the target) and make a 70/30 train/test split.
X = data.drop("离职", axis=1)
y = data["离职"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize features (forests are scale-invariant; kept only for consistency
# with the other scripts in this document).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random forest, tuned by 5-fold grid search.
model = RandomForestClassifier(random_state=42)
cv = KFold(n_splits=5, random_state=10, shuffle=True)
param_grid = {
    'n_estimators': [50, 100, 200],      # number of trees
    'max_depth': [None, 5, 10, 15, 20],  # maximum tree depth
    'min_samples_split': [2, 5, 10],     # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],       # min samples at a leaf
    'criterion': ['gini', 'entropy']     # split quality criterion
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # accuracy as the model-selection metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("最佳参数组合:", grid_search.best_params_)

# Predict with the best estimator and report its cross-validated accuracy.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Test-set evaluation (weighted averages to handle class imbalance).
print("测试集准确率:", accuracy_score(y_test, y_pred))
print("测试集精确率:", precision_score(y_test, y_pred, average='weighted'))
print("测试集召回率:", recall_score(y_test, y_pred, average='weighted'))
print("测试集 F1 分数:", f1_score(y_test, y_pred, average='weighted'))

# Confusion-matrix visualization.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("混淆矩阵")
plt.show()
五.梯度提升
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier  # gradient-boosting model

# Display options: render Chinese text in matplotlib and show full frames in pandas.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the raw employee-attrition data.
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the salary ("工资") column (string categories -> integers).
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("工资列标签编码后的值:\n", data_1['工资'].unique())

# Persist the encoded table for inspection, but keep working on the in-memory
# frame instead of re-reading the file we just wrote — the original
# write-then-read round-trip was redundant I/O and could alter dtypes.
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = data_1.copy()

# Preprocessing: mean-impute missing values.
# NOTE(review): this mean-imputes every numeric column, including the encoded
# categorical column and the target — harmless only if those have no NaNs; verify.
print("缺失值统计:\n", data.isnull().sum())
data = data.fillna(data.mean())

# Outlier removal via Z-scores over all numeric columns.
# NOTE(review): z-scoring binary/encoded columns (including the target) can mark
# minority-class rows as "outliers" and drop them — confirm this is intended.
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("检测到的异常值行索引:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features ("离职" is the target) and make a 70/30 train/test split.
X = data.drop("离职", axis=1)
y = data["离职"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize features (boosted trees are scale-invariant; kept only for
# consistency with the other scripts in this document).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Gradient boosting, tuned by 5-fold grid search.
model = GradientBoostingClassifier(random_state=42)
cv = KFold(n_splits=5, random_state=10, shuffle=True)
param_grid = {
    'n_estimators': [50, 100, 200],    # number of boosting stages
    'learning_rate': [0.01, 0.1, 0.2], # shrinkage per stage
    'max_depth': [3, 5, 7],            # maximum depth of each tree
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples at a leaf
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # accuracy as the model-selection metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("最佳参数组合:", grid_search.best_params_)

# Predict with the best estimator and report its cross-validated accuracy.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Test-set evaluation (weighted averages to handle class imbalance).
print("测试集准确率:", accuracy_score(y_test, y_pred))
print("测试集精确率:", precision_score(y_test, y_pred, average='weighted'))
print("测试集召回率:", recall_score(y_test, y_pred, average='weighted'))
print("测试集 F1 分数:", f1_score(y_test, y_pred, average='weighted'))

# Confusion-matrix visualization.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("混淆矩阵")
plt.show()
六.XGBoost/LightGBM/CatBoost