'cand_pty_affiliation '
# Encode the party label numerically: 'REP' -> 1, 'DEM' -> 0.
# Any other value in the column is left untouched by replace().
data['cand_pty_affiliation'] = data['cand_pty_affiliation'].replace(
    {'REP': 1, 'DEM': 0}
)
首先将属性和class分离开
# Separate the target column (y) from the remaining feature columns (X).
y = data['cand_pty_affiliation']
X = data.drop('cand_pty_affiliation', axis='columns')
再进行one-hot变换,转成稀疏格式
# One-hot encode the features; sparse=True asks pandas for sparse-backed
# dummy columns.
X = pd.get_dummies(data=X, sparse=True)
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows as the test split; the fixed seed makes the
# split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    random_state=33,
    test_size=0.4,
)
因为数据有点多,训练集就取60%,方便计算
不断地用GridSearchCV调试参数
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Grid-search max_depth (14..17) and min_samples_split (5..7) for a
# random forest, scoring each candidate by 5-fold cross-validated ROC-AUC.
parameters = dict(
    max_depth=np.arange(14, 18),
    min_samples_split=np.arange(5, 8),
)
clf = GridSearchCV(
    estimator=RandomForestClassifier(
        n_jobs=-1,
        random_state=33,
        n_estimators=186,
    ),
    param_grid=parameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
clf.fit(train_x, train_y)
# Report the best cross-validated score and the parameters that produced it.
print(clf.best_score_)
print(clf.best_params_)
得到最优的参数模型
# Random forest with fixed hyper-parameters.
# NOTE(review): max_depth=17 / min_samples_split=5 are presumably the
# values reported by the grid search -- confirm against its printed output.
rf = RandomForestClassifier(
    n_estimators=186,
    max_depth=17,
    min_samples_split=5,
    n_jobs=-1,
    random_state=33,
)
各个模型在之前都已经找到最优参数了
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
SEED = 33  # Random seed passed to the estimators below for reproducibility.
def get_models():
    """Generate a library of base learners.

    Returns:
        dict: maps each model's display name to a newly constructed,
        unfitted classifier with its hyper-parameters already set.
    """
    return {
        'svm': SVC(C=100, probability=True),
        'knn': KNeighborsClassifier(n_neighbors=3),
        'naive bayes': GaussianNB(),
        'mlp-nn': MLPClassifier(
            (80, 10),
            early_stopping=False,
            random_state=SEED,
        ),
        'random forest': RandomForestClassifier(
            n_estimators=186,
            max_depth=17,
            min_samples_split=5,
            random_state=SEED,
            n_jobs=-1,
        ),
        'gbm': GradientBoostingClassifier(n_estimators=100, random_state=SEED),
        'logistic': LogisticRegression(C=100, random_state=SEED),
    }
def train_predict(models):
    """Fit each model on the training split and collect test-set predictions.

    Reads the module-level ``train_x``, ``train_y``, ``test_x`` and
    ``test_y`` produced by the train/test split.

    Args:
        models: dict mapping a model name to an estimator that exposes
            ``fit`` and ``predict_proba``.

    Returns:
        pandas.DataFrame with one row per test sample and one column per
        model (named after the dict key), holding column 1 of that model's
        ``predict_proba`` output.
    """
    # Pre-allocate the prediction matrix: rows = number of test samples
    # (test_y.shape[0]), columns = number of models passed in.
    P = np.zeros((test_y.shape[0], len(models)))
    P = pd.DataFrame(P)

    print("Fitting models.")
    cols = list()
    for i, (name, m) in enumerate(models.items()):
        print("%s..." % name, end=" ", flush=False)
        m.fit(train_x, train_y)
        # Predict on the held-out split. The original referenced `xtest`,
        # which is not the name the split is bound to and raised NameError.
        P.iloc[:, i] = m.predict_proba(test_x)[:, 1]
        cols.append(name)  # remember the column name for this model
        print("done")

    P.columns = cols
    print("Done.\n")
    return P
获得每个基础模型的分数
def score_models(P, y):
    """Print the ROC-AUC of every prediction column in ``P`` against ``y``.

    Args:
        P: DataFrame of predictions, one column per model.
        y: true labels passed to ``roc_auc_score`` as the first argument.
    """
    print("Scoring models.")
    for name in P.columns:
        auc = roc_auc_score(y, P[name])
        print("%-26s: %.3f" % (name, auc))
    print("Done.\n")
# Build the base learners, fit them, and print each one's test ROC-AUC.
SEED = 33
models = get_models()
P = train_predict(models=models)
score_models(P=P, y=test_y)
Scoring models.
knn : 0.838
naive bayes : 0.818
mlp-nn : 0.891
random forest : 0.904
gbm : 0.889
logistic : 0.862
Done.
使用ML-Ensemble模块,将各个模型生成的预测值作为属性,查看它们之间的相关性热力图
# Plot the pairwise correlation matrix of the base learners' predictions
# (the columns of P).
# You need ML-Ensemble for this figure: you can install it with: pip install mlens
from mlens.visualization import corrmat
corrmat(P.corr(), inflate=False)
plt.show()
各模型预测值之间的相关性不高,且每个模型单独的预测表现(ROC-AUC)都较好,说明这些模型适合用来做stacking
# Simple ensemble: average the base learners' predicted probabilities
# row-wise, then score the averaged prediction on the test labels.
stacking_predict = P.mean(axis=1)
print("Ensemble ROC-AUC score: %.3f" % roc_auc_score(test_y, stacking_predict))
Ensemble ROC-AUC score: 0.899
from sklearn.metrics import roc_curve
def plot_roc_curve(test_y, P_base_learners, P_ensemble, labels, ens_label):
    """Plot ROC curves for each base learner and for the ensemble.

    Args:
        test_y: true labels of the test set.
        P_base_learners: 2-D array indexed as ``[:, i]``; column ``i``
            holds the predictions of base learner ``i``.
        P_ensemble: combined prediction of the base learners (the caller
            in this file passes their mean).
        labels: list of model names, indexed in step with the columns.
        ens_label: legend label for the ensemble curve, e.g. 'ensemble'.
    """
    n_learners = P_base_learners.shape[1]
    # One colour per curve: slot 0 is the ensemble, slots 1..n the learners.
    palette = [
        plt.cm.rainbow(frac) for frac in np.linspace(0, 1.0, n_learners + 1)
    ]

    plt.figure(figsize=(10, 8))
    plt.plot([0, 1], [0, 1], 'k--')  # reference diagonal

    for idx in range(n_learners):
        fpr, tpr, _ = roc_curve(test_y, P_base_learners[:, idx])
        plt.plot(fpr, tpr, label=labels[idx], c=palette[idx + 1])

    ens_fpr, ens_tpr, _ = roc_curve(test_y, P_ensemble)
    plt.plot(ens_fpr, ens_tpr, label=ens_label, c=palette[0])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()
# Draw the ROC curves of all base learners plus their averaged prediction.
plot_roc_curve(
    test_y=test_y,
    P_base_learners=P.values,
    P_ensemble=P.mean(axis=1),
    labels=list(P.columns),
    ens_label="ensemble",
)
# Build a new set of unfitted base learners for the Super Learner.
base_learners = get_models()
from mlens.ensemble import SuperLearner
# Instantiate the ensemble with 10 folds.
# Fix: the closing parenthesis of this call had drifted to the very end of
# the script, which swallowed every following statement into the argument
# list and made the block a syntax error.
sl = SuperLearner(
    folds=10,
    random_state=SEED,
    verbose=2,
    backend="multiprocessing",
)

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True)
# NOTE(review): `meta_learner` is not defined in this part of the file --
# confirm it is created earlier before running this section.
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(train_x.values, train_y.values)

# Predict the test set; column 1 of the output is scored against the labels.
p_sl = sl.predict_proba(test_x.values)
print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(test_y, p_sl[:, 1]))