# 加载教程中会用到的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# 加载 warnings
import warnings
# 忽略 warnings
warnings.filterwarnings("ignore")
# Read the raw data from the CSV files.
train = pd.read_csv('bank_train_set.csv')
test = pd.read_csv('bank_test_set.csv')

print(plt.style.available)  # list all available plotting styles
plt.style.use('ggplot')     # use the "ggplot" style

# Stack train and test so the one-hot encodings below produce the same
# columns for both sets (the test rows have NaN in the target column 'y').
data = pd.concat([train, test])

# One-hot encode the categorical variables: for each column, append its
# dummy columns (prefixed "<name>_") and drop the original column.
for s in ['campaign','contact','default','education','housing','job','loan','marital','month','poutcome']:
    data = pd.concat([data, pd.get_dummies(data[s], prefix=s+'_')], axis=1)
    data.drop(s, axis=1, inplace=True)

# The row identifier carries no predictive information.
data.drop(columns=['ID'], inplace=True)

# Split back into the labelled (training) and unlabelled (test) portions:
# rows with a target value vs. rows where 'y' is NaN.
df_train = data[data['y'].notnull()]
df_test = data[data['y'].isnull()]
# --- KNN: accuracy for different values of k on a fixed train/test split ---
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Features and target from the labelled portion of the data.
x = df_train.drop(columns=['y'])
y = df_train['y']
# NOTE(review): the original script used x_train/x_test/y_train/y_test
# without ever defining them; a 70/30 holdout split is assumed here —
# confirm against the tutorial this script was taken from.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

neig = np.arange(1, 100)  # candidate k values: 1..99
train_accuracy = []
test_accuracy = []
for k in neig:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    train_accuracy.append(knn.score(x_train, y_train))  # accuracy on the training set
    test_accuracy.append(knn.score(x_test, y_test))     # accuracy on the held-out set

# Visualise accuracy as a function of k.
plt.figure(figsize=[13, 8])
plt.plot(neig, test_accuracy, label='Testing Accuracy')
plt.plot(neig, train_accuracy, label='Training Accuracy')
plt.legend()
plt.title('k-value VS Accuracy')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.xticks(neig)
plt.savefig('graph.png')  # NOTE: every section of this script overwrites the same file
plt.show()
# Best k is the (1-based) index of the maximum held-out accuracy.
print("Best accuracy is {} with K = {}".format(np.max(test_accuracy), 1 + test_accuracy.index(np.max(test_accuracy))))
# --- KNN: choose k with 5-fold cross-validation ---
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): expects the feature matrix `x` and target `y` of the
# labelled data to be defined earlier (e.g. x = df_train.drop(columns=['y']);
# y = df_train['y']) — confirm before running this cell standalone.
neig = np.arange(1, 100)  # candidate k values: 1..99
test_accuracy = []
for k in neig:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Split the data into five folds and score each held-out fold.
    cv_result = cross_val_score(knn, x, y, cv=5, scoring='accuracy')
    test_accuracy.append(np.sum(cv_result) / 5)  # mean accuracy over the 5 folds

# Visualise cross-validated accuracy as a function of k.
plt.figure(figsize=[13, 8])
plt.plot(neig, test_accuracy, label='Testing Accuracy')
plt.legend()
plt.title('k-value VS Accuracy')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.xticks(neig)
plt.savefig('graph.png')
plt.show()
# Best k is the (1-based) index of the maximum cross-validated accuracy.
print("Best accuracy is {} with K = {}".format(np.max(test_accuracy), 1 + test_accuracy.index(np.max(test_accuracy))))
# --- Random forest: accuracy vs. number of trees, with 5-fold CV ---
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): expects the feature matrix `x` and target `y` of the
# labelled data to be defined earlier (e.g. x = df_train.drop(columns=['y']);
# y = df_train['y']) — confirm before running this cell standalone.
neig = np.arange(1, 100)  # candidate n_estimators values: 1..99
test_accuracy = []
for k in neig:
    clf = RandomForestClassifier(n_estimators=k, random_state=0)
    # Split the data into five folds and score each held-out fold.
    cv_result = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    test_accuracy.append(np.sum(cv_result) / 5)  # mean accuracy over the 5 folds

# Visualise cross-validated accuracy as a function of the forest size.
plt.figure(figsize=[13, 8])
plt.plot(neig, test_accuracy, label='Testing Accuracy')
plt.legend()
plt.title('n_estimators VS Accuracy')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.xticks(neig)
plt.savefig('graph.png')
plt.show()
# Best forest size is the (1-based) index of the maximum accuracy.
print("Best accuracy is {} with K = {}".format(np.max(test_accuracy), 1 + test_accuracy.index(np.max(test_accuracy))))
# --- Random forest: accuracy vs. max_depth (n_estimators fixed at 4), 5-fold CV ---
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): expects the feature matrix `x` and target `y` of the
# labelled data to be defined earlier (e.g. x = df_train.drop(columns=['y']);
# y = df_train['y']) — confirm before running this cell standalone.
neig = np.arange(1, 50)  # candidate max_depth values: 1..49
test_accuracy = []
for k in neig:
    clf = RandomForestClassifier(n_estimators=4, max_depth=k, random_state=0)
    # Split the data into five folds and score each held-out fold.
    cv_result = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    test_accuracy.append(np.sum(cv_result) / 5)  # mean accuracy over the 5 folds

# Visualise cross-validated accuracy as a function of tree depth.
plt.figure(figsize=[13, 8])
plt.plot(neig, test_accuracy, label='Testing Accuracy')
plt.legend()
plt.title('max_depth VS Accuracy')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.xticks(neig)
plt.savefig('graph.png')
plt.show()
# Best depth is the (1-based) index of the maximum accuracy.
print("Best accuracy is {} with K = {}".format(np.max(test_accuracy), 1 + test_accuracy.index(np.max(test_accuracy))))