Machine learning training template: a roundup of multiple classifiers

1. Data handling: mainly pandas operations

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
sns.set(font="simhei")   # let seaborn heatmaps display Chinese text
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese font for matplotlib
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly in matplotlib plots
pd.set_option("display.max_rows", None)



data = pd.read_csv('train.csv')

# info, describe, head, value_counts, etc.
print(data.shape)
print(data['label'].unique())
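
The other inspection calls named in the comment above, as a quick sketch on the same DataFrame:

data.info()                             # column dtypes and non-null counts (prints directly)
print(data.describe())                  # summary statistics for numeric columns
print(data.head())                      # first five rows
print(data['label'].value_counts())     # class distribution of the label column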

####### drop a column, drop NaNs, rename ##########
x = data.drop('label', axis=1)
data = data.dropna()    # axis=1 would drop columns instead
data = data.rename(columns={'old_name': 'new_name'})
####### row-wise sum #############
data['col3'] = data[['col1', 'col2']].sum(axis=1)


#data = data.reset_index(drop=True)  # rebuild the index after dropping rows

###### handle the label #############
classes = data.loc[:, 'label']    # take out all the labels
df.label = df.label.astype(str).map({'False.': 0, 'True.': 1})   # map True/False labels to 0/1

######### correlations can be explored with the Pearson correlation coefficient and shown as a heatmap, for example:
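
A minimal sketch of that, assuming the columns of data are numeric (otherwise pass numeric_only=True):

corr = data.corr()                              # Pearson correlation matrix (the default method)
sns.heatmap(corr, annot=True, cmap='coolwarm')  # visualize correlations as a heatmap
plt.show()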


2. Normalization and one-hot encoding:

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)   # y is the label column, e.g. the classes extracted in section 1

######## normalization #########
std = StandardScaler()    # or swap in MinMaxScaler
x_train = std.fit_transform(x_train)   # fit on the training set only
x_test = std.transform(x_test)         # reuse the training-set statistics on the test set


######## text to numeric #####   ran into a few small issues with this recently
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_x = data.apply(le.fit_transform)   # refits the encoder column by column
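
A possible cause of the "small issues" noted above: apply refits the same encoder on every column, so the fitted mappings are not kept for later use. A sketch that keeps one encoder per column (assuming every column of data is categorical; the encoders dict is illustrative):

encoders = {}
for col in data.columns:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])   # remember the fitted mapping per column
# later: encoders[col].transform(new_data[col]) reuses the same mapping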

####### one-hot encoding ########  there are several ways to do it
## method 1 ##
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories='auto').fit(data_ca)   # note: this yields ordinal integer codes, not true one-hot vectors
result = enc.transform(data_ca)
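
For genuine one-hot vectors in sklearn, OneHotEncoder is the counterpart to the ordinal encoder above; a minimal sketch on the same (assumed) data_ca:

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False).fit(data_ca)   # older sklearn versions use sparse=False instead
onehot = ohe.transform(data_ca)                          # one 0/1 column per category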

##方法二##
data_dummies = pd.get_dummies(data[['col','col2']])

## method 3 ##
from keras.utils import np_utils    # newer versions: from tensorflow.keras.utils import to_categorical
y_train = np_utils.to_categorical(y_train, num_classes=10)   # one-hot encode labels for a 10-class problem

# e.g. labels [1 0 0] become:
# [[0. 1.]
#  [1. 0.]
#  [1. 0.]]

3. Models: classification problems only

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn import model_selection   # KFold and cross_val_score used below



# define scoring method
scoring = 'accuracy'

# Define models to train
names = ["Nearest Neighbors"
#          , "Gaussian Process"
         ,"Decision Tree"
         , "Random Forest"
         , "Neural Net"
#          , "AdaBoost"
         ,"Naive Bayes"
         , "SVM Linear"
         , "SVM RBF"
         , "SVM Sigmoid"]



classifiers = [
    KNeighborsClassifier(n_neighbors = 3)
#     ,GaussianProcessClassifier(1.0 * RBF(1.0))
    ,DecisionTreeClassifier(max_depth=5)
    ,RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    ,MLPClassifier(alpha=1)
#     ,AdaBoostClassifier()
    ,GaussianNB()
    ,SVC(kernel = 'linear')
    ,SVC(kernel = 'rbf')
    ,SVC(kernel = 'sigmoid')
]



models = zip(names, classifiers)


# evaluate each model in turn
results = []
names = []

seed = 42   # fixed random seed for reproducible folds (any value works)
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test-- ',name,': ',accuracy_score(y_test, predictions))
    print()
    print(classification_report(y_test, predictions))
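
The results and names collected in the loop above can be compared side by side; a minimal sketch using the matplotlib import from section 1:

fig = plt.figure()
fig.suptitle('Classifier comparison: 10-fold CV accuracy')
ax = fig.add_subplot(111)
plt.boxplot(results)          # one box per model's cross-validation scores
ax.set_xticklabels(names)
plt.show()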


4. Evaluation:

from sklearn import metrics

########### the classifier's built-in score method
print(clf.score(X_test, y_test))   # clf is any fitted classifier

########### cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, iris.data, iris.target, cv=5)     # cross_val_score runs the whole cross-validation
print('scores:', scores)
print("Accuracy: {:.4f} (+/- {:.4f})".format(scores.mean(), scores.std() * 2))


################ F1
score = metrics.f1_score(y_true=y_true, y_pred=preds, average="macro")


################ confusion matrix and visualization
from sklearn.metrics import confusion_matrix, accuracy_score

conf = confusion_matrix(test_y, preds)  # compare predictions with the ground truth
label = ["0", "1"]  # binary classification here
sns.heatmap(conf, annot = True, xticklabels=label, yticklabels=label)
plt.show()


################ other metrics
print('Accuracy:', metrics.accuracy_score(y_true, y_pred))
print('Per-class precision:', metrics.precision_score(y_true, y_pred, average=None))  # no averaging
print('Macro-averaged precision:', metrics.precision_score(y_true, y_pred, average='macro'))
print('Micro-averaged recall:', metrics.recall_score(y_true, y_pred, average='micro'))
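
The reference below also covers ROC curves and AUC; a minimal binary-classification sketch, assuming clf is a fitted classifier with predict_proba and y_true holds the 0/1 test labels:

probs = clf.predict_proba(X_test)[:, 1]                  # probability of the positive class
fpr, tpr, thresholds = metrics.roc_curve(y_true, probs)  # ROC curve points
print('AUC:', metrics.roc_auc_score(y_true, probs))
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()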

Reference:

sklearn.metrics中的评估方法介绍(accuracy_score, recall_score, roc_curve, roc_auc_score, confusion_matrix), CSDN blog by 千寻~
