常用的分类预测模型:逻辑回归、KNN、朴素贝叶斯、LDA、SVC、GBDT、决策树
如需了解各模型的原理,可自行查阅相关资料;本文只讲解训练流程。
# Load the training table, label-encode the categorical columns and the
# target, then split into train / test partitions.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# header=0 means "the first row holds the column names". pandas rejects a
# boolean here (header=True raises TypeError).
train_data = pd.read_csv('training.csv', header=0)
train_data

y_encoder = LabelEncoder()
x_encoder = LabelEncoder()

# .copy() so the per-column assignments below write to an independent frame
# rather than to a slice of train_data (avoids SettingWithCopyWarning).
x = train_data[['C', 'H', 'O', 'N', 'S', 'group', 'AImod', 'DBE', 'MZ', 'OC', 'HC', 'SC', 'NC', 'NOSC', 'DBE.C', 'DBE.O', 'location', 'sample']].copy()

# NOTE: the same encoder object is re-fitted for every column, so after the
# loop x_encoder only remembers the mapping of the last column ('sample').
for col in ['group', 'location', 'sample']:
    x[col] = x_encoder.fit_transform(x[col])

# Encode the target column 'Type' as integer class ids; y_encoder.classes_
# keeps the original class names in id order.
y = y_encoder.fit_transform(train_data['Type'])

# Hold out 24% of the rows for testing; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.24, random_state=3)
import numpy as np
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support
# Compute the evaluation metrics for a set of predictions.
def eval_model(y_true, y_pred, labels):
    """Build a per-class and overall precision / recall / F1 report.

    Args:
        y_true: Ground-truth class values.
        y_pred: Predicted class values.
        labels: Display name for each class, one per report row. The call
            sites in this file pass ``LabelEncoder.classes_`` together with
            integer-encoded ``y_true`` / ``y_pred``.

    Returns:
        A pandas DataFrame with columns Label, Precision, Recall, F1 and
        Support: one row per class, followed by a support-weighted summary
        row (index 999, label '总体').
    """
    # Imported locally so the function works even when the surrounding
    # script has not imported pandas at module level.
    import pandas as pd

    # By default the scorer reports only the classes that occur in y_true or
    # y_pred. If a class is missing from this split, the metric arrays would
    # be shorter than `labels` and the DataFrame below could not be built.
    # In that case request every class id explicitly (assumes y values are
    # integer codes 0..len(labels)-1, as produced by LabelEncoder).
    seen = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
    class_ids = None if len(seen) == len(labels) else np.arange(len(labels))

    # Per-class precision, recall, F1 and support.
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=class_ids)

    # Overall figures: averages weighted by class support, and total support.
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)

    res1 = pd.DataFrame({
        'Label': labels,
        'Precision': p,
        'Recall': r,
        'F1': f1,
        'Support': s,
    })
    res2 = pd.DataFrame({
        'Label': ['总体'],
        'Precision': [tot_p],
        'Recall': [tot_r],
        'F1': [tot_f1],
        'Support': [tot_s],
    })
    # Give the summary row a distinctive index so it stands apart from the
    # per-class rows after concatenation.
    res2.index = [999]
    res = pd.concat([res1, res2])
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]
from sklearn.linear_model import LogisticRegression

# Logistic regression: fit on the training split (fit() returns the fitted
# estimator), predict the held-out split, then build the metric report.
model = LogisticRegression(solver='lbfgs', multi_class='multinomial').fit(x_train, y_train)
test_y_pred = model.predict(x_test)
res = eval_model(y_test, test_y_pred, y_encoder.classes_)
# Bare expression: displays the report when run as a notebook cell.
res
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 9.
model = KNeighborsClassifier(n_neighbors=9)
model.fit(x_train, y_train)
test_y_pred = model.predict(x_test)
# Class names come from the target encoder; it is named y_encoder (there is
# no `encoder` variable in this script).
res = eval_model(y_test, test_y_pred, y_encoder.classes_)
# Bare expression: displays the report when run as a notebook cell.
res
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
model = GaussianNB()
model.fit(x_train, y_train)
test_y_pred = model.predict(x_test)
# Class names come from the target encoder; it is named y_encoder (there is
# no `encoder` variable in this script).
res = eval_model(y_test, test_y_pred, y_encoder.classes_)
# Bare expression: displays the report when run as a notebook cell.
res
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Linear discriminant analysis with default settings.
model = LDA()
model.fit(x_train, y_train)
test_y_pred = model.predict(x_test)
# Class names come from the target encoder; it is named y_encoder (there is
# no `encoder` variable in this script).
res = eval_model(y_test, test_y_pred, y_encoder.classes_)
# Bare expression: displays the report when run as a notebook cell.
res
from sklearn.svm import SVC

# Support vector classifier with gamma='auto'.
model = SVC(gamma='auto')
model.fit(x_train, y_train)
test_y_pred = model.predict(x_test)
# Class names come from the target encoder; it is named y_encoder (there is
# no `encoder` variable in this script).
res = eval_model(y_test, test_y_pred, y_encoder.classes_)
# Bare expression: displays the report when run as a notebook cell.
res
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees: 100 depth-1 estimators, learning rate 1.0, fixed
# seed for reproducibility.
model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
model.fit(x_train, y_train)
test_y_pred = model.predict(x_test)
# Class names come from the target encoder; it is named y_encoder (there is
# no `encoder` variable in this script).
res = eval_model(y_test, test_y_pred, y_encoder.classes_)
# Bare expression: displays the report when run as a notebook cell.
res
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings. fit() returns the fitted estimator,
# so `model` is bound to the trained tree. This is the last model trained,
# i.e. the one still bound to `model` when the script saves it afterwards.
model = DecisionTreeClassifier().fit(x_train, y_train)
test_y_pred = model.predict(x_test)
res = eval_model(y_test, test_y_pred, y_encoder.classes_)
# Bare expression: displays the report when run as a notebook cell.
res
# Persist the trained model together with its encoders, reload the bundle,
# and use it to classify the rows of test.csv.
import os
import pickle

import dill
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# The shared x_encoder is re-fitted for each categorical column during
# training, so it only remembers the last column. Rebuild one encoder per
# column from the training data; LabelEncoder sorts the unique values, so
# these reproduce exactly the codes the model was trained on.
x_encoders = {
    col: LabelEncoder().fit(train_data[col])
    for col in ['group', 'location', 'sample']
}

# Make sure the target directory exists before writing into it.
os.makedirs('output', exist_ok=True)
model_file = os.path.join('output', 'model.pkl')
with open(model_file, 'wb') as outfile:
    dill.dump({
        'y_decoder': y_encoder,
        'x_encoder': x_encoder,    # kept for backward compatibility
        'x_encoders': x_encoders,  # per-column encoders used at predict time
        'model': model,
    }, outfile)

# Load with the same library that wrote the file. Unpickling executes code
# from the file, so only load bundles produced by a trusted source.
with open(model_file, 'rb') as mo:
    model = dill.load(mo)

# header=0: the first row holds the column names (pandas rejects header=True).
test_data = pd.read_csv('test.csv', header=0)
# .copy() so the column assignments below do not write into a slice.
test_features = test_data[['C', 'H', 'O', 'N', 'S', 'group', 'AImod', 'DBE', 'MZ', 'OC', 'HC', 'SC', 'NC', 'NOSC', 'DBE.C', 'DBE.O', 'location', 'sample']].copy()
for col in ['group', 'location', 'sample']:
    # Apply the mapping learned on the training data instead of re-fitting
    # on the test data (re-fitting would assign unrelated codes). Values
    # never seen during training are mapped to -1 rather than raising.
    codes = {label: code for code, label in enumerate(model['x_encoders'][col].classes_)}
    test_features[col] = test_features[col].map(codes).fillna(-1).astype(int)

# Predict encoded class ids, then translate them back to the class names.
# The target encoder is stored under the key 'y_decoder'.
test = model['model'].predict(test_features)
result = model['y_decoder'].inverse_transform(test)
# Bare expression: displays the predictions when run as a notebook cell.
result