This is a requirement for a data platform: the backend is Java and the database is ClickHouse. We train a model on positive and negative samples, then use that model to predict over the full dataset and expand the sample set (a lookalike task).
My part is the Python side: training the model, producing the confusion-matrix values plus recall, precision, and F1 to judge model quality, then saving the model and writing the evaluation results into ClickHouse. Once the full dataset arrives, the best model scores it, splitting the full table into 0/1 classes with a confidence value for each row; those results go back into a table, and everything after that is the Java colleague's job. (A hedged sketch of that scoring step appears after the training code at the end of this post.)
First, the Java side hands me the positive and negative sample data; we train the model on it and get the evaluation metrics.
Here is the code for the first part.
It uses AutoGluon for automated machine learning plus a few sklearn functions that produce the confusion-matrix values and the F1 score. The code looks long, but most of it is DataFrame plumbing; what matters is the evaluation metrics and the training setup. For background on what the confusion-matrix values mean, see:
混淆矩阵怎么看_分类模型评判指标--混淆矩阵_weixin_39613744的博客-CSDN博客
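As a quick refresher (this toy example is mine, not part of the platform code): for a binary problem, sklearn's confusion_matrix returns [[TN, FP], [FN, TP]], with rows being the true labels and columns the predictions, and the headline metrics are Precision = TP/(TP+FP), Recall = TP/(TP+FN), and F1 = 2PR/(P+R).

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# toy labels: 1 = positive sample, 0 = negative sample
y_true = [1, 1, 1, 0, 0, 0, 1, 0]
y_pred = [1, 0, 1, 0, 1, 0, 1, 0]

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)                   # 3 1 1 3
print(precision_score(y_true, y_pred))  # TP/(TP+FP) = 3/4 = 0.75
print(recall_score(y_true, y_pred))     # TP/(TP+FN) = 3/4 = 0.75
print(f1_score(y_true, y_pred))         # 2*P*R/(P+R) = 0.75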
import csv
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from clickhouse_driver import Client
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
def main():
    # connect to the ClickHouse database
    host = '192.168.7.41'
    client = Client(host=host, port=9000, database='default', user='default', password='ict-clickhouse-r2wy')
    # read the customer positive/negative sample table and attach the matching column names
    rows = client.execute('select * from TMP_LOOKALIKE_TRAIN_25')
    # 'desc <table>' returns the schema; its first column holds the column names
    schema = pd.DataFrame(client.execute('desc TMP_LOOKALIKE_TRAIN_25'))
    df = pd.DataFrame(rows)
    df.columns = np.ravel(schema.loc[:, 0])
    # normalize sentinel values ('\\N', -99) to NaN
    df.replace(['\\N', -99], np.nan, inplace=True)
    # shuffled 70/30 train/test split
    train, test = train_test_split(df, test_size=0.3, shuffle=True)
    # BILL_NO is a row identifier, not a feature
    train = train.drop(['BILL_NO'], axis=1)
    test = test.drop(['BILL_NO'], axis=1)
    # automated model training; the fitted predictor is saved under save_path
    save_path = '../automl'
    excluded_model_types = ['KNN', 'NN', 'custom', 'FASTAI', 'NN_TORCH']
    predictor = TabularPredictor(label='SAMPLE_TYPE', eval_metric='f1_macro', path=save_path).fit(
        train, excluded_model_types=excluded_model_types,
        presets='best_quality', num_bag_sets=1, num_stack_levels=2)
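    # Notes on this configuration (my reading of the AutoGluon docs; details vary by version):
    # - presets='best_quality' turns on bagging and multi-layer stacking, trading training time for accuracy;
    # - num_bag_sets=1 and num_stack_levels=2 cap the bagging repeats and stacking depth, which is why
    #   the trained models carry _BAG_L3 suffixes and the final ensemble is WeightedEnsemble_L4;
    # - eval_metric='f1_macro' makes AutoGluon select and weight models by macro-averaged F1.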
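    # The names passed to user_model below are the model names AutoGluon generated in our run under
    # this setup; if a name is missing in your run, predictor.leaderboard() lists the actual ones.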
    # collect test/train evaluation results for each model
    a1 = user_model('CatBoost_BAG_L3', test, train)
    a2 = user_model('WeightedEnsemble_L4', test, train)
    a3 = user_model('XGBoost_BAG_L3', test, train)
    a4 = user_model('RandomForestEntr_BAG_L3', test, train)
    a5 = user_model('ExtraTreesEntr_BAG_L3', test, train)
    a6 = user_model('LightGBM_BAG_L3', test, train)
    full = pd.concat([a1, a2, a3, a4, a5, a6], axis=0)
    # each user_model call returns one test row followed by one train row, in that order
    full['MODEL'] = ['CatBoost', 'CatBoost', 'WeightedEnsemble', 'WeightedEnsemble',
                     'XGBoost', 'XGBoost', 'RandomForest', 'RandomForest',
                     'ExtraTrees', 'ExtraTrees', 'LightGBM', 'LightGBM']
    full['Type'] = ['test', 'train', 'test', 'train',
                    'test', 'train', 'test', 'train',
                    'test', 'train', 'test', 'train']
    # write the results table into ClickHouse, round-tripping through CSV as in the original flow
    data = []
    full.to_csv('../full.csv', index=False, header=False)
    with open('../full.csv') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            data.append(row)
    client.execute('INSERT INTO TL_TEST VALUES', data, types_check=True)
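    # Untested alternative: skip the CSV round-trip and insert the DataFrame rows directly, e.g.
    # client.execute('INSERT INTO TL_TEST VALUES', full.values.tolist(), types_check=True).
    # Note the CSV route hands every value back as a string, so TL_TEST must accept that.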
def user_model(model_name, test, train):
    # load the saved predictor and evaluate one named model on both splits
    predictor = TabularPredictor.load('../automl/')
    y_test = test['SAMPLE_TYPE']                           # labels to predict
    test_data_nolab = test.drop(columns=['SAMPLE_TYPE'])   # drop the label so the model cannot see it
    y_train = train['SAMPLE_TYPE']
    train_data_nolab = train.drop(columns=['SAMPLE_TYPE'])
    # predict with the requested model
    y_pred = predictor.predict(test_data_nolab, model=model_name)
    y_train_pred = predictor.predict(train_data_nolab, model=model_name)
    # test-set metrics; the positive class is the label '1'
    precision = '%.3f' % precision_score(y_test, y_pred, pos_label='1')
    recall = '%.3f' % recall_score(y_test, y_pred, pos_label='1')
    result = confusion_matrix(y_test, y_pred)
    result1 = np.array([precision, recall, '%.3f' % f1_score(y_test, y_pred, pos_label='1')])
    result0 = np.ravel(result)                    # flatten the 2x2 matrix: [TN, FP, FN, TP]
    result2 = np.concatenate((result0, result1))  # 4 matrix cells + 3 metrics
    result2 = np.array(result2).reshape(1, 7)     # a single row of 7 values
    # train-set metrics, same layout
    a_precision = '%.3f' % precision_score(y_train, y_train_pred, pos_label='1')
    a_recall = '%.3f' % recall_score(y_train, y_train_pred, pos_label='1')
    a_result = confusion_matrix(y_train, y_train_pred)
    a_result1 = np.array([a_precision, a_recall, '%.3f' % f1_score(y_train, y_train_pred, pos_label='1')])
    a_result0 = np.ravel(a_result)                # [TN, FP, FN, TP]
    a_result2 = np.concatenate((a_result0, a_result1))
    a_result2 = np.array(a_result2).reshape(1, 7)
    # columns '00','01','10','11' name the flattened cells as (true label, predicted label)
    df_test = pd.DataFrame(result2, columns=['00', '01', '10', '11', 'Precision', 'Recall', 'F1'])
    df_train = pd.DataFrame(a_result2, columns=['00', '01', '10', '11', 'Precision', 'Recall', 'F1'])
    # one test row followed by one train row, matching the MODEL/Type labels set in main()
    return pd.concat([df_test, df_train], join='inner', axis=0)
if __name__ == '__main__':
    main()
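For completeness, here is a minimal sketch of the second stage described at the top: scoring the full table with the saved predictor and writing back 0/1 labels plus a confidence value. This is my illustration, not the platform's actual code: the table names TMP_LOOKALIKE_FULL and TL_PREDICT and the output columns are hypothetical placeholders, and it relies on AutoGluon's default of predicting with its best model.

# Stage-two sketch (hypothetical table/column names; adjust to the real schema)
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from clickhouse_driver import Client

client = Client(host='192.168.7.41', port=9000, database='default', user='default', password='ict-clickhouse-r2wy')
predictor = TabularPredictor.load('../automl/')

rows = client.execute('select * from TMP_LOOKALIKE_FULL')         # hypothetical full-data table
schema = pd.DataFrame(client.execute('desc TMP_LOOKALIKE_FULL'))
full_df = pd.DataFrame(rows)
full_df.columns = np.ravel(schema.loc[:, 0])

features = full_df.drop(columns=['BILL_NO'])
pred = predictor.predict(features)         # 0/1 class, best model by default
proba = predictor.predict_proba(features)  # per-class probabilities (a DataFrame)

out = pd.DataFrame({
    'BILL_NO': full_df['BILL_NO'],
    'LABEL': pred.values,
    'CONFIDENCE': proba.max(axis=1).values,  # probability of the predicted class
})
client.execute('INSERT INTO TL_PREDICT VALUES', out.values.tolist(), types_check=True)  # hypothetical target table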