CatBoost 应用于含多个离散特征的数据挖掘多分类问题

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.cluster import KMeans
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
data_Path = 'dataset/first_round_training_data.csv'

dataset = pd.read_csv(data_Path)

# ---------------------------------------------------------------------------
# Data preprocessing
# ---------------------------------------------------------------------------
# Column names: ten ParameterN features, ten AttributeN columns and the target.
# Not referenced again in this script (presumably documents the CSV schema).
all_attrs = ['Parameter1', 'Parameter2', 'Parameter3', 'Parameter4', 'Parameter5', 'Parameter6', 'Parameter7',
             'Parameter8', 'Parameter9', 'Parameter10', 'Attribute1', 'Attribute2', 'Attribute3', 'Attribute4',
             'Attribute5', 'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9', 'Attribute10', 'Quality_label']
# Columns removed from the frame before modelling.
unused_attrs = ['Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5', 'Attribute6', 'Attribute7',
                'Attribute8', 'Attribute9', 'Attribute10']
# Columns handed to CatBoost as categorical features.
cat_attrs = ['Parameter5', 'Parameter6', 'Parameter7', 'Parameter8', 'Parameter9', 'Parameter10']
dataset = dataset.drop(columns=unused_attrs)

# Encode the textual quality grade as an integer class id.
quality_mapping = {
    'Excellent': 1,
    'Good': 2,
    'Pass': 3,
    'Fail': 4,
}
_raw_labels = dataset['Quality_label']
dataset['Quality_label'] = _raw_labels.map(quality_mapping)

# Series.map yields NaN for every value that is not a key of the mapping.
# Fail loudly here rather than letting NaN targets reach training/scoring.
_unmapped = dataset['Quality_label'].isna()
if _unmapped.any():
    _unknown = sorted(_raw_labels[_unmapped].astype(str).unique())
    raise ValueError(
        'Quality_label contains values missing from quality_mapping: {}'.format(_unknown)
    )

# Feature matrix / target vector.
X = dataset.drop(columns='Quality_label')
# CatBoost accepts only integer or string values in categorical features and
# rejects floats. Casting to str keeps integer-coded categories equivalent and
# makes float-coded ones usable. `X` is a new frame returned by drop(), so
# `dataset` itself is left untouched.
X[cat_attrs] = X[cat_attrs].astype(str)
print('X  : ', X.shape)
y = dataset['Quality_label']

# Hold out 33% of the rows for the final evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print('X_train  : ', X_train.shape)
# ---------------------------------------------------------------------------
# Model construction
# ---------------------------------------------------------------------------
catboost_model = CatBoostClassifier(
    iterations=2000,
    # Overfitting detector: stop once the validation metric has not improved
    # for `od_wait` iterations. It only works when fit() receives an eval_set.
    od_type='Iter',
    od_wait=120,
    max_depth=8,
    learning_rate=0.02,
    l2_leaf_reg=9,
    random_seed=2019,
    metric_period=50,
    fold_len_multiplier=1.1,
    loss_function='MultiClass',
    logging_level='Verbose',
)

# Carve a validation set out of the training split so the overfitting detector
# configured above is actually active. The test split is deliberately NOT used
# as eval_set, otherwise early stopping would leak into the reported scores.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
catboost_model.fit(
    X_fit,
    y_fit,
    cat_features=cat_attrs,
    eval_set=(X_val, y_val),
)

# For MultiClass, predict() may return a column vector of shape (n, 1);
# flatten it so the metric functions receive plain 1-D label arrays.
y_pred = np.ravel(catboost_model.predict(X_test))

# ---------------------------------------------------------------------------
# Model evaluation
# ---------------------------------------------------------------------------
# Macro-F1 weights every class equally regardless of its frequency.
f1 = f1_score(y_test, y_pred, average='macro')
acc = accuracy_score(y_test, y_pred)
print('f1 : ', f1)
print('accuracy : ', acc)

print('finished')


 

你可能感兴趣的:(数据挖掘,多分类问题,离散特征,数据挖掘)