# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.cluster import KMeans
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
data_Path = 'dataset/first_round_training_data.csv'
dataset = pd.read_csv(data_Path)

# ---------------------------------------------------------------------------
# Data preprocessing
# ---------------------------------------------------------------------------
# Full list of column names referenced by this script (features + target).
all_attrs = ['Parameter1', 'Parameter2', 'Parameter3', 'Parameter4', 'Parameter5', 'Parameter6', 'Parameter7',
             'Parameter8', 'Parameter9', 'Parameter10', 'Attribute1', 'Attribute2', 'Attribute3', 'Attribute4',
             'Attribute5', 'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9', 'Attribute10', 'Quality_label']
# Columns dropped before training.
unused_attrs = ['Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5', 'Attribute6', 'Attribute7',
                'Attribute8', 'Attribute9', 'Attribute10']
# Columns handed to CatBoost as categorical features.
cat_attrs = ['Parameter5', 'Parameter6', 'Parameter7', 'Parameter8', 'Parameter9', 'Parameter10']
dataset = dataset.drop(unused_attrs, axis=1)

# Encode the textual quality grade as an integer class id.
quality_mapping = {
    'Excellent': 1,
    'Good': 2,
    'Pass': 3,
    'Fail': 4}
mapped_labels = dataset['Quality_label'].map(quality_mapping)
# Series.map() yields NaN for every value missing from the mapping; fail
# loudly here instead of letting NaN targets reach the model.
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    bad_values = dataset.loc[unmapped_rows, 'Quality_label'].unique().tolist()
    raise ValueError(
        'Quality_label contains values without a mapping: {}'.format(bad_values))
dataset['Quality_label'] = mapped_labels

X = dataset.drop('Quality_label', axis=1)
# CatBoost only accepts integer or string values in categorical columns.
# Casting to str keeps fit() working regardless of the dtype pandas inferred
# for these columns (e.g. float).
X[cat_attrs] = X[cat_attrs].astype(str)
print('X : ', X.shape)
y = dataset['Quality_label']

# Hold out 33% of the rows for the final evaluation; fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print('X_train : ', X_train.shape)
# ---------------------------------------------------------------------------
# Model construction
# ---------------------------------------------------------------------------
# The overfitting detector configured below (od_type / od_wait) only works
# when fit() receives a validation set. Carve one out of the training data so
# the test set stays untouched for the final metrics.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

catboost_model = CatBoostClassifier(
    iterations=2000,
    od_type='Iter',
    od_wait=120,
    max_depth=8,
    learning_rate=0.02,
    l2_leaf_reg=9,
    random_seed=2019,
    metric_period=50,
    fold_len_multiplier=1.1,
    loss_function='MultiClass',
    logging_level='Verbose'
)
catboost_model.fit(
    X_fit, y_fit,
    cat_features=cat_attrs,
    eval_set=(X_val, y_val),
)

# predict() may return a column vector for MultiClass; flatten it so the
# metric functions always receive a 1-D label array.
y_pred = np.asarray(catboost_model.predict(X_test)).ravel()

# Model evaluation on the held-out test set.
f1 = f1_score(y_test, y_pred, average='macro')
acc = accuracy_score(y_test, y_pred)
print('f1 : ', f1)
print('accuracy : ', acc)
print('finished')