kaggle机器学习baselines

kaggle机器学习baselines_第1张图片

kaggle机器学习baselines_第2张图片

kaggle机器学习baselines_第3张图片

kaggle机器学习baselines_第4张图片


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file shipped under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss

baselines

1.数据处理

  • 数据读取
# Load the raw train/test tables for the Tabular Playground Series May-2022 competition.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/train.csv')
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/test.csv')
  • 数据预处理
# Stack train on top of test so every feature transform below is applied to
# both consistently; test rows keep NaN in 'target', used later to split back.
df = pd.concat([train_df, test_df], ignore_index=True)
train_df.head(10)

kaggle机器学习baselines_第5张图片

df.info(verbose=True, null_counts=True)

kaggle机器学习baselines_第6张图片

# f_27 is a fixed-length 10-character uppercase string; expand each character
# position into its own ordinal feature ch_i (0 = 'A', ..., 25 = 'Z').
for i in range(10):
    df[f'ch_{i}'] = df['f_27'].str.get(i).apply(ord) - ord('A')
# show_counts replaces null_counts (deprecated pandas 1.2.5, removed in 2.0).
df.info(verbose=True, show_counts=True)

kaggle机器学习baselines_第7张图片

# Continuous features (z-scored later): f_00..f_06, f_19..f_26 and f_28.
num_cols = [f'f_{i:02d}' for i in list(range(7)) + list(range(19, 27)) + [28]]

# Discrete features: the raw integer columns f_07..f_18 (f_27 itself is the
# raw string and is excluded), f_29/f_30, plus the 10 characters split out
# of f_27 as ch_0..ch_9.
cate_cols = ([f'f_{i:02d}' for i in range(7, 19)]
             + ['f_29', 'f_30']
             + [f'ch_{i}' for i in range(10)])
# Plot the distribution of each continuous feature.
# sns.distplot is deprecated (removed in seaborn 0.14); histplot with a KDE
# overlay is the documented replacement.
for col in tqdm(num_cols):
    plt.figure(dpi=150)
    sns.histplot(df[col], kde=True)

kaggle机器学习baselines_第8张图片

# Standardise each continuous column in place (z-score: zero mean, unit
# variance). A min-max rescale or sklearn's StandardScaler/MinMaxScaler
# would be the drop-in alternatives:
#     df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
#     df[num_cols] = StandardScaler().fit_transform(df[num_cols])
for col in tqdm(num_cols):
    mu, sigma = df[col].mean(), df[col].std()
    df[col] = (df[col] - mu) / sigma
df.describe()

kaggle机器学习baselines_第9张图片

# Plot the value counts of each discrete feature.
# Passing the Series positionally to countplot is deprecated in recent
# seaborn; the x= keyword is the supported form and draws the same chart.
for col in tqdm(cate_cols):
    plt.figure(dpi=150)
    sns.countplot(x=df[col])

kaggle机器学习baselines_第10张图片

# Ordinal-encode each categorical column, then add a frequency feature
# (how often each encoded value occurs across train+test).
for col in tqdm(cate_cols):
    uniques = df[col].unique()
    # Use len(uniques) rather than df[col].nunique(): unique() keeps NaN
    # while nunique() drops it, so zipping against range(nunique()) would
    # silently leave the last category unmapped whenever NaN is present.
    map_dict = dict(zip(uniques, range(len(uniques))))
    df[col] = df[col].map(map_dict)
    df[f'{col}_count'] = df[col].map(df[col].value_counts())
# sklearn alternative for the ordinal-encoding step:
# for col in tqdm(cate_cols):
#     scale = LabelEncoder()
#     scale.fit(df[col])
#     df[col] = scale.transform(df[col])

kaggle机器学习baselines_第11张图片

df.info(verbose=True, null_counts=True)

kaggle机器学习baselines_第12张图片

kaggle机器学习baselines_第13张图片

# Split the combined frame back apart: rows with a target are the original
# train set, rows with NaN target are the test set.
train_df = df.loc[df['target'].notna()].reset_index(drop=True)
test_df = df.loc[df['target'].isna()].reset_index(drop=True)

# Model features: every column except the id, the label and the raw string.
drop_feature = ['id', 'target', 'f_27']
feature = [c for c in train_df.columns if c not in drop_feature]
print(len(feature), feature)

[图片:上一步 print(len(feature), feature) 输出的特征列表截图(外链图片加载失败,源站可能有防盗链机制)]

2.模型调用

  • scikit-learn经典模型使用
### sklearn一般的范式
'''
# 拟合模型
model.fit(X_train, y_train)

# 模型预测
model.predict(X_test)

# 输出概率(分类任务)
model.predict_proba(X_test)

# 获得这个模型的参数
model.get_params()
'''
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Hold-out split (sklearn default: 75% train / 25% validation), fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df[feature], train_df['target'], random_state=666)

# Fit a default random forest as the classical-model baseline.
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Score the hold-out set on the positive-class probability.
y_valid_pre = model.predict_proba(X_valid)[:, 1]
print(f'{str(model)} AUC :{roc_auc_score(y_valid, y_valid_pre)}')
print(f'{str(model)} LogLoss :{log_loss(y_valid, y_valid_pre)}')

# Hard-label predictions for the test rows.
y_pre = model.predict(test_df[feature])
  • lightgbm/xgboost/catboost模型使用

3.模型验证

  • hold out验证
  • 交叉验证
# LightGBM hyper-parameters for the binary-classification objective.
params = {'num_leaves': 60,           # larger fits better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',      # binary-classification objective
          'max_depth': -1,
          'learning_rate': 0.1,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,    # fraction of features sampled per tree
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.1,           # L1 regularisation
          # 'lambda_l2': 0.001,       # L2 regularisation
          "verbosity": -1,
          "nthread": -1,              # -1 = use all available threads
          'metric': {'binary_logloss', 'auc'},  # evaluation metrics
          "random_state": 2019,       # fixed seed for reproducible runs
          # 'device': 'gpu'           # enable if a GPU build of LightGBM is installed
          }

# 5-fold cross-validation: oof_pre collects out-of-fold predictions for the
# train set, y_pre accumulates the fold-averaged test predictions.
n_fold = 5
oof_pre = np.zeros(len(train_df))
y_pre = np.zeros(len(test_df))
kf = KFold(n_splits=n_fold)
for fold_, (trn_idx, val_idx) in enumerate(kf.split(train_df)):
    trn_data = lgb.Dataset(train_df[feature].iloc[trn_idx], label=train_df['target'].iloc[trn_idx])
    val_data = lgb.Dataset(train_df[feature].iloc[val_idx], label=train_df['target'].iloc[val_idx])

    # LightGBM 4.0 removed the verbose_eval / early_stopping_rounds keyword
    # arguments from lgb.train; the callback API below is the supported form
    # with identical behaviour (log every 50 rounds, stop after 50 rounds
    # without improvement on the validation set).
    clf = lgb.train(params,
                    trn_data,
                    num_boost_round=100000,
                    valid_sets=[trn_data, val_data],
                    callbacks=[lgb.log_evaluation(period=50),
                               lgb.early_stopping(stopping_rounds=50)])

    oof_pre[val_idx] = clf.predict(train_df[feature].iloc[val_idx], num_iteration=clf.best_iteration)

    y_pre += clf.predict(test_df[feature], num_iteration=clf.best_iteration) / n_fold

kaggle机器学习baselines_第14张图片

kaggle机器学习baselines_第15张图片

kaggle机器学习baselines_第16张图片

# Assemble the submission file: one row per test id with its predicted
# positive-class probability, written without the index column.
res_df = pd.DataFrame()
res_df['id'] = test_df['id']
res_df['target'] = y_pre
res_df.to_csv('/kaggle/working/baseline.csv', index=False)

你可能感兴趣的:(Kaggle,机器学习,人工智能,python,数据挖掘)