银行客户认购产品预测

银行客户认购产品预测

  • 加载数据
  • 合并数据
  • 获取非数字的列
  • 特征编码
  • 去掉相关性不大的列与切分数据集
  • 模型训练
  • 保存为csv文件
  • 其他模型
    • 逻辑回归
    • KNN
    • 决策树
    • 随机森林

加载数据

import pandas as pd

# Load the training set and the test set from the current working directory.
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

合并数据

# Stack the training rows on top of the test rows in a single frame.
# Each frame keeps its own row labels (no re-indexing is requested).
df = pd.concat(objs=[train, test], axis='index')
df

银行客户认购产品预测_第1张图片

获取非数字的列

# Collect the names of the non-numeric (object-dtype) columns.
object_frame = df.select_dtypes(include=['object'])
cat_columns = object_frame.columns
df[cat_columns]

银行客户认购产品预测_第2张图片

特征编码

# Feature encoding: replace each categorical column with integer codes.
from sklearn.preprocessing import LabelEncoder

# `job` gets its codes from a LabelEncoder fitted on the combined frame.
job_le = LabelEncoder()
df['job'] = job_le.fit_transform(df['job'])
df['job'].value_counts()

# Every other categorical column uses a hand-written code table.
# Series.map turns any value that is absent from its table into NaN.
code_tables = {
    'marital': {'unknown': 0, 'single': 1, 'married': 2, 'divorced': 3},
    'education': {
        'unknown': 0,
        'illiterate': 1,
        'basic.4y': 2,
        'basic.6y': 3,
        'basic.9y': 4,
        'high.school': 5,
        'university.degree': 6,
        'professional.course': 7,
    },
    'housing': {'unknown': 0, 'no': 1, 'yes': 2},
    'loan': {'unknown': 0, 'no': 1, 'yes': 2},
    'contact': {'cellular': 0, 'telephone': 1},
    'day_of_week': {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4},
    'poutcome': {'nonexistent': 0, 'failure': 1, 'success': 2},
    'default': {'unknown': 0, 'no': 1, 'yes': 2},
    'month': {
        'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7,
        'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    },
    # Target column: rows that carry no label remain NaN after mapping.
    'subscribe': {'no': 0, 'yes': 1},
}

for column, table in code_tables.items():
    df[column] = df[column].map(table)

df

银行客户认购产品预测_第3张图片

去掉相关性不大的列与切分数据集

# Keep the test-set ids aside; they are needed later when the CSV is written.
y_id = test['id']

# Drop the column with little relevance to the target.
df.drop(columns=['id'], inplace=True)

# Split the combined frame back apart: rows with a `subscribe` value form the
# training set, rows where it is missing form the test set.
has_label = df['subscribe'].notnull()
train = df[has_label]
test = df[~has_label]

模型训练

# Model training with a LightGBM binary classifier.
import lightgbm as lgb

# Hyper-parameters gathered in one place, then unpacked into the estimator.
lgb_params = dict(
    objective='binary',
    n_estimators=2000,
    learning_rate=0.005,
    num_leaves=2**5-1,
    max_depth=-1,
    min_child_samples=3,
    reg_alpha=0.25,
    reg_lambda=0.25,
    subsample=1,
    colsample_bytree=1,
    random_state=2022,
)
model_lgb = lgb.LGBMClassifier(**lgb_params)

# Fit on every column except the target.
X_train = train.drop('subscribe', axis=1)
y_train = train['subscribe']
model_lgb.fit(X_train, y_train)

保存为csv文件

# Predict on the test set and save the result as a CSV file.
import numpy as np

X_test = test.drop('subscribe', axis=1)
y_pred = model_lgb.predict(X_test)

# Pair each saved test id with its predicted class, stored as a 32-bit integer.
result = pd.DataFrame({'id': y_id, 'subscribe': y_pred.astype(np.int32)})

# Translate the numeric classes back into the 'no' / 'yes' labels.
result_map = {0: 'no', 1: 'yes'}
result['subscribe'] = result['subscribe'].map(result_map)

result.to_csv("predict1.csv", index=False)

# Show how many rows were predicted for each label.
result['subscribe'].value_counts()

其他模型

逻辑回归

from sklearn.linear_model import LogisticRegression

# Logistic regression with the estimator's default settings.
model = LogisticRegression()

# Separate the features from the target, then fit.
X_train, y_train = train.drop(columns='subscribe'), train['subscribe']
model.fit(X_train, y_train)

KNN

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using 5 neighbours.
knn_params = {'n_neighbors': 5}
model = KNeighborsClassifier(**knn_params)

# Separate the features from the target, then fit.
X_train = train.drop(columns='subscribe')
y_train = train['subscribe']
model.fit(X_train, y_train)

决策树

from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy criterion, limited to a depth of 7.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    min_impurity_decrease=0.0,
)

# Separate the features from the target, then fit.
X_train = train.drop(columns='subscribe')
y_train = train['subscribe']
model.fit(X_train, y_train)

随机森林

from sklearn.ensemble import RandomForestClassifier

# Random forest: 200 entropy-criterion trees, each limited to a depth of 5.
model2 = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=5,
    # Integer 2 (the library default) lets nodes keep splitting up to
    # max_depth. A float such as 1.0 is interpreted as a fraction of the
    # training samples, so only the root node could ever be split.
    min_samples_split=2,
    min_samples_leaf=1,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed
    # in scikit-learn 1.3 and is rejected there.
    max_features='sqrt',
    bootstrap=False,
    oob_score=False,
    n_jobs=1,
    random_state=0,
    verbose=0,
)

# Fit the forest itself (model2), not the `model` left over from the
# decision-tree section.
model2.fit(train.drop('subscribe', axis=1), train['subscribe'])

你可能感兴趣的:(机器学习,python,人工智能)