银行客户认购产品预测
- 加载数据
- 合并数据
- 获取非数字的列
- 特征编码
- 去掉相关性不大的列与且分数据集
- 模型训练
- 保存为csv文件
- 其他模型
-
加载数据
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
合并数据
df =pd.concat([train, test], axis=0)
df
获取非数字的列
cat_columns = df.select_dtypes(include='O').columns
df[cat_columns]
特征编码
from sklearn.preprocessing import LabelEncoder
job_le = LabelEncoder()
df['job'] = job_le.fit_transform(df['job'])
df['job'].value_counts()
df['marital'].value_counts()
df['marital'] = df['marital'].map({'unknown': 0, 'single': 1, 'married': 2, 'divorced': 3})
df['education'].value_counts()
df['education'] = df['education'].map({'unknown': 0, 'illiterate': 1, 'basic.4y': 2, 'basic.6y': 3,\
'basic.9y': 4, 'high.school': 5, 'university.degree': 6, 'professional.course': 7})
df['housing'].value_counts()
df['housing'] = df['housing'].map({'unknown': 0, 'no': 1, 'yes': 2})
df['loan'] = df['loan'].map({'unknown': 0, 'no': 1, 'yes': 2})
df['contact'] = df['contact'].map({'cellular': 0, 'telephone': 1})
df['day_of_week'].value_counts()
df['day_of_week'] = df['day_of_week'].map({'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4})
df['poutcome'] = df['poutcome'].map({'nonexistent': 0, 'failure': 1, 'success': 2})
df['default'].value_counts()
df['default'] = df['default'].map({'unknown': 0, 'no': 1, 'yes': 2})
df['month'] = df['month'].map({'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, \
'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12})
df['subscribe'] = df['subscribe'].map({'no': 0, 'yes': 1})
df
去掉相关性不大的列与且分数据集
y_id = test['id']
df.drop(['id'], axis=1, inplace=True)
train = df[df['subscribe'].notnull()]
test = df[df['subscribe'].isnull()]
模型训练
import lightgbm as lgb
model_lgb = lgb.LGBMClassifier(
num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',
max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2022,
n_estimators=2000, subsample=1, colsample_bytree=1,
)
model_lgb.fit(train.drop('subscribe', axis=1), train['subscribe'])
保存为csv文件
import numpy as np
y_pred = model_lgb.predict(test.drop('subscribe', axis=1))
result = pd.DataFrame({'id':y_id, 'subscribe':y_pred.astype(np.int32)})
result_map = {
0: 'no',
1:'yes' }
result['subscribe'] = result['subscribe'].map(result_map)
result.to_csv("predict1.csv", index=False)
result['subscribe'].value_counts()
其他模型
逻辑回归
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(train.drop('subscribe', axis=1), train['subscribe'])
KNN
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(train.drop('subscribe', axis=1), train['subscribe'])
决策树
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=7, min_impurity_decrease=0.0)
model.fit(train.drop('subscribe', axis=1), train['subscribe'])
随机森林
from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=5, min_samples_split=1.0,
min_samples_leaf=1, max_features='auto', bootstrap=False, oob_score=False, n_jobs=1, random_state=0,
verbose=0)
model.fit(train.drop('subscribe', axis=1), train['subscribe'])