Full Python Modeling Workflow

Table of Contents

  • Importing Packages
  • Extracting Object Variables
  • Handling Missing Values
    • Inspect
    • numeric
  • Feature Engineering
    • Undersampling
    • Regular Expressions
    • map Function
    • object
  • Encoding Object Variables
    • One-Hot Encoding
    • One-Hot Encoding (Not Sparse)
    • label_encoder
    • Label Encoding vs. One-Hot Encoding
  • Separating Features and Target
  • Merging Data
  • Modeling
  • Decision Tree Visualization
  • Feature Importance
  • Balancing Samples
  • Model Evaluation
    • train test split
    • Classification Report
    • ROC
    • oob
  • Confusion Matrix
    • Normalized Confusion Matrix
  • Cost Matrix

Importing Packages

import matplotlib.pyplot as plt
import numpy as np
import os.path
import re
import csv
import pandas as pd
import warnings
import seaborn as sns
warnings.simplefilter("ignore")
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing in newer scikit-learn versions
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, roc_auc_score
import pydotplus
from IPython.display import Image

import plotly.graph_objects as go
import plotly_express as px

Extracting Object Variables

cat_cols = [col for col in X.columns.values if X[col].dtype == 'O']  # object (string) columns

Handling Missing Values

Inspect

missing_values_table(X)
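
The helper missing_values_table is called here but never defined in the post; a minimal sketch of what such a helper usually does (count and share of missing values per column) could be:

def missing_values_table(df):
    # hypothetical helper: missing count and percentage per column, fully observed columns dropped
    missing = df.isnull().sum()
    percent = 100 * missing / len(df)
    table = pd.concat([missing, percent], axis=1, keys=['Missing Values', '% of Total'])
    return table[table['Missing Values'] > 0].sort_values('% of Total', ascending=False)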

numeric

# three alternative imputation strategies for the numeric columns; keep only one
num = X.drop(cat_cols,axis=1).fillna(X.mean(numeric_only=True))    # fill with the column mean

num = X.drop(cat_cols,axis=1).fillna(X.median(numeric_only=True))  # or: fill with the column median

num = X.drop(cat_cols,axis=1).fillna(X.mode().iloc[0])             # or: fill with the column mode (mode() returns a DataFrame; take its first row)
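
The imports at the top pull in scikit-learn's imputer; an equivalent sketch with SimpleImputer (the replacement for the removed Imputer class), assuming the same numeric columns, would be:

imputer = SimpleImputer(strategy='median')   # or 'mean' / 'most_frequent'
num_cols = X.drop(cat_cols, axis=1)
num = pd.DataFrame(imputer.fit_transform(num_cols), columns=num_cols.columns, index=num_cols.index)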

Feature Engineering

Undersampling (sub-sampling)

def lower_sample_data(df, class_):
    '''
    Undersample the majority class so that it keeps only as many rows as the minority class.
    class_: label of the minority class, whose rows are kept in full.
    '''
    data0 = df[df['retention'] == class_]  # minority-class rows, kept as-is
    data1 = df[df['retention'] != class_]  # majority-class rows, to be downsampled
    index = np.random.randint(
        len(data1), size=(len(df) - len(data1)))  # random row positions; len(df) - len(data1) == len(data0)
    lower_data1 = data1.iloc[list(index)]  # downsampled majority class (with replacement; use np.random.choice(..., replace=False) to avoid duplicates)
    return pd.concat([lower_data1, data0])
data = lower_sample_data(data,'lost')
data['retention'].value_counts()

Regular Expressions

# keep the digits only
data['brand_version'] = data['brand'].apply(lambda x: re.findall(r'\d', x)[0] if re.findall(r'\d', x) else 'null')
data['brand_version'] = data['brand_version'].apply(lambda x: int(x) if x != 'null' else 'null')
# bucket by brand (str.find() returns -1 when the substring is missing, so test membership with `in` instead)
data['brand_class'] = data['brand'].apply(lambda x: '小米' if '小米' in x else ('红米' if '红米' in x else 'others'))
# keep the lower-case English letters (plus comma and space) only
uncn = re.compile(r'[\u0061-\u007a,\u0020]')  # a-z, comma, space
data['brand_series'] = data['brand'].apply(lambda x: "".join(uncn.findall(x.lower())))
# keep English letters and digits only
data['brand_detail'] = data['brand'].apply(lambda x: re.sub('[^a-zA-Z0-9]+', '', x))
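
As a quick sanity check of the patterns above, applied to a hypothetical brand string:

s = '红米Note 7'                        # hypothetical example value
re.findall(r'\d', s)[0]                 # '7'     -> brand_version
'红米' in s                             # True    -> brand_class == '红米'
"".join(uncn.findall(s.lower()))        # 'note ' -> brand_series
re.sub('[^a-zA-Z0-9]+', '', s)          # 'Note7' -> brand_detail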

map Function

def price_map(x):
    if x=='0-600':
        y=1
    elif x=='600-1000':
        y=2
    elif x=='1000-1500':
        y=3
    elif x=='1500-2000':
        y=4
    elif x=='2000-3000':
        y=5
    elif x=='3000-4000':
        y=6
    else:
        y=7
    return y

data['price_band'] = data['price'].apply(price_map)
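
Since the section is titled "map", the same ordinal recode can also be written with a dict and pandas' .map(); any price band not in the dict falls back to 7, as in the else branch above:

price_dict = {'0-600': 1, '600-1000': 2, '1000-1500': 3,
              '1500-2000': 4, '2000-3000': 5, '3000-4000': 6}
data['price_band'] = data['price'].map(price_dict).fillna(7).astype(int)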

object

X = X.fillna('missing')  # flag the remaining missing values (object columns) with an explicit 'missing' category

Encoding Object Variables

One-Hot Encoding
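
This section is empty in the original post; a minimal sketch, assuming the goal is a sparse one-hot encoding of the object columns with scikit-learn, could be:

enc = preprocessing.OneHotEncoder()                       # returns a sparse matrix by default
cat_onehot_sparse = enc.fit_transform(X[cat_cols].astype(str))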

One-Hot Encoding (Not Sparse)
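
Also empty in the original; the dense variant is easiest with pd.get_dummies. The name cat_onehot is an assumption, chosen so that it can later be joined into the x_onehot frame the modeling code uses:

cat_onehot = pd.get_dummies(X[cat_cols].astype(str))      # dense indicator DataFrame, one column per category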

label_encoder

cat_labelcoder = X[cat_cols].copy()  # label-encoded copy of the object columns (name assumed from its use below)
le = preprocessing.LabelEncoder()
for col in cat_cols:
    cat_labelcoder[col] = le.fit_transform(cat_labelcoder[col].astype('str'))

Label Encoding vs. One-Hot Encoding

In theory, label encoding and one-hot encoding of an object variable carry the same information. However, a label encoder imposes an ordering on the categories, and the tree then splits on them as if they were numeric; if the integer assignment changes between runs, the left and right subtrees can come out differently, so the results are not consistent.
As a rule of thumb, reserve label encoding for object variables with a genuine order (e.g. "poor, fair, good, very good") and use one-hot encoding in every other case.

Separating Features and Target

x = data.drop(['id','retention'],axis=1)

y = pd.DataFrame(data['retention'].apply(lambda x:1 if x=='lost' else 0))  # 1 = lost (churned), 0 = retained

Merging Data

x_labelcoder = pd.concat([num,cat_labelcoder],axis=1)
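
The modeling code below uses x_onehot, which is never built in the post; by analogy with x_labelcoder it would be the numeric columns joined with the one-hot encoded columns (cat_onehot from the sketch above):

x_onehot = pd.concat([num, cat_onehot], axis=1)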

Modeling

clf = RandomForestClassifier(n_estimators=10, criterion='gini',max_depth=10,bootstrap=True,random_state=0)
# fit the model
clf.fit(x_onehot, y)
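
cross_val_score is imported at the top but never used; a quick 5-fold cross-validated accuracy check of the forest above (not part of the original post) would be:

scores = cross_val_score(clf, x_onehot, y.values.ravel(), cv=5)   # 5-fold CV accuracy
print(scores.mean(), scores.std())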

Decision Tree Visualization

clf = tree.DecisionTreeClassifier(min_samples_split=0.1,max_depth=int(np.log2(x_onehot.shape[1])),random_state=0,class_weight='balanced')
# fit the model
clf.fit(x_onehot, y)
# export the single tree
dot_data = tree.export_graphviz(clf, out_file=None,
                         feature_names=x_onehot.columns,
                         ### important: class_names must be strings, ordered like clf.classes_
                         class_names=[str(c) for c in clf.classes_],
                         filled=True, rounded=True,special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# display inline in a Jupyter notebook
Image(graph.create_png())

Feature Importance

clf = RandomForestClassifier(n_estimators=10, criterion='gini',max_depth=10,bootstrap=True,random_state=0)
# fit the model
clf.fit(x_onehot, y)
y_importances = clf.feature_importances_
x_importances = x_onehot.columns
df = pd.DataFrame({'x':x_importances,'y':y_importances}).sort_values(by='y',ascending=False)

px.bar_polar(df[:10], r="y", theta="x", color="x", template='plotly_white',
           color_discrete_sequence=px.colors.sequential.Plotly3[-2::-1])
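
The polar bar chart above needs plotly; with the seaborn/matplotlib imports already in the script, a plain horizontal bar chart of the same top-10 importances (an alternative, not part of the original post) would be:

top10 = df[:10]
sns.barplot(x='y', y='x', data=top10, color='steelblue')   # importance per feature
plt.xlabel('importance')
plt.ylabel('feature')
plt.show()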

Balancing Samples

class_weight='balanced'

clf = RandomForestClassifier(n_estimators=10, criterion='gini',max_depth=10,bootstrap=True,random_state=0,class_weight='balanced')
# fit the model
clf.fit(x_onehot, y)

Model Evaluation

train test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_onehot, y, test_size=0.33, random_state=42)
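
Note that the forests above were fitted on the full x_onehot, which includes the rows that have just become the test set; for an honest evaluation, refit on the training split first (this line is not in the original post):

clf.fit(X_train, y_train.values.ravel())   # refit on the training split only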

Classification Report

from sklearn.metrics import classification_report

y_predict = clf.predict(X_test)

print(classification_report(y_test, y_predict))

ROC

# y_test: true labels; roc_curve expects a score, so use the predicted probability of the positive class
y_score = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# to plot, plt.plot(fpr, tpr) is enough; roc_auc just stores the AUC value returned by auc()
plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
plt.xlabel("FPR (False Positive Rate)")
plt.ylabel("TPR (True Positive Rate)")
plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)"% (roc_auc))
plt.show()

roc_auc_score(y_test, y_score)

oob

clf = RandomForestClassifier(n_estimators=100, criterion='gini',max_depth=x_onehot.shape[1],bootstrap=True,random_state=0,class_weight='balanced',oob_score=True)
# fit the model
clf.fit(x_onehot, y)

clf.oob_score_  # accuracy estimated on the out-of-bag samples

Confusion Matrix

ax = sns.heatmap(confusion_matrix(y_test, y_predict),cmap='Blues',annot=True,fmt='g')
plt.title('confusion matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

Normalized Confusion Matrix

cm_norm = confusion_matrix(y_test, y_predict) / np.sum(confusion_matrix(y_test, y_predict))
cm_norm = np.around(cm_norm, decimals=2)
ax = sns.heatmap(cm_norm, cmap='Blues', annot=True, fmt='g')
plt.title('normalized confusion matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

Cost Matrix

cm = confusion_matrix(y_test, y_predict)
# with the earlier encoding, 1 = lost (churned) and 0 = retained; false positives are weighted 5x, false negatives 2x
TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]*5
FN = cm[1][0]*2
accuracy = round((TP+TN)/(TP+TN+FP+FN),2)
recall = round(TP/(TP+FN),2)
fscore = round(accuracy*recall/(accuracy+recall),2)  # the post's own blend of accuracy and recall, not the standard F1
cm_biz = np.vstack(([TN,FP],[FN,TP]))
cm_biz = pd.DataFrame(cm_biz)
ax = sns.heatmap(cm_biz,cmap='Blues',annot=True,fmt='g')
plt.title('cost matrix'+'\n'+'accuracy= '+str(accuracy)+'\n'+'recall= '+str(recall)+'\n'+'f_score= '+str(fscore)+'\n')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
