import matplotlib.pyplot as plt
import numpy as np
import os.path
from sklearn.preprocessing import Imputer
import csv
import pandas as pd
import warnings
import seaborn as sns
warnings.simplefilter("ignore")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import pydotplus
from IPython.display import Image
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly_express as px
from sklearn import preprocessing
# Columns with object dtype are treated as categorical; the rest as numeric.
cat_cols = [col for col in X.columns if X[col].dtype == 'O']
missing_values_table(X)

# Fill statistics are computed on the numeric columns only: reducing the whole
# frame would also try to aggregate the object columns.
numeric = X.drop(cat_cols, axis=1)

# Three alternative imputations. Each one rebinds `num`, so when they run in
# sequence the mode-imputed frame is the one that is kept.
num = numeric.fillna(numeric.mean())
num = numeric.fillna(numeric.median())

# DataFrame.mode() returns a frame (one row per tied mode). fillna() needs the
# first row as a Series; given a frame it aligns on the row index and leaves
# nearly every missing cell untouched.
modes = numeric.mode()
num = numeric.fillna(modes.iloc[0]) if len(modes) else numeric
# Section: sub_sample (down-sampling of the majority class)
def lower_sample_data(df, class_, *, label_col='retention', random_state=None):
    """Down-sample the majority rows to the size of the minority class.

    Rows whose ``label_col`` equals ``class_`` are kept in full. The remaining
    rows are randomly reduced, without replacement, to the same count, so the
    result never contains a duplicated majority row. If there are fewer
    remaining rows than minority rows, all of them are kept.

    Args:
        df: DataFrame that contains the ``label_col`` column.
        class_: Label value of the minority class, which is left untouched.
        label_col: Name of the label column.
        random_state: Optional seed or RandomState forwarded to
            ``DataFrame.sample``; ``None`` uses NumPy's global random state.

    Returns:
        A new DataFrame: the sampled majority rows followed by every
        minority row.
    """
    is_minority = df[label_col] == class_
    minority = df[is_minority]
    majority = df[~is_minority]

    # Never request more rows than the majority group actually has.
    n_keep = min(len(minority), len(majority))
    if n_keep:
        sampled = majority.sample(
            n=n_keep, replace=False, random_state=random_state
        )
    else:
        # Nothing to sample from (or nothing to match): keep an empty slice
        # so the column layout of the result is unchanged.
        sampled = majority.iloc[:0]
    return pd.concat([sampled, minority])
import re

# Balance the classes: keep every 'lost' row and down-sample the rest.
data = lower_sample_data(data, 'lost')
data['retention'].value_counts()


def _brand_version(name):
    """Return the first digit in *name* as an int, or 'null' if there is none."""
    match = re.search(r'\d', name)
    return int(match.group()) if match else 'null'


# Digits only: the first digit that appears in the brand string.
data['brand_version'] = data['brand'].apply(_brand_version)

# Brand family. Membership must be tested with `in`: str.find() returns -1
# (truthy) when the text is absent and 0 (falsy) when it is at the start, so
# using its result as a boolean inverts the classification.
data['brand_class'] = data['brand'].apply(
    lambda x: '小米' if '小米' in x else ('红米' if '红米' in x else 'others')
)

# English only: lowercase ASCII letters and spaces. The comma that used to be
# inside the character class was matched literally and has been removed.
uncn = re.compile(r'[a-z ]')
data['brand_series'] = data['brand'].apply(
    lambda x: "".join(uncn.findall(x.lower()))
)

# English letters and digits only. Inside a character class only the leading
# '^' negates; any further '^' is a literal caret and would be kept.
data['brand_detail'] = data['brand'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9]+', '', x)
)
def price_map(x):
    """Return the ordinal code of a price-range label.

    The six known range labels map to 1..6 in ascending price order; any
    other value maps to 7.
    """
    known_bands = (
        '0-600',
        '600-1000',
        '1000-1500',
        '1500-2000',
        '2000-3000',
        '3000-4000',
    )
    for code, band in enumerate(known_bands, start=1):
        if x == band:
            return code
    return 7
# Ordinal price band derived from the raw price-range label.
data['price_band'] = data['price'].apply(price_map)

# Mark every missing cell of X with the literal string 'missing'.
X = X.fillna('missing')

# Integer-encode each categorical column in place; values are cast to str
# first. The same encoder object is re-fitted for every column.
le = preprocessing.LabelEncoder()
for col in cat_cols:
    as_text = cat_labelcoder[col].astype('str')
    cat_labelcoder[col] = le.fit_transform(as_text)
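# In theory, label-encoding and one-hot-encoding an object column are equivalent.
# However, a label encoder gives the categories an ordinal meaning, so the tree
# splits the feature as if it were a numeric variable. If the integer assigned to
# each category differs between runs, the values in the left and right subtrees
# differ as well, and the results are not reproducible.
# Therefore, label-encode only object variables that carry a real order
# (e.g. "bad, average, good, very good"); use one-hot encoding in all other cases.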
# Feature frame: every column except the row id and the target.
x = data.drop(columns=['id', 'retention'])

# Target as a one-column frame: 1 where retention is 'lost', 0 otherwise.
y = data['retention'].map(lambda label: 1 if label == 'lost' else 0).to_frame()

# Numeric columns side by side with the label-encoded categorical columns.
x_labelcoder = pd.concat((num, cat_labelcoder), axis='columns')
# Baseline forest: 10 bootstrapped gini trees, depth capped at 10, fixed seed.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=10,
    bootstrap=True,
    random_state=0,
).fit(x_onehot, y)
# Single class-balanced tree, fitted for visualisation.
# Depth follows log2 of the feature count but is kept at 1 or more:
# int(log2(1)) is 0, and scikit-learn rejects max_depth=0.
tree_depth = max(1, int(np.log2(x_onehot.shape[1])))
clf = tree.DecisionTreeClassifier(
    min_samples_split=0.1,
    max_depth=tree_depth,
    random_state=0,
    class_weight='balanced',
)
# Fit the model.
clf.fit(x_onehot, y)

# Important: export_graphviz reads class_names in the ascending order of
# clf.classes_. The target is encoded as 0/1 with 1 == 'lost', so the names are
# built from the fitted classes. Series.unique() returns values in order of
# first appearance, which can swap the labels shown in the rendered tree.
class_labels = {0: 'not lost', 1: 'lost'}
class_names = [class_labels.get(cls, str(cls)) for cls in clf.classes_]

# Export the fitted tree to DOT.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=list(x_onehot.columns),
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
# Rendered inline when run in an IPython / Jupyter front end.
Image(graph.create_png())
# Refit the 10-tree forest and read its per-feature importances.
clf = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=10,
    bootstrap=True,
    random_state=0,
)
# Fit the model.
clf.fit(x_onehot, y)

y_importances = clf.feature_importances_
x_importances = x_onehot.columns

# One row per feature, most important first.
importance_table = pd.DataFrame({'x': x_importances, 'y': y_importances})
df = importance_table.sort_values(by='y', ascending=False)

# Polar bar chart of the ten highest-ranked features.
top_ten = df.head(10)
reversed_palette = px.colors.sequential.Plotly3[-2::-1]
px.bar_polar(
    top_ten,
    r="y",
    theta="x",
    color="x",
    template='plotly_white',
    color_discrete_sequence=reversed_palette,
)
# Variant: class_weight='balanced'
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Split BEFORE fitting. Training on the full data and then scoring on a subset
# of that same data leaks the test rows into the model and inflates every
# metric in the report.
X_train, X_test, y_train, y_test = train_test_split(
    x_onehot, y, test_size=0.33, random_state=42
)

# Class-balanced forest, otherwise the same settings as the baseline model.
clf = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=10,
    bootstrap=True,
    random_state=0,
    class_weight='balanced',
)
# Fit the model on the training portion only. y is a one-column frame, so it
# is flattened to the 1-D shape the estimator expects.
clf.fit(X_train, np.ravel(y_train))

# Hard class predictions on the held-out rows.
y_predict = clf.predict(X_test)
print(classification_report(y_test, y_predict))
from sklearn.metrics import auc, roc_auc_score, roc_curve

# roc_curve takes the true labels and a continuous score. Hard 0/1 predictions
# give a curve with a single interior point, so the predicted probability of
# the positive class (label 1) is used instead.
positive_col = list(clf.classes_).index(1)
y_score = clf.predict_proba(X_test)[:, positive_col]

fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Only plt.plot(fpr, tpr) is needed for the curve; roc_auc just records the
# area under it, as computed by auc().
plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
plt.xlabel("FPR (False Positive Rate)")
plt.ylabel("TPR (True Positive Rate)")
plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)"% (roc_auc))
# Without a legend the label passed to plt.plot is never shown.
plt.legend(loc='lower right')
plt.show()

roc_auc_score(y_test, y_score)
# 100-tree balanced forest with out-of-bag scoring enabled; the depth limit is
# set to the number of feature columns.
n_features = x_onehot.shape[1]
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=n_features,
    bootstrap=True,
    random_state=0,
    class_weight='balanced',
    oob_score=True,
)
# Fit the model.
clf.fit(x_onehot, y)

# Out-of-bag score of the fitted forest.
clf.oob_score_
from sklearn.metrics import confusion_matrix

# Compute the matrix once and reuse it for both views.
cm_counts = confusion_matrix(y_test, y_predict)

# Raw counts. Each heatmap gets its own figure so the second one is not drawn
# on top of the first when this runs outside a notebook cell.
plt.figure()
ax = sns.heatmap(cm_counts, cmap='Blues', annot=True, fmt='g')
plt.title('confusion matrix')
plt.ylabel('True Lable')
plt.xlabel('Predicted Lable')

# Same matrix as a share of all test rows, rounded to two decimals.
cm_fraction = np.around(cm_counts / np.sum(cm_counts), decimals=2)
plt.figure()
ax = sns.heatmap(cm_fraction, cmap='Blues', annot=True, fmt='g')
plt.title('confusion matrix')
plt.ylabel('True Lable')
plt.xlabel('Predicted Lable')
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_predict)

# Label meaning follows the encoding of y: 1 = 'lost' (positive class),
# 0 = every other retention value. Rows are true labels, columns predictions.
TP = cm[1][1]
TN = cm[0][0]
# Errors are weighted before they enter the metrics: a false positive counts
# 5 times and a false negative twice.
# NOTE(review): presumably business cost weights - confirm the 5 and 2.
FP = cm[0][1]*5
FN = cm[1][0]*2

accuracy = round((TP+TN)/(TP+TN+FP+FN),2)
recall = round(TP/(TP+FN),2)

# F-score is the harmonic mean of precision and recall, 2PR / (P + R). It is
# computed from the unrounded cost-weighted values and rounded once at the end.
precision_raw = TP/(TP+FP)
recall_raw = TP/(TP+FN)
if precision_raw + recall_raw:
    fscore = round(2*precision_raw*recall_raw/(precision_raw+recall_raw),2)
else:
    fscore = 0.0

# Cost-weighted matrix in the same [[TN, FP], [FN, TP]] layout as cm.
cm_biz = np.vstack(([TN,FP],[FN,TP]))
cm_biz = pd.DataFrame(cm_biz)
ax = sns.heatmap(cm_biz,cmap='Blues',annot=True,fmt='g')
plt.title('cost matrix'+'\n'+'accuracy= '+str(accuracy)+'\n'+'recall= '+str(recall)+'\n'+'f_score= '+str(fscore)+'\n')
plt.ylabel('True Lable')
plt.xlabel('Predicted Lable')