This post documents a complete data-mining workflow from start to finish.
First, we import the packages needed for the experiment:
import pandas as pd
import numpy as np
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
# Packages for label encoding, model selection, and model evaluation
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import metrics
# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Configure visualization defaults
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8
Next, we load the data and take a first look at it:
data_raw=pd.read_csv('data/train.csv')
# Keep a deep copy of the raw data
data1=data_raw.copy(deep=True)
print(data_raw.info())
data_raw.sample(10)
Checking for missing values, we find that the Age and Cabin features have many missing entries:
print(data_raw.isnull().sum())
print("-"*30)
data_raw.describe(include = 'all')
Next, we do some preliminary cleaning and fill in the missing values: Age and Fare can be imputed with the median, and Embarked with the mode.
# Fill missing Age values with the median
data1['Age'].fillna(data1['Age'].median(), inplace = True)
# Fill missing Embarked values with the mode
data1['Embarked'].fillna(data1['Embarked'].mode()[0], inplace = True)
# Fill missing Fare values with the median
data1['Fare'].fillna(data1['Fare'].median(), inplace = True)
# Drop features we cannot make use of
drop_column = ['PassengerId','Cabin', 'Ticket']
data1.drop(drop_column, axis=1, inplace = True)
print(data1.isnull().sum())
The printout confirms that the cleaned data no longer contains any missing values.
Next, to describe the data better, we engineer several new features:

- FamilySize: the number of family members aboard
- IsAlone: whether the passenger traveled alone
- Title: the passenger's title (Mr, Miss, Mrs, and so on)
- FareBin: Fare binned into 4 quantile-based segments
- AgeBin: Age binned into 5 equal-width segments

Binning lets us apply models that can only handle discrete features. The Title feature is the key one: it is extracted by splitting the Name field and takes the values Mr, Mrs, Master, Miss, and Misc. Titles reflect a passenger's social standing, which makes this a very informative feature.
data1['FamilySize'] = data1 ['SibSp'] + data1['Parch'] + 1
# Whether the passenger is traveling alone
data1['IsAlone'] = 1
data1.loc[data1['FamilySize'] > 1, 'IsAlone'] = 0
# Extract the title (Mr, Miss, Mrs, ...) from the Name field
data1['Title'] = data1['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
# qcut bins the column into 4 quantile-based (equal-count) segments
data1['FareBin'] = pd.qcut(data1['Fare'], 4)
# cut bins the column into 5 equal-width segments
data1['AgeBin'] = pd.cut(data1['Age'].astype(int), 5)
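# (Aside, not in the original: a made-up illustration of the qcut/cut difference.
#  pd.qcut(pd.Series([1, 2, 3, 4, 100]), 2) splits at the median, giving bins with
#  roughly equal counts, while pd.cut(pd.Series([1, 2, 3, 4, 100]), 2) splits the
#  value range in half, so the outlier leaves one bin nearly empty. Equal-count
#  bins are why qcut suits the heavily skewed Fare.)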
# Tidy up the Title feature
stat_min = 10 # titles rarer than this threshold are grouped into the catch-all 'Misc'
title_names = (data1['Title'].value_counts() < stat_min)
# Replace titles that occur fewer than stat_min times with 'Misc'
data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)
print(data1['Title'].value_counts())
print("-"*30)
# Check the data again
data1.info()
data1.sample(10)
The statistics of the processed Title feature, and a sample of the processed data, appear in the output above.
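To make the Title extraction concrete, here is a minimal sketch of how the two-step split behaves on a single made-up name (illustrative, not a row from the data):

s = pd.Series(['Braund, Mr. Owen Harris'])
# split on ', ' -> ['Braund', 'Mr. Owen Harris']; keep column 1
# split on '.'  -> ['Mr', ' Owen Harris']; keep column 0
print(s.str.split(", ", expand=True)[1].str.split(".", expand=True)[0][0])  # 'Mr'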
Next, we encode the string-valued categorical features as integers:
# Encode the categorical features
label = LabelEncoder()
data1['Sex_Code'] = label.fit_transform(data1['Sex'])
data1['Embarked_Code'] = label.fit_transform(data1['Embarked'])
data1['Title_Code'] = label.fit_transform(data1['Title'])
data1['AgeBin_Code'] = label.fit_transform(data1['AgeBin'])
data1['FareBin_Code'] = label.fit_transform(data1['FareBin'])
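As a quick sanity check on what LabelEncoder does, here is a tiny sketch with made-up values; classes are assigned integer codes in sorted order:

le = LabelEncoder()
print(le.fit_transform(['S', 'C', 'Q', 'S']))  # [2 0 1 2]
print(le.classes_)                             # ['C' 'Q' 'S']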
Next, I set up two feature lists:

- data1_x: the original features, whose names are kept for readable charts
- data1_x_bin: the encoded and binned features, used as inputs to the models

# Target label
Target = ['Survived']
# Original features
data1_x = ['Sex','Pclass', 'Embarked', 'Title','SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone'] #pretty name/values for charts
data1_x_calc = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code','SibSp', 'Parch', 'Age', 'Fare'] #coded for algorithm calculation
data1_xy = Target + data1_x
print('Original X Y: ', data1_xy, '\n')
# Discretized features after feature engineering
data1_x_bin = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']
data1_xy_bin = Target + data1_x_bin
print('Bin X Y: ', data1_xy_bin, '\n')
OK — with imputation, encoding, and binning complete, the data is ready for modeling.
Next, we visualize the distribution of the passengers: after PCA reduces the data to three dimensions, we plot it in 3-D space, with one color marking passengers who survived and the other marking those who died.
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
def plot_pca(num, data, label):
    pca = PCA(n_components=num)
    X_pca = pca.fit_transform(data)
    print(pca.components_)
    # Split the projected points by label (1 = survived, 0 = died)
    X_survived = np.array([x for i, x in enumerate(X_pca) if label[i] == 1])
    X_died = np.array([x for i, x in enumerate(X_pca) if label[i] == 0])
    if num == 3:
        fig = plt.figure(figsize=[10, 15])
        ax = Axes3D(fig)
        ax.set_zlabel('Z', fontdict={'size': 15, 'color': 'red'})
        ax.set_ylabel('Y', fontdict={'size': 15, 'color': 'red'})
        ax.set_xlabel('X', fontdict={'size': 15, 'color': 'red'})
        ax.scatter(X_survived[:, 0], X_survived[:, 1], X_survived[:, 2])
        ax.scatter(X_died[:, 0], X_died[:, 1], X_died[:, 2])
    elif num == 2:
        plt.figure(figsize=[10, 10])
        plt.scatter(X_survived[:, 0], X_survived[:, 1])
        plt.scatter(X_died[:, 0], X_died[:, 1])
    else:
        print('only 2-D and 3-D projections are supported')
%matplotlib inline
plot_pca(3, data1[data1_x_bin], data1['Survived'])
The call above produces the 3-D scatter plot of the passenger data.
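How faithful this 3-D picture is depends on how much variance the first three principal components capture; a quick check (a sketch — the printed numbers depend on the data):

pca = PCA(n_components=3)
pca.fit(data1[data1_x_bin])
print(pca.explained_variance_ratio_)        # variance share of each component
print(pca.explained_variance_ratio_.sum())  # total share kept by the 3-D projection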
Then we split the dataset into training and test sets. Note that train_test_split defaults to a 75:25 split, which matches the shapes printed below.
# Two train/test splits: one on the raw coded features, one on the binned features
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(data1[data1_x_calc], data1[Target], random_state = 0)
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = model_selection.train_test_split(data1[data1_x_bin], data1[Target] , random_state = 0)
print("Data1 Shape: {}".format(data1.shape))
print("Train1 Shape: {}".format(train1_x.shape))
print("Test1 Shape: {}".format(test1_x.shape))
train1_x_bin.head()
The shapes of the resulting splits are:
Data1 Shape: (891, 19)
Train1 Shape: (668, 8)
Test1 Shape: (223, 8)
The training set contains 668 samples, the test set contains 223, and this split uses 8 features.
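If an exact 80:20 split is preferred, test_size can be set explicitly; stratifying on the label also keeps the survival ratio consistent between the two sets. A minimal sketch (the variable names are illustrative):

tr_x, te_x, tr_y, te_y = model_selection.train_test_split(
    data1[data1_x_bin], data1[Target],
    test_size=0.2, stratify=data1[Target[0]], random_state=0)
print(tr_x.shape, te_x.shape)  # (712, 7) and (179, 7)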
Let's first look at how each feature correlates with the label:
for x in data1_x:
    if data1[x].dtype != 'float64':
        print('Survival Correlation by:', x)
        print(data1[[x, Target[0]]].groupby(x, as_index=False).mean())
        print('-'*30, '\n')
# Cross-tabulation of Title vs. the label
print(pd.crosstab(data1['Title'], data1[Target[0]]))
Box plots and histograms of the Age, Fare, and FamilySize features:
plt.figure(figsize=[16,12])
plt.subplot(231)
plt.boxplot(x=data1['Fare'], showmeans = True, meanline = True)
plt.title('Fare Boxplot')
plt.ylabel('Fare ($)')
plt.subplot(232)
plt.boxplot(data1['Age'], showmeans = True, meanline = True)
plt.title('Age Boxplot')
plt.ylabel('Age (Years)')
plt.subplot(233)
plt.boxplot(data1['FamilySize'], showmeans = True, meanline = True)
plt.title('Family Size Boxplot')
plt.ylabel('Family Size (#)')
plt.subplot(234)
plt.hist(x = [data1[data1['Survived']==1]['Fare'], data1[data1['Survived']==0]['Fare']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Fare Histogram by Survival')
plt.xlabel('Fare ($)')
plt.ylabel('# of Passengers')
plt.legend()
plt.subplot(235)
plt.hist(x = [data1[data1['Survived']==1]['Age'], data1[data1['Survived']==0]['Age']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Age Histogram by Survival')
plt.xlabel('Age (Years)')
plt.ylabel('# of Passengers')
plt.legend()
plt.subplot(236)
plt.hist(x = [data1[data1['Survived']==1]['FamilySize'], data1[data1['Survived']==0]['FamilySize']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Family Size Histogram by Survival')
plt.xlabel('Family Size (#)')
plt.ylabel('# of Passengers')
plt.legend()
Bar plots and point plots for the Embarked, Pclass, and IsAlone features:
fig, saxis = plt.subplots(2, 3,figsize=(16,12))
sns.barplot(x = 'Embarked', y = 'Survived', data=data1, ax = saxis[0,0])
sns.barplot(x = 'Pclass', y = 'Survived', order=[1,2,3], data=data1, ax = saxis[0,1])
sns.barplot(x = 'IsAlone', y = 'Survived', order=[1,0], data=data1, ax = saxis[0,2])
sns.pointplot(x = 'FareBin', y = 'Survived', data=data1, ax = saxis[1,0])
sns.pointplot(x = 'AgeBin', y = 'Survived', data=data1, ax = saxis[1,1])
sns.pointplot(x = 'FamilySize', y = 'Survived', data=data1, ax = saxis[1,2])
Box plots and violin plots of Pclass vs. Fare, Pclass vs. Age, and Pclass vs. FamilySize:
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(14,12))
sns.boxplot(x = 'Pclass', y = 'Fare', hue = 'Survived', data = data1, ax = axis1)
axis1.set_title('Pclass vs Fare Survival Comparison')
sns.violinplot(x = 'Pclass', y = 'Age', hue = 'Survived', data = data1, split = True, ax = axis2)
axis2.set_title('Pclass vs Age Survival Comparison')
sns.boxplot(x = 'Pclass', y ='FamilySize', hue = 'Survived', data = data1, ax = axis3)
axis3.set_title('Pclass vs Family Size Survival Comparison')
Bar plots of survival by Sex, broken down by Embarked, Pclass, and IsAlone:
fig, qaxis = plt.subplots(1,3,figsize=(14,12))
sns.barplot(x = 'Sex', y = 'Survived', hue = 'Embarked', data=data1, ax = qaxis[0])
qaxis[0].set_title('Sex vs Embarked Survival Comparison')
sns.barplot(x = 'Sex', y = 'Survived', hue = 'Pclass', data=data1, ax = qaxis[1])
qaxis[1].set_title('Sex vs Pclass Survival Comparison')
sns.barplot(x = 'Sex', y = 'Survived', hue = 'IsAlone', data=data1, ax = qaxis[2])
qaxis[2].set_title('Sex vs IsAlone Survival Comparison')
Point plots of survival by Sex against FamilySize and Pclass:
fig, (maxis1, maxis2) = plt.subplots(1, 2,figsize=(14,12))
#how does family size factor with sex & survival compare
sns.pointplot(x="FamilySize", y="Survived", hue="Sex", data=data1,
palette={"male": "blue", "female": "pink"},
markers=["*", "o"], linestyles=["-", "--"], ax = maxis1)
#how does class factor with sex & survival compare
sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=data1,
palette={"male": "blue", "female": "pink"},
markers=["*", "o"], linestyles=["-", "--"], ax = maxis2)
Point plots of Pclass vs. Survived, faceted by Embarked:
e = sns.FacetGrid(data1, col = 'Embarked')
e.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', ci=95.0, palette = 'deep')
e.add_legend()
Density (KDE) plot of Age by survival:
a = sns.FacetGrid( data1, hue = 'Survived', aspect=4 )
a.map(sns.kdeplot, 'Age', shade= True )
a.set(xlim=(0 , data1['Age'].max()))
a.add_legend()
Histograms of survival (over Age) for each Sex × Pclass combination:
h = sns.FacetGrid(data1, row = 'Sex', col = 'Pclass', hue = 'Survived')
h.map(plt.hist, 'Age', alpha = .75)
h.add_legend()
A pair plot of the whole dataset:
pp = sns.pairplot(data1, hue = 'Survived', palette = 'deep', size=1.2, diag_kind = 'kde', diag_kws=dict(shade=True), plot_kws=dict(s=10) )  # note: newer seaborn versions rename size= to height=
pp.set(xticklabels=[])
#correlation heatmap of dataset
def correlation_heatmap(df):
    _, ax = plt.subplots(figsize=(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        df.corr(),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12}
    )
    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(data1)
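One caveat: recent pandas versions make DataFrame.corr() raise on non-numeric columns instead of silently dropping them, so on a newer stack it is safer to pass only the numeric columns:

correlation_heatmap(data1.select_dtypes(include=[np.number]))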
Now we initialize a series of models. To compare performance across model families, I put most of the common classifiers into a single list, MLA.
# List of candidate algorithms
MLA = [
#Ensemble Methods
ensemble.AdaBoostClassifier(),
ensemble.BaggingClassifier(),
ensemble.ExtraTreesClassifier(),
ensemble.GradientBoostingClassifier(),
ensemble.RandomForestClassifier(),
#Gaussian Processes
gaussian_process.GaussianProcessClassifier(),
#GLM
linear_model.LogisticRegressionCV(),
linear_model.PassiveAggressiveClassifier(),
linear_model.RidgeClassifierCV(),
linear_model.SGDClassifier(),
linear_model.Perceptron(),
#Naive Bayes
naive_bayes.BernoulliNB(),
naive_bayes.GaussianNB(),
#Nearest Neighbor
neighbors.KNeighborsClassifier(),
#SVM
svm.SVC(probability=True),
svm.NuSVC(probability=True),
svm.LinearSVC(),
#Trees
tree.DecisionTreeClassifier(),
tree.ExtraTreeClassifier(),
#Discriminant Analysis
discriminant_analysis.LinearDiscriminantAnalysis(),
discriminant_analysis.QuadraticDiscriminantAnalysis(),
#xgboost:
XGBClassifier()
]
Then we build a DataFrame to record each model's performance, and use cross-validation to compute each model's accuracy on the training and test folds.
# Cross-validation splitter for scoring
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%
# Create a table to compare the algorithms
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)
# Start from the true labels; each model's predictions will be added as a column
MLA_predict = data1[Target].copy()
row_index = 0
for alg in MLA:
    print('algorithm: ', alg)
    # Set the model's name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    # Score the algorithm with cross-validation, using the discretized features
    cv_results = model_selection.cross_validate(alg, data1[data1_x_bin], data1[Target], cv = cv_split, return_train_score=True)
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    # If this is an unbiased random sample, +/-3 standard deviations from the mean
    # should capture 99.7% of the subsets (the 3-sigma rule) -- the worst case to expect
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3
    # Save each model's predictions
    alg.fit(data1[data1_x_bin], data1[Target])
    MLA_predict[MLA_name] = alg.predict(data1[data1_x_bin])
    row_index += 1
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare
#MLA_predict
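Since MLA_predict now holds the true label alongside every model's predictions, we can also reuse the correlation_heatmap helper defined earlier to see how similar the models are to one another; models whose predictions are highly correlated add little diversity to a potential ensemble:

correlation_heatmap(MLA_predict)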
Now let's look at each model's statistical metrics, i.e. compute precision, recall, and F1-score for each model:
for model_name in list(MLA_compare['MLA Name'].values):
    print(model_name, " : ")
    print(metrics.classification_report(MLA_predict['Survived'], MLA_predict[model_name]))
    print("-"*30)
Only part of the output is shown here.
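One caveat worth flagging: these reports score in-sample predictions, since each model was refit on the full dataset before predicting on it, so the numbers are optimistic. Cross-validated predictions give a fairer estimate; a sketch for one model:

from sklearn.model_selection import cross_val_predict
y_cv = cross_val_predict(ensemble.GradientBoostingClassifier(),
                         data1[data1_x_bin], data1[Target].values.ravel(), cv=5)
print(metrics.classification_report(data1[Target].values.ravel(), y_cv))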
Next we visualize each classifier's accuracy. Since these models all use default parameters, none of them has reached its best achievable performance yet.
sns.barplot(x='MLA Test Accuracy Mean', y = 'MLA Name', data = MLA_compare, color = 'm')
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Algorithm')
The assignment calls for modeling the data with GradientBoostingClassifier and LogisticRegression, so we pick these two models for hyperparameter tuning to push the metrics as high as possible.
# Hyperparameter grids for GridSearchCV
grid_max_depth = [2, 4, 6, 8, 10, None]
grid_bool = [True, False]
grid_seed = [0]
grid_param = [
[{
#GradientBoostingClassifier
'learning_rate': [.05],
'n_estimators': [300],
'max_depth': grid_max_depth, #default=3
'random_state': grid_seed
}],
[{
#LogisticRegressionCV
'fit_intercept': grid_bool,
'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
'random_state': grid_seed
}],
]
estimator_list=[
('gbc', ensemble.GradientBoostingClassifier()),
('lr', linear_model.LogisticRegressionCV())
]
for clf, param in zip(estimator_list, grid_param):
    print('clf: ', clf[1].__class__.__name__)
    best_search = model_selection.GridSearchCV(estimator = clf[1], param_grid = param, cv = cv_split, scoring = 'roc_auc')
    best_search.fit(data1[data1_x_bin], data1[Target])
    best_param = best_search.best_params_
    print('The best parameter for {} is {}'.format(clf[1].__class__.__name__, best_param))
    clf[1].set_params(**best_param)
The search finds good parameters for the two models on this dataset, printed above.
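GridSearchCV also records the best cross-validated score. Printing it inside the loop above, right after the fit call, makes the two searches easier to compare (a small addition, not in the original):

print('{}: best CV roc_auc = {:.4f}'.format(
    clf[1].__class__.__name__, best_search.best_score_))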
With the tuned parameters in place, we again use cross-validation to compute the models' accuracy on the training and test folds.
# Cross-validation splitter for scoring
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%
# Create a table to compare the two tuned models
estim_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
estim_compare = pd.DataFrame(columns = estim_columns)
# Start from the true labels; predictions will be added as columns
estim_predict = data1[Target].copy()
row_index = 0
for estimator in estimator_list:
    alg = estimator[1]
    # Set the model's name and parameters
    estim_name = alg.__class__.__name__
    estim_compare.loc[row_index, 'MLA Name'] = estim_name
    estim_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    # Score the algorithm with cross-validation
    cv_results = model_selection.cross_validate(alg, data1[data1_x_bin], data1[Target], cv = cv_split, return_train_score=True)
    estim_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    estim_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    estim_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    # 3-sigma rule: +/-3 std from the mean captures 99.7% of the subsets -- the worst case to expect
    estim_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3
    # Save each model's predictions
    alg.fit(data1[data1_x_bin], data1[Target])
    estim_predict[estim_name] = alg.predict(data1[data1_x_bin])
    row_index += 1
estim_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
The models' accuracies are shown below; GradientBoostingClassifier is somewhat more accurate than LogisticRegression on the test folds.
for model in estimator_list:
    alg = model[1]
    alg_name = alg.__class__.__name__
    print(alg_name, " : ")
    print(metrics.classification_report(estim_predict['Survived'], estim_predict[alg_name]))
    print("-"*30)
#Plot Accuracy Summary
import itertools
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Compute confusion matrix
cnf_matrix = metrics.confusion_matrix(data1['Survived'], estim_predict['GradientBoostingClassifier'])
np.set_printoptions(precision=2)
class_names = ['Dead', 'Survived']
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
title='Normalized confusion matrix')
# Compute confusion matrix
cnf_matrix = metrics.confusion_matrix(data1['Survived'], estim_predict['LogisticRegressionCV'])
np.set_printoptions(precision=2)
class_names = ['Dead', 'Survived']
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
title='Normalized confusion matrix')
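Beyond confusion matrices, a ROC curve is a common complement. A minimal sketch for the tuned GradientBoostingClassifier, scored on the held-out split created earlier (the model was refit on the full data above, so the curve will be optimistic):

gbc = estimator_list[0][1]  # the fitted GradientBoostingClassifier
probs = gbc.predict_proba(test1_x_bin)[:, 1]
fpr, tpr, _ = metrics.roc_curve(test1_y_bin.values.ravel(), probs)
plt.figure()
plt.plot(fpr, tpr, label='GBC (AUC = {:.3f})'.format(metrics.auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()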
All in all, this felt like a very rewarding exercise.