以下内容主要来自天池比赛的论坛:https://tianchi.aliyun.com/notebook-ai/home#notebookLabId=85457&notebookType=PRIVATE&isHelp=false&operaType=5
EDA的价值主要在于熟悉数据集,了解数据集,对数据集进行验证来确定所获得数据集可以用于接下来的机器学习或者深度学习使用。
了解了数据集之后,下一步就是去了解变量之间的相互关系,以及变量与预测值之间存在的关系。
EDA 还能引导数据科学从业者完成数据处理以及特征工程的步骤,使数据集的结构和特征集让接下来的预测问题更加可靠。
完成对于数据的探索性分析,并对于数据进行一些图表或者文字总结并打卡。
加载数据后,配合题目中的数据说明,简略观察数据(head()+shape),大致了解有哪些特征
# Peek at the first and last five rows.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
pd.concat([Train_data.head(), Train_data.tail()])
# (rows, columns) of the training frame.
Train_data.shape
# How often each distinct price value occurs.
Train_data['price'].value_counts()
分离label即预测值
# Separate the label (the value to predict) from the training frame.
Y_train = Train_data['price']
下面这种按 dtype 区分数字特征与类别特征的方式,适用于没有直接做 label encoding 的数据;
本数据集并不适用,需要人为根据特征的实际含义来区分。
数字特征
# Numeric features: columns whose dtype is a NumPy number.
numeric_features = Train_data.select_dtypes(include=[np.number])
numeric_features.columns
# Categorical features: columns stored with object dtype.
# The `np.object` alias was removed in NumPy 1.24; the builtin `object` is equivalent.
categorical_features = Train_data.select_dtypes(include=[object])
categorical_features.columns
训练集类别特征的nunique分布
# For each entry of categorical_features, print its number of distinct values
# and the frequency of each value in the training set.
for cat_fea in categorical_features:
    print(cat_fea + "的特征分布如下:")
    print("{}特征有个{}不同的值".format(cat_fea, Train_data[cat_fea].nunique()))
    print(Train_data[cat_fea].value_counts())
测试集类别特征的nunique分布
# Same report as for the training set, computed on Test_data so the two
# distributions can be compared.
for cat_fea in categorical_features:
    print(cat_fea + "的特征分布如下:")
    print("{}特征有个{}不同的值".format(cat_fea, Test_data[cat_fea].nunique()))
    print(Test_data[cat_fea].value_counts())
# Features plotted against price, in the original panel order (ax1 .. ax10).
scatter_features = [
    'v_12', 'v_8',
    'v_0', 'power',
    'v_5', 'v_2',
    'v_6', 'v_1',
    'v_14', 'v_13',
]
# The axes were referenced but never created in the original snippet;
# a 5x2 grid, flattened row by row, supplies one axis per feature.
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(24, 20))
for feature, ax in zip(scatter_features, axes.flatten()):
    # Pair the label with a single feature so regplot can address both by name.
    scatter_frame = pd.concat([Y_train, Train_data[feature]], axis=1)
    sns.regplot(x=feature, y='price', data=scatter_frame,
                scatter=True, fit_reg=True, ax=ax)
1) 类别特征的nunique分布
# Print the number of distinct values of every entry in categorical_features.
for column_name in categorical_features:
    distinct_count = Train_data[column_name].nunique()
    print(distinct_count)
2) 类别特征箱形图可视化
# name and regionCode have far too many sparse categories, so only the
# denser categorical columns are plotted here.
categorical_features = [
    'model',
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'notRepairedDamage',
]
for c in categorical_features:
    Train_data[c] = Train_data[c].astype('category')
    if Train_data[c].isnull().any():
        # The 'MISSING' level has to be registered on the categorical
        # before fillna is allowed to use it.
        Train_data[c] = Train_data[c].cat.add_categories(['MISSING'])
        Train_data[c] = Train_data[c].fillna('MISSING')
def boxplot(x, y, **kwargs):
    """Draw a seaborn box plot of y grouped by x on the current axes.

    Extra keyword arguments are accepted, so the function can be handed to
    FacetGrid.map, but they are not forwarded to seaborn.
    """
    sns.boxplot(x=x, y=y)
    # Rotate the tick labels so long category names do not overlap.
    plt.xticks(rotation=90)
# Long format: one row per (price, variable, value) so each categorical
# column gets its own facet.
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
# `size` was renamed to `height` in seaborn 0.9.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, "value", "price")
3) 类别特征的小提琴图可视化
catg_list = categorical_features
target = 'price'
for catg in catg_list:
    # One figure per entry: price distribution within each category level.
    sns.violinplot(x=catg, y=target, data=Train_data)
    plt.show()
4) 类别特征的柱形图可视化
def bar_plot(x, y, **kwargs):
    """Draw a seaborn bar plot of y grouped by x on the current axes.

    Extra keyword arguments are accepted, so the function can be handed to
    FacetGrid.map, but they are not forwarded to seaborn.
    """
    sns.barplot(x=x, y=y)
    # Rotate the tick labels so long category names do not overlap.
    plt.xticks(rotation=90)
# Long format: one row per (price, variable, value) so each categorical
# column gets its own facet.
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
# `size` was renamed to `height` in seaborn 0.9.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(bar_plot, "value", "price")
5) 类别特征的每个类别频数可视化(count_plot)
def count_plot(x, **kwargs):
    """Draw a seaborn count plot of x on the current axes.

    Extra keyword arguments are accepted, so the function can be handed to
    FacetGrid.map, but they are not forwarded to seaborn.
    """
    sns.countplot(x=x)
    # Rotate the tick labels so long category names do not overlap.
    plt.xticks(rotation=90)
# Long format with only (variable, value) columns: one facet per entry of
# categorical_features.
f = pd.melt(Train_data, value_vars=categorical_features)
# `size` was renamed to `height` in seaborn 0.9.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")
import pandas_profiling
# Build a profiling report over the whole training frame.
# NOTE(review): pandas_profiling appears to have been renamed to
# ydata-profiling upstream — confirm before upgrading the environment.
pfr = pandas_profiling.ProfileReport(Train_data)
# Write the report to ./example.html.
pfr.to_file("./example.html")