数据集:diabetes.csv
参考书:《Machine Learning Mastery With Python: Understand Your Data, Create Accurate Models and Work Projects End-to-End》
获取链接:https://github.com/aoyinke/ML_learner
from pandas import read_csv

# Location of the dataset and the short column labels applied to it.
path = "diabetes.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]

# skiprows=1 drops the first line of the file so that `names` supplies the labels.
data = read_csv(path, skiprows=1, names=names)

# Peek at the first 5 rows.
print(data.head())

# Report the dimensions as (rows, columns).
print(data.shape)
"""
preg plas pres skin test mass pedi age class
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
(768, 9) 768行,9列
"""
# Inspect the dtype of every column. `data` is the DataFrame loaded earlier in
# this script; the assignment was missing, so `types` was an undefined name.
types = data.dtypes
print(types)
"""
preg int64
plas int64
pres int64
skin int64
test int64
mass float64
pedi float64
age int64
class int64
"""
from pandas import set_option

# Widen console output and show 3 decimal places so the summary table stays readable.
set_option('display.width', 100)
# Use the fully qualified option key: the bare 'precision' shorthand is
# ambiguous / rejected on newer pandas releases.
set_option('display.precision', 3)

# Descriptive statistics (count, mean, std, min, quartiles, max) for each column.
description = data.describe()
print(description)
# Number of rows per value of the 'class' column (class balance check).
class_counts = data.groupby(by='class').size()
print(class_counts)
"""
class
0 500
1 268
"""
from pandas import read_csv, set_option

# Reload the dataset. `path` and `names` are defined at the top of this script;
# the original referenced an undefined `filename`. skiprows=1 matches the first
# load so the file's first row is not parsed as a data row.
data = read_csv(path, names=names, skiprows=1)

set_option('display.width', 100)
# Fully qualified key; the bare 'precision' shorthand fails on newer pandas.
set_option('display.precision', 3)

# Pairwise Pearson correlation coefficients between all columns.
correlations = data.corr(method='pearson')
print(correlations)
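偏度(skewness)的定义为:$Sk = E\left[\left(\frac{X-\mu}{\sigma}\right)^{3}\right] = \frac{\mu_3}{\sigma^{3}}$
公式中,Sk——偏度;E——期望;μ——平均值;μ3——3阶中心矩;σ——标准差。在一般情形下,当统计数据为右偏分布时,Sk > 0,且 Sk 值越大,右偏程度越高;
当统计数据为左偏分布时,Sk < 0,且 Sk 值越小,左偏程度越高。当统计数据为对称分布时,显然有 Sk = 0。
所以我们应该注意处理偏度绝对值较大的变量。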
# Skewness of every column, computed down the rows (axis=0 is the default).
skew = data.skew(axis=0)
print(skew)
"""
preg 0.901674
plas 0.173754
pres -1.843608
skin 0.109372
test 2.272251
mass -0.428982
pedi 1.919911
age 1.129597
class 0.635017
"""
# Univariate histograms: one histogram per column.
# `from matplotlib.pyplot as plt` is not valid Python; a module alias needs `import ... as`.
import matplotlib.pyplot as plt
from pandas import read_csv

path = "diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# skiprows=1 drops the first line of the file so that `names` supplies the labels.
data = read_csv(path, names=names, skiprows=1)

data.hist()
plt.show()
密度图是快速了解每个属性分布的另一种方法
# Density plots: one subplot per column on a 3x3 grid, each with its own x-axis.
# The garbled quote characters around 'density' were replaced with real quotes.
data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
plt.show()
总结:
# Box-and-whisker plots: one subplot per column on a 3x3 grid, with
# independent x and y axes for every subplot.
data.plot(
    kind='box',
    subplots=True,
    layout=(3, 3),
    sharex=False,
    sharey=False,
)
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from pandas import read_csv

path = "diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# The original passed an undefined `filename`; use `path`, and skip the first
# line of the file as in the first load so it is not parsed as data.
data = read_csv(path, names=names, skiprows=1)
correlations = data.corr(method='pearson')  # Pearson correlation coefficients

# Plot the correlation matrix.
fig = plt.figure()  # the canvas
ax = fig.add_subplot(1, 1, 1)  # a single subplot (1 row, 1 column)
# Fix the colour scale to the full range of a correlation coefficient.
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)  # add the colour scale bar next to the matrix

# One tick per column (0 .. len(names) - 1), derived from `names` rather than hard-coded.
ticks = np.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Label the ticks with the column names instead of the default numeric indices.
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()
# `from matplotlib.pyplot as plt` is not valid Python; a module alias needs `import ... as`.
import matplotlib.pyplot as plt
from pandas import read_csv
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting module was removed.
from pandas.plotting import scatter_matrix

path = "diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# The original passed an undefined `filename`; use `path`, and skip the first
# line of the file as in the first load so it is not parsed as data.
data = read_csv(path, names=names, skiprows=1)

# Grid of pairwise scatter plots between all columns.
scatter_matrix(data)
plt.show()