import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
# Load the Pokemon dataset twice: `df` holds only the first 400 rows (the
# working sample), `all_df` holds every row of the file.
df = pd.read_csv("./data/pokemon.csv", nrows=400)
all_df = pd.read_csv("./data/pokemon.csv")

# Keep the rows whose primary type is "water" / "normal" in the 400-row sample.
water_class = df[df.type1 == 'water']
normal_class = df[df.type1 == 'normal']

# Same selection on the complete dataset. These were previously filtered from
# `df`, which made them exact duplicates of the sample subsets above and left
# `all_df` unused.
all_water_class = all_df[all_df.type1 == 'water']
all_normal_class = all_df[all_df.type1 == 'normal']

# Count both classes in the sample and report each one's share of the
# combined water + normal rows (printed as p(water) / p(normal)).
water_class_num = len(water_class)
normal_class_num = len(normal_class)
print("water class: %d "% water_class_num)
print("normal class: %d " % normal_class_num)
two_class_total = water_class_num + normal_class_num
water_class_percent = water_class_num / two_class_total
normal_class_percent = normal_class_num / two_class_total
print("p(water) = %.2f" % water_class_percent)
print("p(normal) = %.2f" % normal_class_percent)
water class: 73
normal class: 60
p(water) = 0.55
p(normal) = 0.45
# Print the column labels of the 400-row sample to see which features exist.
print(df.columns)
Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
'against_electric', 'against_fairy', 'against_fight', 'against_fire',
'against_flying', 'against_ghost', 'against_grass', 'against_ground',
'against_ice', 'against_normal', 'against_poison', 'against_psychic',
'against_rock', 'against_steel', 'against_water', 'attack',
'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
'japanese_name', 'name', 'percentage_male', 'pokedex_number',
'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
'generation', 'is_legendary'],
dtype='object')
# Scatter plot of the sample: defense on the x-axis, special defense on the
# y-axis.
plt.scatter(x=df["defense"], y=df["sp_defense"])
plt.gca().set(xlabel="Defense", ylabel="Sp Defense")
plt.show()
# Report how many water-type rows there are, then plot their defense against
# their special defense.
print(all_water_class.shape[0])
plt.scatter(x=all_water_class["defense"], y=all_water_class["sp_defense"])
plt.gca().set(xlabel="Defense", ylabel="Sp Defense")
plt.show()
73
# Report how many normal-type rows there are, then plot their defense against
# their special defense.
print(all_normal_class.shape[0])
plt.scatter(x=all_normal_class["defense"], y=all_normal_class["sp_defense"])
plt.gca().set(xlabel="Defense", ylabel="Sp Defense")
plt.show()
60
55.06666666666667
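我们选择防御力和特殊防御力作为数据的特征,并假设每个类别的数据都服从某个二维正态分布。只要找到该正态分布的参数(均值 $\mu$ 和协方差矩阵 $\Sigma$),就可以用它的概率密度函数来估计样本属于该类别的概率。能够生成这些数据的正态分布有很多个,但它们生成这组数据的可能性各不相同。这里的似然值,就是某个概率分布生成这组数据的可能性大小;这个值越大,说明该分布越有可能是产生这些数据的分布,因此我们选择使似然值最大的那组参数(最大似然估计)。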
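我们如何求出使似然值最大的均值和方差?
- 求均值:$\mu^* = \frac{1}{N}\sum_{n=1}^{N} x_n$
- 求方差(协方差矩阵):$\Sigma^* = \frac{1}{N}\sum_{n=1}^{N} (x_n - \mu^*)(x_n - \mu^*)^T$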
# Feature matrix for the water class: one row per Pokemon, with the two
# columns (defense, sp_defense).
water_X = all_water_class[['defense','sp_defense']]
print(water_X.head())

# Mean vector: the per-column average over all rows.
mu = np.average(water_X,axis=0)
print(type(mu))
print(type(water_X.values))
print(water_X.values.shape)
print(mu.shape)

# Covariance matrix: (X - mu)^T (X - mu) / N, which is
# (features x features). The previous code multiplied in the opposite order,
# producing an (N x N) matrix, and then averaged every entry of it, which
# collapses to ~0 because the centered columns sum to zero.
centered = water_X.values - mu
sigma_ = np.dot(centered.T, centered) / len(centered)
print(sigma_)
defense sp_defense
6 65 64
7 80 80
8 120 115
53 48 50
54 78 80
(73, 2)
(2,)
-2.218696026092713e-15
# Evaluate a zero-mean 2-D Gaussian with a diagonal covariance on a 30 x 30
# grid spanning [-1, 1] in both directions, then draw the density as a 3-D
# surface.
x, y = np.mgrid[-1.0:1.0:30j, -1.0:1.0:30j]
xy = np.column_stack((x.ravel(), y.ravel()))  # one (x, y) point per row

mu = np.zeros(2)
sigma = np.array([2, 6])
# The entries of `sigma` are placed on the diagonal unchanged, i.e. they are
# used directly as the variances.
covariance = np.diag(sigma)
print(covariance)

# Density at every grid point, folded back into the grid's 2-D shape.
z = multivariate_normal(mean=mu, cov=covariance).pdf(xy).reshape(x.shape)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_surface(x, y, z)
plt.show()
[[2 0]
[0 6]]