# Example code follows:
import openpyxl
import numpy as np
import matplotlib.pyplot as plt
import math
import random
import sympy
import os
def array_frequent(lst):
    """Return the element of ``lst`` with the highest occurrence count.

    Ties are resolved by ``Counter.most_common``.  An empty ``lst`` raises
    ``IndexError`` because there is no top entry to pick.
    """
    from collections import Counter

    ranked = Counter(lst).most_common(1)
    winner, _occurrences = ranked[0]
    return winner
# file_data: the worksheet to read from
# row_end:   exclusive upper row bound of the column scan
def ReadInCol(file_data, ClassNum, row_end):
    """Split one worksheet column into a boys list and a girls list.

    Rows ``2 .. row_end - 1`` are visited.  For every row the flag stored in
    column 2 decides the destination: a truthy flag sends the value to the
    first (boys) list, anything else to the second (girls) list.

    Args:
        file_data: object exposing ``cell(row, column).value``.
        ClassNum: feature number; the value is taken from column
            ``ClassNum + 1``.
        row_end: first row that is NOT read.

    Returns:
        Tuple ``(boys_values, girls_values)``.
    """
    value_column = ClassNum + 1
    boys_values = []
    girls_values = []
    for row in range(2, row_end):
        flag = file_data.cell(row, 2).value
        target = boys_values if flag else girls_values
        target.append(file_data.cell(row, value_column).value)
    return boys_values, girls_values
# Histogram of the boys' and girls' 50 m sprint results
def Meter50_HistShow(sheet, row_end=210):
    """Draw overlaid, normalised histograms of the 50 m results of both groups.

    The figure is only prepared on the current matplotlib axes; the caller is
    responsible for ``plt.show()``.

    Args:
        sheet: worksheet handed to ``ReadInCol``.  The previous version
            ignored this argument and read the module-level ``Tain_sheet``.
        row_end: exclusive upper row bound handed to ``ReadInCol``
            (defaults to the formerly hard-coded 210).
    """
    # Feature numbers: 1 sex, 2 native place, 3 height, 4 weight, 5 shoe size,
    # 6 50 m time, 7 vital capacity, 8 favourite colour, 9 favourite sport,
    # 10 favourite literature
    boy_times, girl_times = ReadInCol(sheet, 6, row_end)

    lowest = min(min(boy_times), min(girl_times))
    highest = max(max(boy_times), max(girl_times))
    # Shared bin edges so both histograms are directly comparable:
    # roughly two edges per unit of the (rounded) data range.
    edges = np.linspace(lowest, highest, round(round(highest - lowest) * 2))

    for times, colour in ((boy_times, 'yellowgreen'), (girl_times, 'pink')):
        plt.hist(times, edges, density=1, color=colour, histtype='bar',
                 alpha=0.5, edgecolor='white', linewidth=4)

    # NOTE(review): the x axis carries 50 m times although the label reads
    # 'X_hight' - label text left untouched, confirm the intended wording.
    plt.xlabel('X_hight')
    plt.ylabel('Y_Frequency')
    plt.title('50MeterTime DistributionHist')
# Maximum-likelihood estimate of mean and variance from a random subsample
# BoySouceData: full boys data set
# GirSouceData: full girls data set
# Proport:      fraction of each data set that is drawn at random
def MyMLE_Mean_Vari(BoySouceData, GirSouceData, Proport):
    """Estimate mean and variance of both groups from random subsamples.

    ``round(Proport * len(data))`` distinct positions are drawn from each data
    set with ``random.sample``.  For each subsample the mean is the sum of
    ``value / n`` and the variance is the sum of squared deviations divided
    by ``n`` (not ``n - 1``).

    Returns:
        Tuple ``(boy_params, girl_params)``; each is ``[mean, variance]``.

    Raises:
        ZeroDivisionError: when a subsample would contain zero elements.
    """
    boy_count = round(Proport * len(BoySouceData))
    girl_count = round(Proport * len(GirSouceData))

    # Both index draws happen before any arithmetic, boys first.
    boy_picks = random.sample(range(0, len(BoySouceData)), boy_count)
    girl_picks = random.sample(range(0, len(GirSouceData)), girl_count)

    boy_sample = [BoySouceData[pos] for pos in boy_picks]
    girl_sample = [GirSouceData[pos] for pos in girl_picks]

    boy_weight = 1.0 / float(boy_count)
    girl_weight = 1.0 / float(girl_count)

    estimates = []
    for sample, weight in ((boy_sample, boy_weight), (girl_sample, girl_weight)):
        mean = 0
        for value in sample:
            mean = mean + weight * value
        squared_dev = 0
        for value in sample:
            squared_dev = squared_dev + math.pow(value - mean, 2)
        # element 0 is the mean, element 1 is the variance
        estimates.append([mean, weight * squared_dev])

    return estimates[0], estimates[1]
# Print the maximum-likelihood estimates for height, weight and 50 m results
# sheet:       worksheet with the data
# MLE_Proport: random sampling fraction used for the estimates
def MLE_ProportShow(sheet, MLE_Proport):
    """Print estimated vs. full-data mean and variance for both groups.

    For height, weight and 50 m time the mean/variance estimated by
    ``MyMLE_Mean_Vari`` on a random subsample are printed next to
    ``np.mean`` / ``np.var`` of the complete column.

    Args:
        sheet: worksheet handed to ``ReadInCol``.
        MLE_Proport: sampling fraction forwarded to ``MyMLE_Mean_Vari``.
    """
    BoyHig, GirHig = ReadInCol(sheet, 3, 737)  # height, cm
    BoyWei, GirWei = ReadInCol(sheet, 4, 737)  # weight, kg
    Boy50m, Gir50m = ReadInCol(sheet, 6, 210)  # 50 m sprint, s

    boy_hig_est, gir_hig_est = MyMLE_Mean_Vari(BoyHig, GirHig, MLE_Proport)
    boy_wei_est, gir_wei_est = MyMLE_Mean_Vari(BoyWei, GirWei, MLE_Proport)
    # Fix: the girls' estimate used to be computed from the boys' data
    # because Boy50m was passed for both arguments.
    boy_50m_est, gir_50m_est = MyMLE_Mean_Vari(Boy50m, Gir50m, MLE_Proport)

    print(" 最大似然估计参数 实际参数", " (随机比例", MLE_Proport, ")")
    print("项目 性别 平均数 方差 平均数 方差")

    # (row label, [estimated mean, estimated variance], full column)
    table = (
        ("身高 男 ", boy_hig_est, BoyHig),
        (" 女 ", gir_hig_est, GirHig),
        ("体重 男 ", boy_wei_est, BoyWei),
        (" 女 ", gir_wei_est, GirWei),
        ("短跑 男 ", boy_50m_est, Boy50m),
        (" 女 ", gir_50m_est, Gir50m),
    )
    for label, estimate, column in table:
        print(label, round(estimate[0], 2), " ", round(estimate[1], 2),
              " ", round(np.mean(column), 2), " ", round(np.var(column), 2))
# Posterior mean of a normal mean with known data variance and a normal prior.
def _bayes_posterior_mean(sample_sum, sample_size, data_var, prior):
    prior_mean = prior[0]
    prior_var = prior[1]
    # Shared denominator N * prior_var + data_var, computed once.
    denom = sample_size * prior_var + data_var
    return prior_var * sample_sum / denom + data_var * prior_mean / denom


# Bayesian estimate of the group means (variance known, mean estimated)
# BoySouceData: boys data set
# GirSouceData: girls data set
# BoyInitPrama: normal prior of the boys' mean  [mean, variance]
# GirInitPrama: normal prior of the girls' mean [mean, variance]
# Proport:      fraction of each data set that is drawn at random
def BayesEstim_Mean(BoySouceData, GirSouceData, Proport, BoyInitPrama, GirInitPrama):
    """Return the Bayesian estimates of the boys' and girls' means.

    ``round(Proport * len(data))`` distinct positions are drawn from each data
    set with ``random.sample``.  The data variance is treated as known and is
    taken from the COMPLETE data set via ``np.var``; only the sum of the drawn
    values enters the estimate.

    Returns:
        ``[boy_mean_estimate, girl_mean_estimate]``.
    """
    boy_size = round(Proport * len(BoySouceData))
    girl_size = round(Proport * len(GirSouceData))

    # Both draws happen up front, boys first.
    boy_picks = random.sample(range(len(BoySouceData)), boy_size)
    girl_picks = random.sample(range(len(GirSouceData)), girl_size)

    # Only the sums are needed; the sampled values themselves are not kept.
    boy_sum = sum((BoySouceData[pos] for pos in boy_picks), 0.0)
    girl_sum = sum((GirSouceData[pos] for pos in girl_picks), 0.0)

    boy_var = np.var(BoySouceData)   # variance of the full set, assumed known
    girl_var = np.var(GirSouceData)  # variance of the full set, assumed known

    return [
        _bayes_posterior_mean(boy_sum, boy_size, boy_var, BoyInitPrama),
        _bayes_posterior_mean(girl_sum, girl_size, girl_var, GirInitPrama),
    ]
# Print the Bayesian mean estimates
def BayesEstim_MeanShow(sheet, Proport, BoyHighPrama, GirHighPrama, BoyWeigPrama, GirWeigPrama):
    """Print Bayesian mean estimates of height and weight next to the data means.

    Args:
        sheet: worksheet handed to ``ReadInCol``.
        Proport: sampling fraction forwarded to ``BayesEstim_Mean``.
        BoyHighPrama, GirHighPrama: priors for the height means.
        BoyWeigPrama, GirWeigPrama: priors for the weight means.
    """
    # Load the columns
    boy_height, girl_height = ReadInCol(sheet, 3, 737)  # cm
    boy_weight, girl_weight = ReadInCol(sheet, 4, 737)  # kg

    # Estimates come back as [boys, girls]
    height_est = BayesEstim_Mean(boy_height, girl_height, Proport, BoyHighPrama, GirHighPrama)
    weight_est = BayesEstim_Mean(boy_weight, girl_weight, Proport, BoyWeigPrama, GirWeigPrama)

    print("\n 贝叶斯估计均值 实际均值")
    report = (
        ("身高 男: ", height_est[0], boy_height),
        (" 女: ", height_est[1], girl_height),
        ("体重 男: ", weight_est[0], boy_weight),
        (" 女: ", weight_est[1], girl_weight),
    )
    for label, estimate, column in report:
        print(label, round(estimate, 2), " ", round(np.mean(column), 2))
# Covariance matrix computation
def CovEle(Data1, Data2, Mean1, Mean2):
    """Return the 2x2 covariance matrix of two paired series.

    Every product of deviations is scaled by ``1 / (len(Data1) - 1)`` before
    it is accumulated.  ``Data2`` is indexed with the positions of ``Data1``.

    Args:
        Data1, Data2: paired observations.
        Mean1, Mean2: the means to centre ``Data1`` / ``Data2`` with.

    Returns:
        ``np.array([[var1, cov], [cov, var2]])``.

    Raises:
        ZeroDivisionError: when ``Data1`` holds exactly one element.
    """
    scale = 1.0 / (len(Data1) - 1)
    var_first = 0
    var_second = 0
    cross = 0
    for pos, first_value in enumerate(Data1):
        dev_first = first_value - Mean1
        dev_second = Data2[pos] - Mean2
        var_first = var_first + scale * dev_first * dev_first
        cross = cross + scale * dev_first * dev_second
        var_second = var_second + scale * dev_second * dev_second
    # The matrix is symmetric, so the cross term fills both off-diagonal cells.
    return np.array([[var_first, cross], [cross, var_second]])
# Print the classification result
def DiscriminantFunc_TwoClass(New_x, sheet):
    """Classify one (height, weight) sample as boy or girl and print the verdict.

    The class means, covariance matrices and class proportions are computed
    from the height (feature 3) and weight (feature 4) columns of ``sheet``.
    The printed decision is "boy" when the discriminant value is negative and
    "girl" otherwise.

    Args:
        New_x: 2x1 column (height; weight) of the sample to classify.
        sheet: worksheet handed to ``ReadInCol``.
    """
    boy_height, girl_height = ReadInCol(sheet, 3, 728)  # cm
    boy_weight, girl_weight = ReadInCol(sheet, 4, 728)  # kg
    boy_total = len(boy_height)
    girl_total = len(girl_height)
    everyone = boy_total + girl_total

    # Per-class feature means
    boy_h_mean = np.mean(boy_height)
    girl_h_mean = np.mean(girl_height)
    boy_w_mean = np.mean(boy_weight)
    girl_w_mean = np.mean(girl_weight)

    # Mean vectors as 2x1 columns
    boy_mu = np.array([[boy_h_mean], [boy_w_mean]])
    girl_mu = np.array([[girl_h_mean], [girl_w_mean]])

    # Covariance matrices and their inverses
    boy_cov = CovEle(boy_height, boy_weight, boy_h_mean, boy_w_mean)
    girl_cov = CovEle(girl_height, girl_weight, girl_h_mean, girl_w_mean)

    # ln(|Cov1| / |Cov2|)
    log_det_ratio = math.log(np.linalg.det(boy_cov) / np.linalg.det(girl_cov), math.e)
    # ln(P(w1) / P(w2))
    log_prior_ratio = math.log((boy_total / everyone) / (girl_total / everyone), math.e)

    boy_cov_inv = np.linalg.inv(boy_cov)
    girl_cov_inv = np.linalg.inv(girl_cov)

    # Quadratic forms (x - mu)^T Cov^-1 (x - mu) for each class
    boy_offset = New_x - boy_mu
    girl_offset = New_x - girl_mu
    boy_quad = 0.5 * np.dot(np.dot(np.transpose(boy_offset), boy_cov_inv), boy_offset)
    girl_quad = 0.5 * np.dot(np.dot(np.transpose(girl_offset), girl_cov_inv), girl_offset)
    Func = boy_quad - girl_quad + 0.5 * log_det_ratio - log_prior_ratio

    # Func is 1x1; its determinant is the scalar discriminant value
    is_boy = np.linalg.det(Func) < 0
    prefix = "\n对于样本点" if is_boy else "对于样本点"
    verdict = " 判断性别为->男" if is_boy else " 判断性别为->女"
    print(prefix, np.transpose(New_x), Func, verdict)
def DecisionPlaneSolve(Cass1, Class2, DataNum, sheet):
    """Solve the two-class decision boundary symbolically and plot the data.

    The discriminant is built with sympy from the class means, covariance
    matrices and class proportions of two features and solved for the
    symbols ``h`` and ``w``.  A boundary curve and both classes' points are
    drawn on the current matplotlib axes; the caller shows the figure.

    Args:
        Cass1: feature number of the first feature (the caller passes height).
        Class2: feature number of the second feature (the caller passes weight).
        DataNum: exclusive upper row bound handed to ``ReadInCol``.
        sheet: worksheet handed to ``ReadInCol``.

    Returns:
        The result of ``sympy.solve(Func, [h, w])``.
    """
    BoyHig, GirHig = ReadInCol(sheet, Cass1, DataNum)   # first feature (height, cm)
    BoyWei, GirWei = ReadInCol(sheet, Class2, DataNum)  # second feature (weight, kg)
    N = len(BoyHig) + len(GirHig)

    # Per-class feature means
    BoyHigMean = np.mean(BoyHig)  # X11
    GirHigMean = np.mean(GirHig)  # X21
    BoyWeiMean = np.mean(BoyWei)  # X12
    GirWeiMean = np.mean(GirWei)  # X22

    # Mean vectors
    BoyClassMean = sympy.Matrix([[BoyHigMean], [BoyWeiMean]])  # Mean1
    GirClassMean = sympy.Matrix([[GirHigMean], [GirWeiMean]])  # Mean2

    # Covariance matrices.  .tolist() yields plain Python floats; the former
    # float(arr[[0], [0]]) converted a one-element array to a scalar, which
    # recent NumPy releases deprecate.
    BoyCov = sympy.Matrix(CovEle(BoyHig, BoyWei, BoyHigMean, BoyWeiMean).tolist())
    GirCov = sympy.Matrix(CovEle(GirHig, GirWei, GirHigMean, GirWeiMean).tolist())

    # ln(|Cov1| / |Cov2|)
    ln_Value = sympy.ln(BoyCov.det() / GirCov.det())
    # ln(P(w1) / P(w2))
    ln_P_w = sympy.ln((len(BoyHig) / N) / (len(GirHig) / N))

    # Discriminant as a symbolic expression in h (height) and w (weight);
    # .det() extracts the scalar from each 1x1 quadratic form.
    h, w = sympy.symbols('h w')
    sample = sympy.Matrix([[h], [w]])
    Func = (0.5 * (sample - BoyClassMean).T * (BoyCov ** (-1)) * (sample - BoyClassMean)).det() \
        - (0.5 * (sample - GirClassMean).T * (GirCov ** (-1)) * (sample - GirClassMean)).det() \
        + 0.5 * ln_Value - ln_P_w
    SolveFunc = sympy.solve(Func, [h, w])

    # Decision boundary.
    # NOTE(review): the coefficients below are literals, not derived from
    # SolveFunc - presumably one solved branch for a particular data set;
    # confirm and regenerate them when the data changes.
    Y_Weight = np.linspace(40, 70, 200)
    X_Hight = 0.247822699671129 * Y_Weight + 79.3792911179188 * np.sqrt(
        -0.000932386643406855 * Y_Weight ** 2 + 0.083558110408116 * Y_Weight - 1) + 82.8949394272113
    plt.plot(Y_Weight, X_Hight)

    # Scatter of both classes: weight on x, height on y
    plt.scatter(BoyWei, BoyHig, color='yellowgreen')
    plt.scatter(GirWei, GirHig, color='pink')
    # Fix: the axis labels were swapped relative to the plotted data.
    plt.xlabel('Weight')
    plt.ylabel('Hight')
    plt.title('Height-weight characteristic distribution map')
    return SolveFunc
####################################### main #############################################
# Open the workbook that holds the data
Tain_set = openpyxl.load_workbook(os.path.abspath('data.xlsx'))
# Training data sheet
Tain_sheet = Tain_set["Sheet1"]

# (1) Histogram of the boys' and girls' 50 m sprint results
Meter50_HistShow(Tain_sheet)

# (2) Maximum-likelihood estimates for height, weight and 50 m results
MLE_ProportShow(Tain_sheet, 0.5)

# (3) Bayesian estimate of the height and weight means (variance known).
# Each prior says the mean to be estimated follows a normal distribution:
# element 0 = mean, element 1 = variance.
BoyHigMeanEsti_Aver_Vari = [170, 15]  # prior, guessed from experience
GirHigMeanEsti_Aver_Vari = [160, 10]
BoyWeiMeanEsti_Aver_Vari = [60, 10]
GirWeiMeanEsti_Aver_Vari = [42, 5]
BayesEstim_MeanShow(
    Tain_sheet,
    0.5,
    BoyHigMeanEsti_Aver_Vari,
    GirHigMeanEsti_Aver_Vari,
    BoyWeiMeanEsti_Aver_Vari,
    GirWeiMeanEsti_Aver_Vari,
)

# (4) Minimum-error-rate Bayes decision on height and weight.
# np.asmatrix replaces np.mat, which no longer exists in NumPy 2.0;
# it builds the same np.matrix column from the string.
x = np.asmatrix('178;71')
DiscriminantFunc_TwoClass(x, Tain_sheet)
x = np.array([[170], [52]])
DiscriminantFunc_TwoClass(x, Tain_sheet)

# Show what has been drawn so far, then solve and plot the decision boundary
plt.show()
DecisionPlane = DecisionPlaneSolve(3, 4, 728, Tain_sheet)
print(DecisionPlane)
plt.show()