import numpy as np
"""
函数说明:创建实验样本
Parameters:无
Returns:
postingList - 实验样本切分的词条
classList - 类别标签
Modify:
2019-03-23
"""
def loadDataSet():
    """Build the toy training set and echo it to stdout.

    Each record pairs a symptom and an occupation with the diagnosis that
    was recorded for that patient.

    Parameters:
        none
    Returns:
        postingList - list of [symptom, occupation] token lists, one per sample
        classList   - diagnosis label of each sample, in the same order
    """
    # (symptom, occupation, diagnosis) -- one tuple per training sample
    records = (
        ("打喷嚏", "护士", "感冒"),
        ("打喷嚏", "农夫", "过敏"),
        ("头痛", "建筑工人", "脑震荡"),
        ("头痛", "建筑工人", "感冒"),
        ("打喷嚏", "教师", "感冒"),
        ("头痛", "教师", "脑震荡"),
    )
    postingList = [[symptom, job] for symptom, job, _ in records]
    classList = [diagnosis for _, _, diagnosis in records]
    print("词条集合:\n", np.array(postingList))
    print("标签集合:\n", classList)
    return postingList, classList
"""
函数说明:生成“词汇表”
Parameters:
postingList - 实验样本划分的词条
classList - 标签
Returns:
vocabulary - 词汇表
labels - 标签表
Modify:
2019-03-23
"""
def createVocabulary(postingList, classList):
    """Collect the distinct tokens and the distinct labels, in first-seen order.

    Both results are printed to stdout before being returned.

    Parameters:
        postingList - iterable of token lists, one per sample
        classList   - iterable of class labels, one per sample
    Returns:
        vocabulary - list of unique tokens, ordered by first appearance
        labels     - list of unique labels, ordered by first appearance

    Tokens and labels must be hashable (strings in this script).
    """
    # dict.fromkeys drops duplicates while keeping insertion order, which
    # avoids rescanning a growing list for every token.
    vocabulary = list(dict.fromkeys(
        word for words in postingList for word in words
    ))
    print("字典:\n", vocabulary)
    labels = list(dict.fromkeys(classList))
    print("标签表:\n", labels)
    return vocabulary, labels
"""
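Function: vectorize the tokenized samples and their labels
Parameters:
postingList - tokenized samples (one word list per sample)
classList - label of each sample
vocabulary - vocabulary list
labels - list of distinct labels
Returns:
postingVecSet - one 0/1 vector per sample, aligned with vocabulary
classVec - index of each sample's label within labels
Modify: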
"""
def vector(postingList, classList, vocabulary, labels):
    """Turn token lists into 0/1 vectors and labels into integer indices.

    Both results are printed to stdout before being returned.

    Parameters:
        postingList - iterable of token lists, one per sample
        classList   - iterable of class labels, one per sample
        vocabulary  - list of known tokens; its order defines the columns
        labels      - list of known labels; its order defines the indices
    Returns:
        postingVecSet - list of lists; entry j of a row is 1 when
                        vocabulary[j] occurs in that sample, otherwise 0
        classVec      - list with labels.index(label) for every sample
    Raises:
        ValueError - a label in classList is not present in labels

    Tokens that are not in the vocabulary are ignored. Tokens must be hashable.
    """
    # Build the word -> column lookup once instead of scanning the vocabulary
    # twice per token. setdefault keeps the first position of a repeated word,
    # which is what list.index would have returned.
    column_of = {}
    for column, word in enumerate(vocabulary):
        column_of.setdefault(word, column)

    width = len(vocabulary)
    postingVecSet = []
    for words in postingList:
        postingVec = [0] * width
        for word in words:
            column = column_of.get(word)
            if column is not None:
                postingVec[column] = 1
        postingVecSet.append(postingVec)

    # list.index raises ValueError for a label that is missing from `labels`.
    classVec = [labels.index(label) for label in classList]
    print("向量化词条;\n", np.array(postingVecSet))
    print("向量化标签:\n", classVec)
    return postingVecSet, classVec
"""
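Function: train the naive Bayes classifier
Parameters:
postingVecSet - vectorized samples (one 0/1 vector per sample)
classVec - vectorized labels (0 = 感冒 cold, 1 = 过敏 allergy, any other value = 脑震荡 concussion)
Returns (in this order):
PA - P(cold), the share of samples labelled 0
PB - P(allergy), the share of samples labelled 1
PC - P(concussion), the share of the remaining samples
P0Vector - log P(word | cold) for every vocabulary word, from smoothed counts
P1Vector - log P(word | allergy) for every vocabulary word, from smoothed counts
P2Vector - log P(word | concussion) for every vocabulary word, from smoothed counts
Modify:
2019-03-23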
"""
def train(postingVecSet, classVec):
    """Estimate class priors and smoothed per-word log-likelihoods.

    The priors and the three log-likelihood vectors are printed to stdout.

    Parameters:
        postingVecSet - sequence of 0/1 sample vectors, all of the same length
        classVec      - class index of each sample: 0, 1, or any other value
                        (every other value is counted as the third class)
    Returns (in this order):
        PA, PB, PC - fraction of samples in class 0, class 1 and the third class
        P0Vector, P1Vector, P2Vector - numpy arrays holding, per vocabulary
            column, log((1 + column total within the class) /
                        (2.0 + sum of all entries within the class))
    Raises:
        ValueError - postingVecSet or classVec is empty
    """
    if len(classVec) == 0 or len(postingVecSet) == 0:
        # Fail with a clear message instead of an incidental
        # ZeroDivisionError / IndexError further down.
        raise ValueError("train() requires at least one training sample")

    def slot(label):
        # Same routing as an if/elif/else chain: 0 -> 0, 1 -> 1, rest -> 2.
        if label == 0:
            return 0
        if label == 1:
            return 1
        return 2

    # Class priors: per-class sample count divided by the number of samples.
    sample_count = float(len(classVec))
    class_counts = [0, 0, 0]
    for label in classVec:
        class_counts[slot(label)] += 1
    PA, PB, PC = [count / sample_count for count in class_counts]
    print("感冒概率:", PA, "过敏概率:", PB, "脑震荡概率:", PC)

    # Smoothed counts: numerators start at 1 and denominators at 2.0 so that
    # a word never seen with a class does not produce log(0).
    width = len(postingVecSet[0])
    numerators = [np.ones(width) for _ in range(3)]
    denominators = [2.0, 2.0, 2.0]
    for i, label in enumerate(classVec):
        k = slot(label)
        numerators[k] += postingVecSet[i]
        denominators[k] += sum(postingVecSet[i])

    # Logs are stored so that classification can add terms instead of
    # multiplying small probabilities.
    P0Vector, P1Vector, P2Vector = [
        np.log(num / den) for num, den in zip(numerators, denominators)
    ]
    print("P0Vector:", P0Vector)
    print("P1Vector:", P1Vector)
    print("P2Vector:", P2Vector)
    return PA, PB, PC, P0Vector, P1Vector, P2Vector
"""
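Function: classify one sample with the trained model
Parameters:
test - sample to classify, as a 0/1 vector over the vocabulary
P0Vector - per-word log-likelihoods for 感冒 (cold), as returned by train
P1Vector - per-word log-likelihoods for 过敏 (allergy), as returned by train
P2Vector - per-word log-likelihoods for 脑震荡 (concussion), as returned by train
PA - P(cold)
PB - P(allergy)
PC - P(concussion)
Returns:
the largest of the three class scores (sum of the selected log-likelihoods plus the log prior)
NOTE: the class index (0 = cold, 1 = allergy, 2 = concussion) is NOT returned;
the winning disease is only printed
Modify:
2019-03-23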
"""
def classification(test, P0Vector, P1Vector, P2Vector, PA, PB, PC):
    """Score one sample against the three classes and report the winner.

    Each score is the sum of the class's log-likelihood entries selected by
    `test`, plus the log of the class prior. The three scores and the name
    of the best class are printed to stdout.

    Parameters:
        test     - numeric vector over the vocabulary (1 where a word is present)
        P0Vector - log-likelihood vector for the first class (感冒)
        P1Vector - log-likelihood vector for the second class (过敏)
        P2Vector - log-likelihood vector for the third class (脑震荡)
        PA, PB, PC - priors of the three classes, in the same order
    Returns:
        the highest of the three scores (a log value, not a class index)
    """
    cold_score = sum(P0Vector * test) + np.log(PA)
    allergy_score = sum(P1Vector * test) + np.log(PB)
    concussion_score = sum(P2Vector * test) + np.log(PC)
    print("感冒的概率:", cold_score, "过敏的概率:", allergy_score, "脑震荡的概率:", concussion_score)

    best = max(cold_score, allergy_score, concussion_score)
    # Ties resolve in class order: 感冒 first, then 过敏, then 脑震荡.
    if best == cold_score:
        verdict = "最可能的疾病:感冒"
    elif best == allergy_score:
        verdict = "最可能的疾病:过敏"
    else:
        verdict = "最可能的疾病:脑震荡"
    print(verdict)
    return best
if __name__ == '__main__':
    # Build the data set, derive vocabulary and label tables, vectorize, train.
    postingList, classList = loadDataSet()
    vocabulary, labels = createVocabulary(postingList, classList)
    postingVecSet, classVec = vector(postingList, classList, vocabulary, labels)
    PA, PB, PC, P0Vector, P1Vector, P2Vector = train(postingVecSet, classVec)

    # Classify every symptom/occupation combination.
    symptoms = ['打喷嚏', '头痛']
    occupations = ['护士', '农夫', '建筑工人', '教师']
    for symptom in symptoms:
        for occupation in occupations:
            # 0/1 vector with the symptom and occupation columns switched on
            test = np.zeros(len(vocabulary))
            for token in (symptom, occupation):
                test[vocabulary.index(token)] = 1
            print("症状:", symptom, "职业:", occupation)
            classification(test, P0Vector, P1Vector, P2Vector, PA, PB, PC)
            # NOTE(review): the source lost its indentation; the blank
            # separator is assumed to follow every case -- confirm.
            print("\n")
运行结果:
D:\PyCharm\Projects\MachineLearning\venv\Scripts\python.exe D:/PyCharm/Projects/MachineLearning/BeiYesi_YiYuan.py
词条集合:
[['打喷嚏' '护士']
['打喷嚏' '农夫']
['头痛' '建筑工人']
['头痛' '建筑工人']
['打喷嚏' '教师']
['头痛' '教师']]
标签集合:
['感冒', '过敏', '脑震荡', '感冒', '感冒', '脑震荡']
字典:
['打喷嚏', '护士', '农夫', '头痛', '建筑工人', '教师']
标签表:
['感冒', '过敏', '脑震荡']
向量化词条;
[[1 1 0 0 0 0]
[1 0 1 0 0 0]
[0 0 0 1 1 0]
[0 0 0 1 1 0]
[1 0 0 0 0 1]
[0 0 0 1 0 1]]
向量化标签:
[0, 1, 2, 0, 0, 2]
感冒概率: 0.5 过敏概率: 0.16666666666666666 脑震荡概率: 0.3333333333333333
P0Vector: [-0.98082925 -1.38629436 -2.07944154 -1.38629436 -1.38629436 -1.38629436]
P1Vector: [-0.69314718 -1.38629436 -0.69314718 -1.38629436 -1.38629436 -1.38629436]
P2Vector: [-1.79175947 -1.79175947 -1.79175947 -0.69314718 -1.09861229 -1.09861229]
症状: 打喷嚏 职业: 护士
感冒的概率: -3.0602707946915624 过敏的概率: -3.8712010109078907 脑震荡的概率: -4.68213122712422
最可能的疾病:感冒
症状: 打喷嚏 职业: 农夫
感冒的概率: -3.7534179752515073 过敏的概率: -3.1780538303479453 脑震荡的概率: -4.68213122712422
最可能的疾病:过敏
症状: 打喷嚏 职业: 建筑工人
感冒的概率: -3.0602707946915624 过敏的概率: -3.8712010109078907 脑震荡的概率: -3.9889840465642745
最可能的疾病:感冒
症状: 打喷嚏 职业: 教师
感冒的概率: -3.0602707946915624 过敏的概率: -3.8712010109078907 脑震荡的概率: -3.9889840465642745
最可能的疾病:感冒
症状: 头痛 职业: 护士
感冒的概率: -3.4657359027997265 过敏的概率: -4.564348191467836 脑震荡的概率: -3.58351893845611
最可能的疾病:感冒
症状: 头痛 职业: 农夫
感冒的概率: -4.1588830833596715 过敏的概率: -3.8712010109078907 脑震荡的概率: -3.58351893845611
最可能的疾病:脑震荡
症状: 头痛 职业: 建筑工人
感冒的概率: -3.4657359027997265 过敏的概率: -4.564348191467836 脑震荡的概率: -2.8903717578961645
最可能的疾病:脑震荡
症状: 头痛 职业: 教师
感冒的概率: -3.4657359027997265 过敏的概率: -4.564348191467836 脑震荡的概率: -2.8903717578961645
最可能的疾病:脑震荡
Process finished with exit code 0