下面举例说明:有一组招聘的信息,特征为“是否985”、“学历”、“技能”,分类为“录用”、“不录用”。
示例代码:
# Training data for the hiring example. Column layout of every row:
#   [0] graduated from a "985" university (0 = no, 1 = yes)
#   [1] degree                            (1 = bachelor, 2 = master, 3 = PhD)
#   [2] skill                             (1 = C++, 2 = Java)
#   [3] class label: hired                (0 = no, 1 = yes)
samples = [
    [1, 1, 1, 0],
    [1, 1, 2, 1],
    [0, 2, 2, 1],
    [0, 2, 1, 0],
    [1, 1, 2, 1],
    [0, 2, 1, 0],
    [1, 2, 2, 1],
    [1, 3, 1, 1],
    [0, 3, 2, 1],
    [0, 1, 2, 0],
]

# Statistics
# Number of feature columns; the class label is the column right after them.
feature_cnt = len(samples[0]) - 1
# Per-class sample counts, keyed 'c<label>'.
c_statics = {}
# Joint (feature value, class) counts, keyed 'f<index>_<value>_c<label>'.
f_c_statics = {}
def static():
    """Count class frequencies and per-class feature-value frequencies.

    Walks the module-level ``samples`` and fills two module-level tables:

    * ``c_statics['c<label>']`` -- number of samples with that class label;
    * ``f_c_statics['f<i>_<value>_c<label>']`` -- number of samples of that
      class whose feature ``i`` equals ``value``.

    Both tables are emptied first, so calling this function again recomputes
    the counts instead of adding to the previous ones. The resulting tables
    are printed to stdout.
    """
    # Clear in place (rather than rebind) so the module-level names keep
    # pointing at the same dict objects.
    c_statics.clear()
    f_c_statics.clear()

    for sample in samples:
        # The class label sits right after the feature columns.
        c_value = sample[feature_cnt]
        c_key = 'c{c_value}'.format(c_value=c_value)
        c_statics[c_key] = c_statics.get(c_key, 0) + 1

        for f_idx, f_value in enumerate(sample[:feature_cnt]):
            f_c_key = 'f{f_idx}_{f_value}_c{c_value}'.format(
                f_idx=f_idx, f_value=f_value, c_value=c_value)
            f_c_statics[f_c_key] = f_c_statics.get(f_c_key, 0) + 1

    print(c_statics)
    print(f_c_statics)
# Compute P(xi | C=cj)
def calc_f_c_probability(f_idx, f_value, c_value):
    """Estimate P(feature ``f_idx`` == ``f_value`` | class == ``c_value``).

    The estimate is the relative frequency
    ``f_c_statics['f<f_idx>_<f_value>_c<c_value>'] / c_statics['c<c_value>']``.

    Returns 0 when either the class or this feature value within the class
    has no entry in the count tables.

    NOTE(review): no smoothing is applied, so a combination missing from the
    tables yields exactly 0; additive (Laplace) smoothing is the usual remedy
    if that is not wanted.
    """
    class_key = 'c{c_value}'.format(c_value=c_value)
    joint_key = 'f{f_idx}_{f_value}_c{c_value}'.format(
        f_idx=f_idx, f_value=f_value, c_value=c_value)

    try:
        class_total = c_statics[class_key]
        joint_total = f_c_statics[joint_key]
    except KeyError:
        # Unknown class, or feature value never observed for this class.
        return 0

    return joint_total / class_total
# Compute the naive-Bayes score for P(C=cj | X)
def calc_c_x_probability(f1_value, f2_value, f3_value, c_value):
    """Return the naive-Bayes score of class ``c_value`` for one sample.

    The value is ``P(C) * P(x1|C) * P(x2|C) * P(x3|C)``, where ``P(C)`` is
    the class count divided by ``len(samples)`` and each conditional comes
    from ``calc_f_c_probability``. The product is not divided by ``P(X)``,
    so it is proportional to the posterior rather than equal to it; only
    the comparison between classes is meaningful.

    Returns 0 when ``c_value`` has no entry in ``c_statics``.
    """
    try:
        class_total = c_statics['c{c_value}'.format(c_value=c_value)]
    except KeyError:
        return 0

    # Start from the prior, then fold in one conditional per feature,
    # in feature-index order.
    score = class_total / len(samples)
    for f_idx, f_value in enumerate((f1_value, f2_value, f3_value)):
        score = score * calc_f_c_probability(f_idx, f_value, c_value)
    return score
if __name__ == "__main__":
    # The count tables must be filled before any probability is evaluated.
    static()

    # Query sample: 985 graduate (1), master's degree (2), skill C++ (1).
    is_985, degree, skill = 1, 2, 1

    # Score the sample against class 1 ("hired") and class 0 ("not hired").
    p_c1 = calc_c_x_probability(is_985, degree, skill, 1)
    p_c2 = calc_c_x_probability(is_985, degree, skill, 0)

    print(p_c1)
    print(p_c2)
运行结果(static() 还会先打印两个统计字典,此处从略;程序实际只输出数值,下面的“p_c1 =”、“p_c2 =”是为便于阅读而加的标注):
p_c1 = 0.022222222222222216
p_c2 = 0.037500000000000006
这里的 p_c1、p_c2 是未归一化的后验概率 P(C)·∏P(xi|C),只需比较大小即可。由于 p_c2 > p_c1,因此判断样本('985', '硕士', 'C++')的分类为c2(标签 0)-“不录用”。