import pickle
import codecs
# The four tags getList() assigns to the characters of a word:
# 'B' first character, 'M' inner character, 'E' last character,
# 'S' a word that is a single character.
state_list = list('BMES')
state_M = len(state_list)  # 4; not referenced elsewhere in the visible code
word_N = 0                 # not referenced elsewhere in the visible code

# Count tables filled by main() and normalised in place by probs().
A_dic = dict()    # A_dic[state][next_state]: transition counts
B_dic = dict()    # B_dic[state][utf-8 encoded character]: emission counts
Pi_dic = dict()   # Pi_dic[state]: how often a word starts in that state

# Denominators used by probs() when it turns the counts into ratios.
Pi_dic_size = 0.0             # divisor for every Pi_dic entry
A_row_count_dic = dict()      # per state: transitions leaving that state
B_dic_element_size = dict()   # per encoded character: occurrences seen

# File names. "PROB_SATRT" is a misspelling of "PROB_START"; the name is left
# untouched because other code refers to it.
INPUT_DATA = "RenMinData.txt"
PROB_SATRT = "prob_start.py"
PROB_EMIT = "prob_emit.py"
PROB_TRANS = "prob_trans.py"
def init():
    """Reset every module-level training accumulator to its zero state.

    For each tag in ``state_list`` this (re)creates:

    * ``A_dic[tag]``  - a row mapping every tag to ``0.0``
    * ``Pi_dic[tag]`` - ``0.0``
    * ``B_dic[tag]``  - an empty dict
    * ``A_row_count_dic[tag]`` - ``0``

    It also empties ``B_dic_element_size`` and sets ``Pi_dic_size`` back to
    ``0.0``. Those two are the denominators probs() divides by; leaving them
    untouched would mix totals from an earlier run with freshly zeroed counts.

    The dict tables are mutated in place, so references held elsewhere stay
    valid.
    """
    global Pi_dic_size  # rebound below; the dicts are only mutated in place

    for state in state_list:
        # 0.0 is immutable, so sharing it between keys via fromkeys is safe.
        A_dic[state] = dict.fromkeys(state_list, 0.0)
        Pi_dic[state] = 0.0
        B_dic[state] = {}
        A_row_count_dic[state] = 0

    B_dic_element_size.clear()
    Pi_dic_size = 0.0
def getList(input_str):
    """Return the list of position tags for the characters of one word.

    A word of length 1 is tagged ``['S']``. Any other length gives ``'B'``,
    then ``len(input_str) - 2`` copies of ``'M'``, then ``'E'``.

    An empty string therefore yields ``['B', 'E']``: the repeat count is
    negative, which produces no ``'M'`` entries.
    """
    length = len(input_str)
    if length == 1:
        return ['S']
    return ['B'] + ['M'] * (length - 2) + ['E']
def main(train_file_path):
    """Count HMM statistics from a space-segmented corpus, then call probs().

    The UTF-8 file at ``train_file_path`` is read line by line. Each line is
    split on single spaces into words, every word is tagged with getList(),
    and the module-level tables are updated:

    * ``Pi_dic[tag]`` / ``Pi_dic_size`` - tag of each word's first character
      and the number of words counted.
    * ``B_dic[tag][char]`` / ``B_dic_element_size[char]`` - occurrences of a
      character under a tag and overall; ``char`` is the UTF-8 encoded form.
    * ``A_dic[tag][next_tag]`` / ``A_row_count_dic[tag]`` - transitions
      between neighbouring characters of the same word.

    Parameters:
        train_file_path: path of the training corpus. Earlier revisions
            ignored this argument and always opened "copy.txt".
    """
    global Pi_dic_size  # rebound by "+=" below; the dict tables are mutated in place

    init()

    # "utf-8-sig" drops a leading byte-order mark when there is one. The
    # previous code discarded the first character unconditionally (presumably
    # to skip a BOM), which lost real data for files without one.
    with codecs.open(train_file_path, "rb", "utf-8-sig") as train_file:
        for line in train_file:
            # Remove the line terminator and skip empty tokens (from repeated
            # or trailing spaces) so neither is counted as a word/character.
            words = [w for w in line.rstrip("\r\n").split(" ") if w]
            if not words:
                continue

            line_state = [getList(w) for w in words]
            print(line_state)

            for word, states in zip(words, line_state):
                Pi_dic[states[0]] += 1
                # One start observation per word keeps sum(Pi_dic) equal to
                # Pi_dic_size, so the normalised vector sums to 1.
                Pi_dic_size += 1

                last = len(states) - 1
                for j, char in enumerate(word):
                    state = states[j]
                    # Encode once and use the same key for lookup and store;
                    # testing with the unicode char against encoded keys never
                    # matched for non-ASCII text and reset the count to 1.
                    key = char.encode('utf-8')
                    B_dic[state][key] = B_dic[state].get(key, 0.0) + 1
                    B_dic_element_size[key] = B_dic_element_size.get(key, 0) + 1

                    # NOTE(review): transitions are only counted inside a
                    # word, so the 'E' and 'S' rows of A_dic stay at zero.
                    # Confirm whether cross-word transitions are wanted.
                    if j < last:
                        A_dic[state][states[j + 1]] += 1
                        A_row_count_dic[state] += 1

    print(B_dic_element_size)
    probs()
def probs():
    """Normalise the accumulated counts in place and pickle the three tables.

    * ``Pi_dic[tag]`` is divided by ``Pi_dic_size``.
    * ``A_dic[tag][next_tag]`` is divided by ``A_row_count_dic[tag]``; rows
      whose count is 0 are left as they are.
    * ``B_dic[tag][char]`` is divided by ``B_dic_element_size[char]``.

    Afterwards ``Pi_dic`` is written to ``PROB_SATRT``, ``B_dic`` to
    ``PROB_EMIT`` and ``A_dic`` to ``PROB_TRANS`` (module-level file names).
    Earlier revisions wrote the transition table to the start file and the
    start vector to the transition file.

    The tables are modified in place, so calling this twice without
    re-counting normalises already-normalised values.
    """
    print("-------------------以下Pi向量------------------------")
    # Nothing was counted when the size is 0; skip instead of dividing by zero.
    if Pi_dic_size:
        for state in Pi_dic:
            Pi_dic[state] = Pi_dic[state] / float(Pi_dic_size)
    print(Pi_dic)

    print("-------------------以下是状态转移矩阵------------------------")
    for state in A_dic:
        row_total = A_row_count_dic[state]
        if row_total != 0:
            row = A_dic[state]
            for next_state in row:
                row[next_state] = row[next_state] / float(row_total)
    print(A_dic)

    print("------------------以下是混淆矩阵-----------------------")
    # NOTE(review): the divisor is the character's total count over all tags,
    # so each value is the share of that character's occurrences under the
    # tag, not a per-tag distribution over characters. Confirm that consumers
    # of the emission table expect this.
    for state in B_dic:
        column = B_dic[state]
        for char in column:
            column[char] = column[char] / float(B_dic_element_size[char])

    entries = []
    for state in B_dic:
        for char in B_dic[state]:
            entries.append("%s --> %s %s  " % (state, char, B_dic[state][char]))
    if entries:
        print(" ".join(entries))

    # Open the outputs only once everything is computed, so a failure above
    # does not leave truncated files behind. Binary mode is what pickle
    # expects, and "with" closes each file even if dump() raises.
    with open(PROB_SATRT, 'wb') as start_fp:
        pickle.dump(Pi_dic, start_fp)
    with open(PROB_EMIT, 'wb') as emit_fp:
        pickle.dump(B_dic, emit_fp)
    with open(PROB_TRANS, 'wb') as trans_fp:
        pickle.dump(A_dic, trans_fp)
# Run the training pipeline as soon as this module is executed.
# NOTE(review): there is no ``if __name__ == "__main__":`` guard, so merely
# importing this module also triggers the call below.
main("RenMinData.txt")