NLP 之命名实体识别:HMM 方法(1)

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/26 9:40
# @version : python3.6
# @File    : generate_datas.py.py
# @Software: PyCharm
__author__ = 'xiaohu'
# Hidden-state labels, one single-letter label per state.  The position of a
# label in this list is significant: generate_transition_probability() uses it
# to name the count columns of the transition matrix.
hidden_states = list("ABCDFGIJKLMPSWXZ")


def generate_transition_probability():
    '''
    Build the state-transition probability table (matrix A) and save it as text.

    Reads the comma-separated count matrix ``./data/nt.tr.txt``.  The first
    line is skipped as a header.  On every following line the first field is
    the source state and the remaining fields are integer counts, matched by
    position to the labels in ``hidden_states``.  Each count is divided by its
    row total and written to ``./data/transition_probability.txt`` as one
    ``source_state,target_state,probability`` line.

    A row whose counts sum to zero gets probability 0.0 for every target
    state instead of aborting the whole run with ``ZeroDivisionError``.

    :return: None
    :raises IndexError: if a row has more count columns than ``hidden_states``
    :raises ValueError: if a count field is not an integer
    '''
    rows = []
    with open('./data/nt.tr.txt', mode='r') as in_file:
        next(in_file, None)  # skip the header line (no-op on an empty file)
        for raw_line in in_file:
            fields = raw_line.strip().split(',')
            source_state = fields[0]
            # Parse the counts once; they are needed for the total and per cell.
            counts = [int(value) for value in fields[1:]]
            row_total = sum(counts)
            for column, count in enumerate(counts):
                target_state = hidden_states[column]
                # A source state with no recorded transitions has no
                # meaningful distribution; emit 0.0 rather than divide by zero.
                probability = float(count) / row_total if row_total else 0.0
                rows.append((source_state, target_state, probability))
    # Persist the table, one transition per line.
    with open('./data/transition_probability.txt', mode='w') as out_file:
        out_file.writelines('%s,%s,%s\n' % row for row in rows)
    print('generate transition_probability.txt')


def generate_initial_vector():
    '''
    Build the initial state probability vector (pi) and save it as text.

    Reads ``./data/nt.txt``.  Each line is split on single spaces: the first
    token is skipped and the remaining tokens are read as alternating
    ``state count`` pairs.  Counts are accumulated per state and written to
    ``./data/initial_vector.txt`` as one ``state,count,probability`` line for
    every label in ``hidden_states``, where probability is that state's count
    divided by the total of all counts.

    Counts are parsed with ``int()`` instead of ``eval()`` so that text coming
    from the data file is never executed as code.  When the total count is
    zero (for example an empty input file) every probability is written as
    0.0 instead of raising ``ZeroDivisionError``.

    :return: None
    :raises KeyError: if a state in the file is not listed in ``hidden_states``
    :raises ValueError: if a count is not an integer or a state has no count
    '''
    state_counts = {state: 0 for state in hidden_states}
    total_count = 0
    with open('./data/nt.txt', mode='r') as in_file:
        for raw_line in in_file:
            # Drop the leading token; what is left alternates state, count.
            tokens = raw_line.strip().split(' ')[1:]
            for start in range(0, len(tokens), 2):
                state, count_text = tokens[start:start + 2]
                count = int(count_text)
                state_counts[state] += count
                total_count += count
    # Persist one line per hidden state, including states that never occurred.
    with open('./data/initial_vector.txt', mode='w') as out_file:
        for state, count in state_counts.items():
            probability = float(count) / total_count if total_count else 0.0
            out_file.write('%s,%s,%s\n' % (state, count, probability))
    print('generate initial_vector.txt')


# 列表存东西很方便,字典对于算法中的表达式处理很方便
def generate_emit_probability():
    '''
    Build the emission (observation) probability table and save it as text.

    Reads ``./data/nt.txt``.  Each line is split on single spaces: the first
    token is the observation and the remaining tokens are alternating
    ``state count`` pairs.  For each pair the probability is the count divided
    by the total frequency of that state as returned by ``get_initial_freq()``
    (which reads ``./data/initial_vector.txt``, so that file must already
    exist).  Results are written to ``./data/emit_probability.txt`` as one
    ``hidden_state,observation,probability`` line per pair.

    Frequencies are converted with ``float()`` instead of ``eval()``: this
    never executes file text as code and also accepts the integer ``0``
    default that ``get_initial_freq()`` leaves for states missing from the
    file.  A state whose total frequency is zero gets probability 0.0 instead
    of raising ``ZeroDivisionError``.

    :return: None
    :raises KeyError: if a state in the file is unknown to ``get_initial_freq()``
    :raises ValueError: if a count is not numeric or a state has no count
    '''
    state_totals = get_initial_freq()
    rows = []
    with open('./data/nt.txt', mode='r') as in_file:
        for raw_line in in_file:
            tokens = raw_line.strip().split(' ')
            observation = tokens[0]
            pairs = tokens[1:]
            for start in range(0, len(pairs), 2):
                state, count_text = pairs[start:start + 2]
                state_total = float(state_totals[state])
                if state_total:
                    probability = float(count_text) / state_total
                else:
                    # No recorded occurrences of this state: avoid dividing by zero.
                    probability = 0.0
                rows.append((state, observation, probability))
    with open('./data/emit_probability.txt', mode='w') as emit_file:
        emit_file.writelines('%s,%s,%s\n' % row for row in rows)
    print('generate: emit_probability.txt')


def get_initial_freq():
    '''
    Load the per-label frequency column from ``./data/initial_vector.txt``.

    Every label in ``hidden_states`` starts with the integer default ``0``.
    Each line of the file is split on commas; the first field is used as the
    label and the second field is stored for it unchanged, i.e. as a string.

    :return: dict; key is the label, value is its frequency (a string when
             read from the file, the integer 0 when the label is not in it)
    '''
    frequencies = dict.fromkeys(hidden_states, 0)
    with open('./data/initial_vector.txt', mode='r') as vector_file:
        for record in vector_file:
            fields = record.strip().split(',')
            label = fields[0]
            frequencies[label] = fields[1]
    return frequencies


if __name__ == '__main__':
    # Order matters: generate_emit_probability() reads initial_vector.txt
    # (through get_initial_freq()), which generate_initial_vector() writes.
    pipeline = (
        generate_transition_probability,
        generate_initial_vector,
        generate_emit_probability,
    )
    for step in pipeline:
        step()

你可能感兴趣的:(ner)