Natural Language Processing Lab Code

The code for Lab 2 is taken from Zhihu; the other labs' code is adapted from the textbook.
Lab 1: Rule-Based Word Segmentation

from pyhanlp import *

def load_dictionary():
    IOUtil = JClass('com.hankcs.hanlp.corpus.io.IOUtil')
    path = HanLP.Config.CoreDictionaryPath.replace('.txt', '.mini.txt')  # use the mini core dictionary for speed
    dic = IOUtil.loadDictionary([path])
    return set(dic.keySet())

def fully_segment(text, dic):
    word_list = []
    for i in range(len(text)):
        for j in range(i + 1, len(text) + 1):  # enumerate every non-empty substring text[i:j]
            word = text[i:j]
            if word in dic:
                word_list.append(word)
    return word_list

def forward_segment(text, dic):
    # Forward longest matching: scan left to right, taking the longest
    # dictionary word that starts at each position.
    word_list = []
    i = 0
    while i < len(text):
        longest_word = text[i]  # fall back to a single character
        for j in range(i+1, len(text) + 1):
            word = text[i:j]
            if word in dic:
                if len(word) > len(longest_word):
                    longest_word = word
        word_list.append(longest_word)
        i += len(longest_word)
    
    return word_list

def backward_segment(text, dic):
    # Backward longest matching: scan right to left, taking the longest
    # dictionary word that ends at each position.
    word_list = []
    i = len(text) - 1
    while i >= 0:
        longest_word = text[i]  # fall back to a single character
        for j in range(0, i):
            word = text[j: i+1]
            if word in dic:
                if len(word) > len(longest_word):
                    longest_word = word
                    # j grows from 0, so the first match is the longest possible
                    break
        word_list.insert(0, longest_word)
        i -= len(longest_word)
    return word_list

def count_single_char(word_list: list):
    # number of single-character words in the result
    return sum(1 for word in word_list if len(word) == 1)

def bidirectional_segment(text, dic):
    # Run both directions and keep the better result: fewer words wins;
    # on a tie, fewer single-character words wins.
    f = forward_segment(text, dic)
    b = backward_segment(text, dic)
    if len(f) < len(b):
        return f
    elif len(f) > len(b):
        return b
    else:
        if count_single_char(f) < count_single_char(b):
            return f
        else:
            return b
 
dic = load_dictionary()
sentences = ['项目的研究', '商品和服务', '研究生命起源', '当下雨天地面积水', '结婚的和尚未结婚的', '欢迎新老师生前来就餐']

for sentence in sentences:
    print('Forward longest matching:       ' + str(forward_segment(sentence, dic)))
    print('Backward longest matching:      ' + str(backward_segment(sentence, dic)))
    print('Bidirectional longest matching: ' + str(bidirectional_segment(sentence, dic)))
    print('---------------------------------------------')
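
As a quick sanity check that needs no dictionary download, here is a minimal sketch with a hypothetical toy dictionary (toy_dic and the outputs in the comments are my own illustration, not part of the original lab):

toy_dic = {'研究', '研究生', '生命', '起源'}
print(forward_segment('研究生命起源', toy_dic))        # ['研究生', '命', '起源']
print(backward_segment('研究生命起源', toy_dic))       # ['研究', '生命', '起源']
print(bidirectional_segment('研究生命起源', toy_dic))  # ['研究', '生命', '起源']

Forward matching greedily takes 研究生 and is then forced into the single character 命, while backward matching recovers 研究/生命/起源; the bidirectional heuristic picks the backward result because it contains fewer single-character words.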


Lab 2: Estimating the HMM Parameters π, A, and B

import numpy as np

def train(fileName):

    # An HMM is determined by three components: lambda = (A, B, pi)
    # A:  state transition matrix
    # B:  observation (emission) probability matrix
    # pi: initial state probability vector

    # This function estimates the three parameters from training data that
    # contains observation sequences (sentences) together with the
    # corresponding state sequence (the tag of every character).

    # Chinese word segmentation uses the following four states (tags):
    # B: beginning of a word (its first character)
    # M: middle of a word (between the first and last character)
    # E: end of a word (its last character)
    # S: a single-character word

    # Map each state to an index, so we can locate it in the matrices
    status2num = {'B': 0, 'M': 1, 'E': 2, 'S': 3}

    # State transition matrix: 4 states, so 4x4
    A = np.zeros((4, 4))

    # Observation probability matrix.
    # ord() of a BMP character is below 65536 and there are 4 states,
    # so B is 4x65536: each state (tag) emitting each observation (character).
    B = np.zeros((4, 65536))

    # Initial state vector: a sentence can only start in one of the 4 states
    PI = np.zeros(4)

    with open(fileName, encoding='utf-8') as file:

        for line in file:
            wordStatus = []  # the state sequence of every word on this line
            words = line.strip().split()  # strip the ends, then split on whitespace into words

            for i, word in enumerate(words):

                # derive the state string from the word's length
                if len(word) == 1:
                    status = 'S'  # a single-character word
                    code = ord(word)
                    B[status2num[status[0]]][code] += 1

                else:
                    # A word of length 2 gets zero copies of 'M', so every
                    # length is handled uniformly; e.g. a 4-character word
                    # gets 'B' + 2 * 'M' + 'E' == 'BMME'
                    status = 'B' + (len(word) - 2) * 'M' + 'E'
                    for s in range(len(word)):
                        code = ord(word[s])
                        B[status2num[status[s]]][code] += 1

                # i == 0 means the start of a sentence: count its state in PI
                if i == 0:
                    PI[status2num[status[0]]] += 1

                # extend adds the characters of status one by one, whereas
                # append would add the whole string as a single element
                wordStatus.extend(status)

            for i in range(1, len(wordStatus)):
                # count every adjacent pair of states; status2num maps each
                # state to its row/column in A
                A[status2num[wordStatus[i-1]]][status2num[wordStatus[i]]] += 1

    # Compute the PI vector (normalize counts, then move to log space)
    total = sum(PI)
    for i in range(len(PI)):
        if PI[i] == 0:
            # a huge negative number stands in for log(0)
            PI[i] = -3.14e+100
        else:
            # don't forget to take the logarithm
            PI[i] = np.log(PI[i] / total)

    # Compute the A matrix (normalize each row)
    for i in range(len(A)):
        total = sum(A[i])
        for j in range(len(A[i])):
            if A[i][j] == 0:
                A[i][j] = -3.14e+100
            else:
                A[i][j] = np.log(A[i][j] / total)

    # Compute the B matrix (normalize each row)
    for i in range(len(B)):
        total = sum(B[i])
        for j in range(len(B[i])):
            if B[i][j] == 0:
                B[i][j] = -3.14e+100
            else:
                B[i][j] = np.log(B[i][j] / total)

    # return the three parameters
    return (PI, A, B)

PI, A, B = train('D:/自然语言/HMMTrainSet.txt')

print('Shape of the initial vector π:    ' + str(PI.shape))
print('Shape of the transition matrix A: ' + str(A.shape))
print('Shape of the emission matrix B:   ' + str(B.shape))
print('----------------------------------')
print('π:')
print(PI)
print('A:')
print(str(A))
print('B is too large to display')
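
The lab stops at printing the parameters. To show what they are for, here is a minimal Viterbi decoding sketch (my own addition, not part of the original lab; viterbi_segment is a hypothetical helper) that segments a sentence with the trained log-space parameters:

def viterbi_segment(text, PI, A, B):
    # delta[t][s]: best log-probability of a state path ending in state s at position t
    # psi[t][s]:   the predecessor state on that best path
    n = len(text)
    delta = np.zeros((n, 4))
    psi = np.zeros((n, 4), dtype=int)
    delta[0] = PI + B[:, ord(text[0])]
    for t in range(1, n):
        for s in range(4):
            scores = delta[t - 1] + A[:, s]
            psi[t][s] = np.argmax(scores)
            delta[t][s] = np.max(scores) + B[s][ord(text[t])]
    # backtrack the best state sequence
    states = [int(np.argmax(delta[n - 1]))]
    for t in range(n - 1, 0, -1):
        states.insert(0, int(psi[t][states[0]]))
    # cut the sentence after every E (index 2) and S (index 3)
    words, start = [], 0
    for t, s in enumerate(states):
        if s in (2, 3):
            words.append(text[start:t + 1])
            start = t + 1
    if start < n:
        words.append(text[start:])
    return words

print(viterbi_segment('研究生命起源', PI, A, B))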


Lab 3: HMM Segmentation with the Viterbi Algorithm

from pyhanlp import *
import os
import zipfile
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH
CWSEvaluator = SafeJClass('com.hankcs.hanlp.seg.common.CWSEvaluator')

def test_data_path():
    """
    获取测试数据路径,位于$root/data/test,根目录由配置文件指定。
    :return:
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path

def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path
    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path

sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
msr_train = os.path.join(sighan05, 'training', 'msr_training.utf8')
msr_model = os.path.join(test_data_path(), 'msr_cws')
msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
msr_output = os.path.join(sighan05, 'testing', 'msr_bigram_output.txt')
msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')
HMMSegmenter = JClass('com.hankcs.hanlp.model.hmm.HMMSegmenter')



def train(corpus, model):
    segmenter = HMMSegmenter(model)
    segmenter.train(corpus)
    print(segmenter.segment('商品和货币'))
    return segmenter.toSegment()


def evaluate(segment):
    result = CWSEvaluator.evaluate(segment, msr_test, msr_output, msr_gold, msr_dict)
    print(result)


if __name__ == '__main__':
    segment = train(msr_train, FirstOrderHiddenMarkovModel())
    evaluate(segment)
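
For comparison, HanLP also ships a second-order HMM that plugs into the same train/evaluate pipeline; a short sketch, assuming the class below is available in your HanLP version:

SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')

if __name__ == '__main__':
    # train and score the second-order model with the same pipeline
    segment = train(msr_train, SecondOrderHiddenMarkovModel())
    evaluate(segment)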
    


Lab 4: K-means Text Clustering

from pyhanlp import *

ClusterAnalyzer = JClass('com.hankcs.hanlp.mining.cluster.ClusterAnalyzer')

analyzer = ClusterAnalyzer()
analyzer.addDocument("赵一", "流行, 流行, 流行, 流行, 流行, 流行, 流行, 流行, 流行, 流行, 蓝调, 蓝调, 蓝调, 蓝调, 蓝调, 蓝调, 摇滚, 摇滚, 摇滚, 摇滚")
analyzer.addDocument("钱二", "爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲")
analyzer.addDocument("张三", "古典, 古典, 古典, 古典, 民谣, 民谣, 民谣, 民谣")
analyzer.addDocument("李四", "爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 金属, 金属, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲")
analyzer.addDocument("王五", "流行, 流行, 流行, 流行, 摇滚, 摇滚, 摇滚, 嘻哈, 嘻哈, 嘻哈")
analyzer.addDocument("马六", "古典, 古典, 古典, 古典, 古典, 古典, 古典, 古典, 摇滚")
print(analyzer.kmeans(3))
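
Besides kmeans, ClusterAnalyzer also implements repeated bisection clustering, which HanLP's documentation describes as faster with comparable quality; a one-line sketch, assuming your pyhanlp version ships this method:

print(analyzer.repeatedBisection(3))  # repeated bisection into 3 clusters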

