pyltp in Practice

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 29 09:36:34 2018

@author: Robert

https://pyltp.readthedocs.io/zh_CN/latest/api.html#id2

Notes and prerequisites
1. Chinese input text must be UTF-8 encoded.
2. Have the ltp_data_v3.4.0 model package ready.
"""

# Load/read your own Chinese corpus

import os
input_file = r"D:/nltk_data/mycorpora/三毛/撒哈拉的故事.txt"
input_file2 = r"D:/MoYan/蛙.txt"
LTP_DATA_DIR = r"D:/NLTK_data/mycorpora/ltp_data_v3.4.0"  # path to the LTP model directory
seg_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word segmentation model; file name is `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model; file name is `pos.model`
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model; file name is `parser.model`
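Loading fails with a fairly opaque error when a model file is missing, so a quick existence check on the paths above can save some head-scratching. A minimal sketch, reusing the variables just defined:

# Optional sanity check: make sure each model file actually exists before loading.
for p in (seg_model_path, pos_model_path, par_model_path):
    if not os.path.exists(p):
        raise FileNotFoundError("LTP model not found: " + p)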


# Sentence splitting
from pyltp import SentenceSplitter
sents = SentenceSplitter.split("元芳你怎么看?我就趴窗口上看呗!")  # split a string
print("\n".join(sents))

raw = open(input_file, encoding = "utf-8").read()  # split a whole file
sents = SentenceSplitter.split(raw)
print("\n".join(sents))

raw = open(input_file2, encoding = "utf-8").read()  # split a whole file
sents = SentenceSplitter.split(raw)
print("\n".join(sents))

out = open(r"D:/MyResult.txt", "w", encoding = "utf-8")  # write the result to a file
out.write("\n".join(sents))
out.close()
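On raw novel text, blank lines can show up as empty strings in the split result (an assumption worth checking on your own corpus); if so, they are easy to filter out:

sents = [s for s in SentenceSplitter.split(raw) if s.strip()]  # keep only non-empty sentences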


# Word segmentation
from pyltp import Segmentor
segmentor = Segmentor()  # create an instance
segmentor.load(seg_model_path)  # load the model
words = segmentor.segment("元芳你怎么看")  # segment a string
print("\n".join(words))

raw = open(input_file, encoding = 'utf-8').read()
words = segmentor.segment(raw)  # segment a whole file
print("\n".join(words))
out = open(r"D:/MyResult.txt", "w", encoding = "utf-8")  # write the result to a file
out.write(" ".join(words))
out.close()

segmentor.release()  # release the model
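If the segmenter keeps splitting character names or domain terms, Segmentor also supports an external user dictionary via load_with_lexicon (documented at the API link above); the dictionary is a plain-text file, one entry per line. A minimal sketch, where D:/my_lexicon.txt is a hypothetical path to such a file:

segmentor = Segmentor()  # a fresh instance
segmentor.load_with_lexicon(seg_model_path, r"D:/my_lexicon.txt")  # second argument: path to your external dictionary (hypothetical)
print("\t".join(segmentor.segment("元芳你怎么看")))
segmentor.release()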


# POS tagging
from pyltp import Segmentor
from pyltp import Postagger
segmentor = Segmentor()  # create an instance
postagger = Postagger()  # create an instance
segmentor.load(seg_model_path)  # load the segmentation model
postagger.load(pos_model_path)  # load the POS model
words = ['元芳', '你', '怎么', '看']  # a pre-segmented word list
postags = postagger.postag(words)  # POS tagging
print("\t".join(postags))

raw = open(input_file2, encoding = 'utf-8').read()
words = segmentor.segment(raw)  # segment a whole file
postags = postagger.postag(words)  # POS tagging
result_list = [word + "/" + pos for word, pos in zip(words, postags)]  # pair each word with its tag

out = open(r"D:/MyResult.txt", "w", encoding = "utf-8")  # write the result to a file
out.write(" ".join(result_list))
out.close()
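Once every word is paired with a tag, the same zipped result supports quick corpus statistics; for example, tag frequencies with collections.Counter (a small usage sketch built on the variables above):

from collections import Counter
tag_freq = Counter(postags)  # how often each POS tag occurs in the text
print(tag_freq.most_common(10))  # the ten most frequent tags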

segmentor.release()  # release the model
postagger.release()  # release the model


# Dependency parsing

from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import Parser
segmentor = Segmentor()  # create an instance
postagger = Postagger()  # create an instance
parser = Parser()  # create an instance
segmentor.load(seg_model_path)  # load the segmentation model
postagger.load(pos_model_path)  # load the POS model
parser.load(par_model_path)  # load the parsing model
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
arcs = parser.parse(words, postags)  # dependency parsing
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
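arc.head is a 1-based index into words, with 0 reserved for the virtual root, so the raw "head:relation" pairs can be made readable like this (a short sketch over the toy sentence above):

for word, arc in zip(words, arcs):
    head_word = "ROOT" if arc.head == 0 else words[arc.head - 1]  # 0 means the virtual root
    print("%s --%s--> %s" % (word, arc.relation, head_word))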

raw = open(input_file2, encoding = 'utf-8').read()
sents = SentenceSplitter.split(raw)  # must split into sentences first, or the parser fails
out_lines = []
for sent in sents:
    words = segmentor.segment(sent)  # segment one sentence
    postags = postagger.postag(words)  # POS tagging
    arcs = parser.parse(words, postags)  # dependency parsing
    out_lines.append(sent)
    out_lines.append("\t".join(words))
    out_lines.append("\t".join(postags))
    out_lines.append("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    out_lines.append("")  # blank line between sentences

out = open(r"D:/MyResult.txt", "w", encoding = "utf-8")  # write the results to a file
out.write("\n".join(out_lines))
out.close()
segmentor.release()  # release the model
postagger.release()  # release the model
parser.release()  # release the model
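One caveat with this load/release style: if anything raises mid-loop, release() never runs and the underlying C++ models stay in memory. A defensive pattern (a sketch, not the only option) is to wrap the work in try/finally:

segmentor = Segmentor()
segmentor.load(seg_model_path)
try:
    print("\t".join(segmentor.segment("元芳你怎么看")))
finally:
    segmentor.release()  # released even if segment() raises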
