pip install hanlp
import hanlp
tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
words = tokenizer(text)
The 'PKU_NAME_MERGED_SIX_MONTHS_CONVSEG' model is downloaded and unpacked automatically the first time it is loaded. Once loaded, the tokenizer is ready to use; it can process several sentences in one batch and supports custom user dictionaries, which are added as follows:
from hanlp.common.trie import Trie

trie = Trie()
trie.update({'自定义': 'custom', '词典': 'dict', '聪明人': 'smart'})

def split_sents(text: str, trie: Trie):
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for word, value, start, end in words:
        if pre_start != start:
            sents.append(text[pre_start: start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words

print(split_sents(text, trie))

def merge_parts(parts, offsets, words):
    items = [(i, p) for (i, p) in zip(offsets, parts)]
    items += [(start, [word]) for (word, value, start, end) in words]
    # In case you need the tag, use the following line instead
    # items += [(start, [(word, value)]) for (word, value, start, end) in words]
    return [each for x in sorted(items) for each in x[1]]

tokenizer = hanlp.pipeline() \
    .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
    .append(tokenizer, input_key='parts', output_key='tokens') \
    .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
print(tokenizer(text))
Adding a user dictionary to HanLP is thus fairly involved; jieba's support is more polished. Loading a user dictionary in jieba:
jieba.load_userdict("userdict.txt")
The dictionary is the same kind of plain-text .txt file as the one used for loading stop words.
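As a minimal sketch (file name and entries are illustrative), each line of the dictionary file is a word, optionally followed by a frequency and a POS tag:

import jieba
# userdict.txt contains lines such as:
#   云计算 5 n
#   自定义词
jieba.load_userdict("userdict.txt")
print(jieba.lcut("云计算是一个自定义词"))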
POS tagging is run on the segmentation result, and again you only need to load a model. HanLP's POS models are split by language (Chinese vs. English) and are built on fastText pretrained embeddings:
tagger_EN = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
tagger_CN = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN_FASTTEXT_ZH)
res = tagger_CN(words)  # the input is the segmentation result, as a list of tokens
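For example, on an explicit token list (the exact tags returned depend on the model, so no output is shown here):

print(tagger_CN(['蜡烛', '两', '头', '烧']))  # one CTB-style tag per input token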
HanLP distinguishes up to 43 part-of-speech tags, somewhat more than pyltp; see the companion book《自然语言处理入门》pp. 225-231 for details. The tag set follows the Peking University Institute of Computational Linguistics standard《现代汉语语料库加工规范——词语切分与词性标注》; the additional tags, such as Ag (adjectival morpheme) and Bg (distinguishing morpheme), are mostly morpheme categories.
NER is likewise just a matter of loading a model, again split by language. These models are pretrained with BERT, so accuracy is comparatively high. Below are two examples from the original author:
recognizer_CN = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
recognizer_EN = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_UNCASED_EN)
recognizer_CN([list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'),
list('萨哈夫说,伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。')])
recognizer_EN(["President", "Obama", "is", "speaking", "at", "the", "White", "House"])
# results:
[[('上海华安工业(集团)公司', 'NT', 0, 12), ('谭旭光', 'NR', 15, 18), ('张晚霞', 'NR', 21, 24), ('美国', 'NS', 26, 28), ('纽约现代艺术博物馆', 'NS', 28, 37)],
[('萨哈夫', 'NR', 0, 3), ('伊拉克', 'NS', 5, 8), ('联合国销毁伊拉克大规模杀伤性武器特别委员会', 'NT', 10, 31)]]
[('Obama', 'PER', 1, 2), ('White House', 'LOC', 6, 8)]
See Chapter 8 of《自然语言处理入门》: in typical NER applications some entities, such as transliterated names and Japanese names, are handled with rules; others, such as Chinese person, place and organization names, rely on role tagging; and the most common approach today is sequence labeling, which also supports custom entity types, as in the widely cited medical-terminology NER project.
Building on the segmentation and POS-tagging results, you can go on to dependency parsing; the required input has the form [(word, tag), …]:
syntactic_parser_EN = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
syntactic_parser_CN = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
syntactic_parser_CN([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')])
# result:
1 蜡烛 _ NN _ _ 4 nsubj _ _
2 两 _ CD _ _ 3 nummod _ _
3 头 _ NN _ _ 4 dep _ _
4 烧 _ VV _ _ 0 root _ _
The output is in CoNLL format; you can download a dependency viewer to inspect the .conll treebank file visually. LTP's dependency parser, by contrast, does not emit the treebank format directly (though you can write it out yourself), and the two tools use somewhat different dependency-relation inventories. HanLP's relations are documented in《自然语言处理入门》pp. 330-331 and are more numerous and fine-grained than LTP's.
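A minimal sketch of saving the output for a dependency viewer, assuming (as the printout above suggests) that the returned sentence renders as CoNLL text via str():

sent = syntactic_parser_CN([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')])
with open("candle.conll", "w", encoding="utf-8") as f:
    f.write(str(sent) + "\n")  # open candle.conll in a dependency viewer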
Semantic dependency parsing also has separate Chinese and English models. The input is the same as for dependency parsing, [(word, tag), …], whereas HIT's LTP needs three inputs for the corresponding task, so it is usually analyzed together with the dependency parse above:
semantic_parser_EN = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)
semantic_parser_CN = hanlp.load(hanlp.pretrained.sdp.SEMEVAL16_NEWS_BIAFFINE_ZH)
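The call mirrors the dependency parser above, taking the same (word, tag) pairs; the sample result below corresponds to this input:

semantic_parser_CN([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')])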
# sample result:
1 蜡烛 _ NN _ _ 3 Poss _ _
1 蜡烛 _ NN _ _ 4 Pat _ _
2 两 _ CD _ _ 3 Quan _ _
3 头 _ NN _ _ 4 Loc _ _
4 烧 _ VV _ _ 0 Root _ _
Compared with LTP's semantic labels, HanLP's are again finer-grained and more varied.
HanLP can also assist with text classification; the core is the Naive Bayes algorithm, and a prepackaged classifier can be called directly:
from pyhanlp import SafeJClass
NaiveBayesClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
A full example based on the MSR corpus: https://github.com/NLP-LOVE/Introduction-NLP/blob/master/code/ch11/text_classification.py
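A minimal usage sketch following that example (the corpus path is a placeholder; train/classify are the methods used in the linked script, so check them against your pyhanlp version):

from pyhanlp import SafeJClass
NaiveBayesClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
classifier = NaiveBayesClassifier()
classifier.train('/path/to/corpus')                 # placeholder: one sub-folder per category, each holding text files
print(classifier.classify('这场比赛的比分太出人意料了'))  # returns the predicted category name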
A from-scratch implementation of the basic Naive Bayes idea (code I wrote earlier while studying《统计学习方法》):
import pandas as pd
from collections import Counter
from numpy import *
import jieba
def get_data():
    sentence = pd.read_excel("data/learn_naive_bayes_data_ch.xlsx", sheet_name="Sheet1", usecols=[0])
    tag = pd.read_excel("data/learn_naive_bayes_data_ch.xlsx", sheet_name="Sheet1", usecols=[1])
    sentence_list = []
    tag_list = []
    for i in range(len(tag)):
        cut_sentence = str(sentence.loc[i].tolist()[0]).replace(",", "")
        cut_list = jieba.lcut(cut_sentence, cut_all=True)
        sentence_list.append(cut_list)
        tag_list.append(tag.loc[i].tolist()[0])
    global tag_total_list
    global tag_num
    tag_total_list = list(set(tag_list))
    tag_num = len(tag_total_list)
    return sentence_list, tag_list

def get_vocab(sentence_list):
    vocab_list = []
    for i in range(len(sentence_list)):
        vocab_list += sentence_list[i]
    vocab_list = list(set(vocab_list))
    return vocab_list

def get_sentence_vec(sentence_list, vocab_list):
    sentence_vec_list = []
    for i in range(len(sentence_list)):
        sentence_vec_list.append([])
        for word in vocab_list:
            if word in sentence_list[i]:
                sentence_vec_list[i].append(1)
            else:
                sentence_vec_list[i].append(0)
    return sentence_vec_list

def get_input_vec(input_sentence, vocab_list):
    input_sentence = input_sentence.replace(",", "")
    input_sentence_list = jieba.lcut(input_sentence, cut_all=True)
    input_sentence_vec = []
    for word in vocab_list:
        if word in input_sentence_list:
            input_sentence_vec.append(1)
        else:
            input_sentence_vec.append(0)
    return input_sentence_vec

def train_NBclassfier(sentence_vec_list, tag_list):
    P_total = []
    P_num = []
    P_deno = []
    for i in range(tag_num):
        P_total_val = float(Counter(tag_list)[tag_total_list[i]] / len(tag_list))  # prior probability of class i
        P_total.append(P_total_val)
        P_num.append(ones(len(sentence_vec_list[0])))
        P_deno.append(2.0)
    for i in range(len(tag_list)):
        for j in range(tag_num):
            if tag_list[i] == tag_total_list[j]:
                P_num[j] += sentence_vec_list[i]  # sum of all word vectors belonging to class j
                P_deno[j] += sum(sentence_vec_list[i])  # total word count of class j
    P_vec = []
    P_dict = {}
    for i in range(tag_num):
        P_vec.append(log(P_num[i] / P_deno[i]))  # log-probability vector of each word given class i
        P_dict.update({tag_total_list[i]: {}})
        P_dict[tag_total_list[i]].update({"vec": P_vec[i], "total": P_total[i]})
    return P_dict

def NB_classfier(input_sentence_vec, P_dict):
    P = []
    for i in range(tag_num):
        P_ans = sum(input_sentence_vec * P_dict[tag_total_list[i]]["vec"]) + log(P_dict[tag_total_list[i]]["total"])
        P.append([P_ans, tag_total_list[i]])
    P = sorted(P, key=lambda x: x[0], reverse=True)
    sum_p = 0.0  # softmax over the log scores
    for i in range(len(P)):
        sum_p += exp(P[i][0])
    for i in range(len(P)):
        P[i][0] = float(exp(P[i][0]) / sum_p)
    return P

def test_NBclassfier():
    sentence_list, tag_list = get_data()  # all training sentences and their class labels
    vocab_list = get_vocab(sentence_list)  # vocabulary of every word seen in the training set
    sentence_vec_list = get_sentence_vec(sentence_list, vocab_list)  # bag-of-words vector of each sentence
    P_dict = train_NBclassfier(sentence_vec_list, tag_list)
    with open("NB_vocab_list.txt", "a", encoding="utf-8") as f:
        for i in range(len(vocab_list)):
            f.write(str(vocab_list[i]) + "\n")
    with open("NB_car_prob.txt", "a", encoding="utf-8") as f:
        f.write(str(P_dict))
    while True:
        print("Enter a sentence to classify:")
        input_sentence = input()
        input_sentence_vec = get_input_vec(input_sentence, vocab_list)
        result = NB_classfier(input_sentence_vec, P_dict)
        tag = result[0][1]
        for i in range(len(result)):
            print("probability of {tag}: {score}".format(tag=result[i][1], score=result[i][0]))
        print("the sentence is classified as {}!!\n".format(tag))

if __name__ == "__main__":
    test_NBclassfier()
For Naive Bayes classification, sklearn's classic implementation is also widely used; a minimal sketch follows.
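The documents and labels below are made-up placeholders; Chinese text is pre-segmented with jieba and space-joined so that CountVectorizer can split it:

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

docs = ["球队 昨晚 赢得 比赛", "股市 今天 大幅 下跌", "教练 称赞 球员 表现", "央行 宣布 降息"]
labels = ["sports", "finance", "sports", "finance"]
vec = CountVectorizer()
clf = MultinomialNB().fit(vec.fit_transform(docs), labels)
test = " ".join(jieba.lcut("明天还有一场重要比赛"))
print(clf.predict(vec.transform([test])))  # predicted label for the new sentence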
HanLP's sentiment analysis is also a Naive Bayes classifier underneath, with labels defined as five sentiment levels (1-5), so it is not covered further here.
HanLP also implements text clustering with the K-means algorithm at its core; roughly speaking, it repeatedly recomputes cluster centroids to group the texts. Again there is a prepackaged class to call:
from pyhanlp import SafeJClass
ClusterAnalyzer = SafeJClass('com.hankcs.hanlp.mining.cluster.ClusterAnalyzer')
An example, likewise based on the MSR corpus: https://www.cnblogs.com/mantch/p/12307135.html
sklearn also ships a K-means implementation, which is used more often in practice:
from sklearn.cluster import KMeans
Word clustering can be run directly on the words, but document clustering usually first vectorizes the texts with tf-idf-style weighting and then clusters the vectors, as sketched below.
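A minimal sketch of that vectorize-then-cluster pipeline (the documents are pre-segmented placeholders):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

docs = ["球队 昨晚 赢得 比赛", "股市 今天 大幅 下跌", "教练 称赞 球员 表现", "央行 宣布 降息 救市"]
X = TfidfVectorizer().fit_transform(docs)                    # tf-idf weighted document vectors
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
print(km.labels_)                                            # cluster id assigned to each document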
The usual extraction targets are keywords, new words and the like. Keyword extraction is typically done with tf-idf or TextRank; the prepackaged classes include:
TfIdfCounter = JClass('com.hankcs.hanlp.mining.word.TfIdfCounter')
TextRankKeyword = JClass("com.hankcs.hanlp.summary.TextRankKeyword")
# keyword extraction
HanLP.extractKeyword(content, 5)
# phrase extraction
HanLP.extractPhrase(text, 5)
# automatic summarization
TextRankSentence = JClass("com.hankcs.hanlp.summary.TextRankSentence")
sentence_list = HanLP.extractSummary(document, 3)
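Pulled together into a minimal runnable sketch (the text is a placeholder; HanLP here is pyhanlp's gateway object):

from pyhanlp import HanLP
text = "自然语言处理是计算机科学与人工智能的重要方向,它研究人与计算机之间用自然语言进行有效通信的理论和方法。"
print(HanLP.extractKeyword(text, 5))   # keyword extraction
print(HanLP.extractPhrase(text, 5))    # phrase extraction
print(HanLP.extractSummary(text, 3))   # automatic summarization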
Implementations of the tf-idf and TextRank principles (code I wrote early on while learning):
## TF-idf keyword extraction
import jieba
import jieba.analyse
import re
from collections import Counter
from numpy import *
def load_stop_words():
    global stopwords
    with open("data/stop_word.txt", "r", encoding="utf-8") as f:
        stopwords = f.readlines()
    for i in range(len(stopwords)):
        stopwords[i] = stopwords[i].replace("\n", "")

def jieba_tf_idf_extract_keyword(corpus_list):
    # strip punctuation and digits from each document before extraction
    corpus_list = [re.sub(r'[0-9]', '', re.sub(r'[^\w\s]', '', c)) for c in corpus_list]
    keyword = []
    for i in range(len(corpus_list)):
        keyword.append([])
        keywords = jieba.analyse.extract_tags(corpus_list[i], topK=20, withWeight=True, allowPOS=('n', 'nr', 'ns'))
        keyword[i].append(keywords)
    return keyword

class tf_idf():
    @classmethod
    def get_vocab_list(cls, corpus_list):
        vocab_list = []
        vocab_list_uniq = []
        for corpus in corpus_list:
            corpus = re.sub(r'[^\w\s]', '', corpus)
            corpus = re.sub(r'[0-9]', '', corpus)
            vocab_list.append(jieba.lcut(corpus, cut_all=False))
        load_stop_words()
        new_vocab_list = []
        # drop stop words and single characters (POS filtering, e.g. removing verbs, should be added here)
        for i in range(len(vocab_list)):
            new_vocab_list.append([])
            for j in range(len(vocab_list[i])):
                if vocab_list[i][j] not in stopwords and len(vocab_list[i][j]) != 1:
                    new_vocab_list[i].append(vocab_list[i][j])
        for i in range(len(new_vocab_list)):
            vocab_list_uniq += new_vocab_list[i]
        vocab_list_uniq = list(set(vocab_list_uniq))
        return new_vocab_list, vocab_list_uniq

    # tf: frequency of each word within one document
    @classmethod
    def calculate_tf(cls, vocab_list, vocab_list_uniq):
        tf_dic = {}
        for i in range(len(vocab_list)):
            tf_dic.update({i: {}})
            for word in vocab_list_uniq:
                tf_of_word = Counter(vocab_list[i])[word] / len(vocab_list[i])
                tf_dic[i].update({word: tf_of_word})
        return tf_dic

    # idf: log(total number of documents / number of documents containing the word), smoothed with log and +1 in the denominator
    @classmethod
    def calculate_idf(cls, vocab_list, vocab_list_uniq):
        idf_dic = {}
        for word in vocab_list_uniq:
            word_count = 0
            for i in range(len(vocab_list)):
                if word in vocab_list[i]:
                    word_count += 1
            idf_of_word = log(len(vocab_list) / (word_count + 1))
            idf_dic.update({word: idf_of_word})
        return idf_dic

    @classmethod
    def calculate_tf_idf_val(cls, tf_dic, idf_dic):
        tf_idf_dic = {}
        for i in range(len(tf_dic)):
            tf_idf_dic.update({i: {}})
            for word in tf_dic[i].keys():
                tf_idf_of_word = tf_dic[i][word] * idf_dic[word]
                tf_idf_dic[i].update({word: tf_idf_of_word})
        for i in range(len(tf_idf_dic)):
            tf_idf_dic[i] = sorted(tf_idf_dic[i].items(), key=lambda x: x[1], reverse=True)
        return tf_idf_dic

    @classmethod
    def extract_keyword(cls, corpus_list, **const):
        key_num = const["keyword_num"]
        vocab_list, vocab_list_uniq = cls.get_vocab_list(corpus_list)
        tf_dic = cls.calculate_tf(vocab_list, vocab_list_uniq)
        idf_dic = cls.calculate_idf(vocab_list, vocab_list_uniq)
        tf_idf_dic = cls.calculate_tf_idf_val(tf_dic, idf_dic)
        jieba_tf_idf_keyword = jieba_tf_idf_extract_keyword(corpus_list)
        for i in range(len(tf_idf_dic)):
            result = "News {num}:\nKeywords from the hand-written tf-idf:\t".format(num=i + 1)
            for j in range(key_num):
                result += "{} ".format(tf_idf_dic[i][j][0])
            print(result)
            result_jieba1 = "Keywords from jieba's built-in tf-idf:\t"
            for j in range(key_num):
                result_jieba1 += "{} ".format(jieba_tf_idf_keyword[i][0][j][0])
            print(result_jieba1 + "\n")

if __name__ == "__main__":
    with open("data/news_data.txt", "r", encoding="utf-8") as f:
        news_list = f.readlines()
    for i in range(len(news_list)):
        news_list[i] = news_list[i].replace("\n", "")
    tf_idf.extract_keyword(news_list, keyword_num=5)
# TextRank automatic summarization
import re
import jieba
import numpy as np
import jieba.analyse
from numpy import *
from collections import Counter
def load_stop_words():
    global stopwords
    with open("data/stop_word.txt", "r", encoding="utf-8") as f:
        stopwords = f.readlines()
    for i in range(len(stopwords)):
        stopwords[i] = stopwords[i].replace("\n", "")

def cosine_similarity(sentence1, sentence2):
    sen1_vocab_list = jieba.lcut(sentence1, cut_all=False)
    sen2_vocab_list = jieba.lcut(sentence2, cut_all=False)
    vocab_list = list(set(sen1_vocab_list + sen2_vocab_list))
    sen1_vec = np.zeros(len(vocab_list))
    sen2_vec = np.zeros(len(vocab_list))
    for i in range(len(vocab_list)):
        sen1_vec[i] += Counter(sen1_vocab_list)[vocab_list[i]]
        sen2_vec[i] += Counter(sen2_vocab_list)[vocab_list[i]]
    cos_sim = float(np.sum(sen1_vec * sen2_vec)) / (np.linalg.norm(sen1_vec) * np.linalg.norm(sen2_vec))
    return cos_sim

def log_similarity(sentence1, sentence2):
    sen1_vocab_list = jieba.lcut(sentence1, cut_all=False)
    sen2_vocab_list = jieba.lcut(sentence2, cut_all=False)
    if len(sen1_vocab_list) == 1 and len(sen2_vocab_list) == 1:
        return 0.0
    count = 0
    for word in sen1_vocab_list:
        if word in sen2_vocab_list:
            count += 1
    log_sim = count / (log(len(sen1_vocab_list)) + log(len(sen2_vocab_list)))
    return log_sim

class GenerateAbstract():
    @classmethod
    def get_corpus_sentence_list(cls, corpus_list):
        punch = r',|/|;|\'|`|<|>|\?|:|\{|\}|\~|!|@|#|\$|%|\^|&|=|\_|\+|,|。|;|【|】|!| |…'
        sentence_list = []
        for i in range(len(corpus_list)):
            sentence_list.append([])
            sentence_list[i] = re.split(punch, corpus_list[i])
            if "" in sentence_list[i]:
                sentence_list[i].remove("")
        return sentence_list

    @classmethod
    def get_abstract(cls, corpus_sentence_list, **const):
        cossim_range = const["sim_range"]
        iters = const["iters"]
        method = const["sim_method"]
        page = 1
        for sentence_list in corpus_sentence_list:
            abstract_num = const["abstract_num"]
            l = len(sentence_list)
            if l < abstract_num:
                abstract_num = l
            sen_mat = np.zeros(l * l).reshape(l, l)
            for i in range(len(sentence_list)):
                for j in range(len(sentence_list)):
                    if i != j:
                        if method == "log":
                            cos_sim = log_similarity(sentence_list[i], sentence_list[j])
                        elif method == "cos":
                            cos_sim = cosine_similarity(sentence_list[i], sentence_list[j])
                        if cos_sim > cossim_range:  # connect the two sentences only if their similarity exceeds the threshold
                            sen_mat[i][j] += cos_sim
            PR_mat = np.array(ones(l)).reshape(l, 1)
            for i in range(iters):  # power iteration with damping factor 0.85
                PR_mat = 0.15 + 0.85 * sen_mat.dot(PR_mat)
            res_mat = PR_mat
            res_dic = {}
            for i in range(len(res_mat)):
                res_dic.update({sentence_list[i]: float(res_mat[i][0])})
            res_dic = sorted(res_dic.items(), key=lambda x: x[1], reverse=True)  # a higher PR value means a more central sentence
            abstract_list = []
            abstract_str = ""
            news_str = ""
            for i in range(abstract_num):
                abstract_list.append(res_dic[i][0])
            for sentence in sentence_list:
                if sentence in abstract_list:
                    abstract_list.remove(sentence)
                    abstract_str += sentence + "。"
            for i in range(l):
                if i < l - 1:
                    news_str += sentence_list[i] + ","
                else:
                    news_str += sentence_list[i] + "。"
            print("News {num} (original {len_sen} sentences, abstract {abs_num} sentences):\nOriginal:\n{news}\nAbstract:\n{abstract}\n".
                  format(num=page, abstract=abstract_str, abs_num=abstract_num, len_sen=len(sentence_list), news=news_str))
            page += 1

if __name__ == "__main__":
    with open("data/news_data.txt", "r", encoding="utf-8") as f:
        news_list = f.readlines()
    for i in range(len(news_list)):
        news_list[i] = news_list[i].replace("\n", "")
    corpus_sentence_list = GenerateAbstract.get_corpus_sentence_list(news_list)
    GenerateAbstract.get_abstract(corpus_sentence_list, sim_range=0.2, iters=700, abstract_num=8, sim_method="cos")
Both HIT's LTP and HanLP cover the basic NLP tasks: word segmentation, POS tagging, named entity recognition, syntactic parsing and semantic analysis. On the finer points LTP is less precise than HanLP, for example in the completeness of its POS tag set and the richness of its syntactic analysis. For the more advanced tasks, some of LTP's functions only play a supporting role (segmentation, POS tagging and so on); in the from-scratch implementations above, the jieba parts could be replaced by LTP, or equally by HanLP. HanLP, however, ships mature, well-packaged libraries for several of these advanced tasks, which makes it much more convenient in practice.