Word segmentation with the jiebaR package in R and jieba in Python, plus keyword extraction and an LDA model

Open-source software tends to evolve quickly. The jiebaR segmentation package for R was released at the end of last year; back when I was in school, the jieba package first appeared in Python, and I did not expect it to make its way into R as well. A few days ago I took on a small freelance job using this package to build something for a client, who in the end refused to pay... Still, it was a good chance to get familiar with R's jiebaR package. Overall it is quite powerful: Chinese segmentation is accurate, it can extract keywords, and it is quick to pick up. Here is the code, side by side with Python's jieba package for comparison:


library(jiebaRD)
library(jiebaR)
# Loading jiebaR does not start any segmentation engine; starting one is just a single assignment
x <- "众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter = worker()
cutter <= x                          # segment the string with the <= operator
show_dictpath()                      # show the dictionary path
edit_dict()                          # edit the user dictionary; you can add your own words,
                                     # then rebuild the worker with cutter = worker() for them to take effect
cutter <= "D:\\Users\\xuguoxuan694\\Desktop\\新建文本文档.txt"   # passing a file path segments the whole file
show_dictpath()                      # shows the default dictionary path
segment(code = x, jiebar = cutter)   # same result as cutter <= x
cutter = worker(type = "tag")        # a worker for POS-tagged segmentation
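At this point cutter is a POS-tagging worker: segmenting with it returns a character vector whose names are the part-of-speech tags. A minimal sketch on the sample sentence above (the tags named in the comment are illustrative):

# Quick look at the tagging worker just created: the result is a named character
# vector whose names are POS tags, e.g. n for nouns and v for verbs
cutter <= x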
# Load the required packages, installing any that are missing
load_or_install <- function(pkg) {
  tryCatch(library(pkg, character.only = TRUE),
           error = function(e) {
             install.packages(pkg)
             tryCatch(library(pkg, character.only = TRUE),
                      error = function(e) print(paste("Please reinstall the", pkg, "package")))
           })
}
for (pkg in c("jiebaR", "ggplot2", "wordcloud")) load_or_install(pkg)

result <- read.csv("C:\\Users\\Administrator\\Desktop\\质量记录.csv")   # quality-complaint records
head(result)
# edit_dict()
cutter = worker()
x <- "众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter <= x   # quick check that the rebuilt worker segments as expected

# Strip letters/digits and boilerplate phrases such as "客户反应" (customer reported) from the complaint text
result$QUALITYDESC_d <- sapply(result$QUALITYDESC, function(x) gsub("[a-zA-Z0-9]", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, function(x) gsub("客户反应", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, function(x) gsub("客户", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, function(x) gsub("反映", "", x))

# Normalize synonymous phrasings (e.g. 全身没气压 -> 全身气压, 断了 -> 断裂, 响声 -> 异响)
# so that different wordings of the same complaint are counted as one term
clear_stopwords <- function(x) {
  if (grepl("全身没气压", x)) {
    x <- gsub("全身没气压", "全身气压", x)
  }
  if (grepl("全身不充气", x)) {
    x <- gsub("全身不充气", "全身充气", x)
  }
  if (grepl("断了", x)) {
    x <- gsub("断了", "断裂", x)
  }
  if (grepl("响声", x)) {
    x <- gsub("响声", "异响", x)
  }
  x
}
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, clear_stopwords)
# Segment every cleaned record and collect all tokens into one vector
jieba_result <- c()
for (j in result$QUALITYDESC_d) {
  jieba_result <- c(jieba_result, cutter <= j)
}
# Stop words to drop from the frequency table
stopwords1 <- c("不能","不","了","有","在","没","少","一个","都","也","时","来","用","会","上","后","是","腿","走","无","左","大","没有","就","到","右","坏","部","不会","两个")
stopword2 <- c(stopwords1,"加","一","小","个","才","去","能","对","只","还","和","需要","过","倒","的","跟","已","掉","让","可以","停","拨","亮","一下","下")
stopword <- c(stopword2,"其他","下去","时候","使用","问题","正常","部","一边","一直","工作","响","说","好","买","但是","一样","不行","时有","夹")
# Count token frequencies and drop the stop words
jjj_result <- as.data.frame(table(jieba_result))
jj_result <- jjj_result[!jjj_result$jieba_result %in% stopword, ]
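Filtering the frequency table after the fact works, but jiebaR can also drop stop words during segmentation via the stop_word argument of worker(). A hedged sketch, assuming the stop words above are saved one per line in a UTF-8 file (the file path is an assumption):

# Sketch: a worker that removes stop words while segmenting (stop-word file path is illustrative)
cutter_sw <- worker(stop_word = "C:\\Users\\Administrator\\Desktop\\stop_words.txt")
cutter_sw <= result$QUALITYDESC_d[1]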
op <- par(bg = "lightyellow")
# Word cloud of the remaining tokens (words appearing fewer than 4 times are dropped)
wordcloud(jj_result$jieba_result, jj_result$Freq, col = rainbow(length(jj_result$Freq)), scale = c(5,1), min.freq = 4, max.words = Inf, random.order = FALSE)
par(op)
last <- jj_result[order(jj_result$Freq),]
# Bar chart of the 30 most frequent tokens, with the counts printed next to the bars
p <- ggplot(tail(last, 30), aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5)
p + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") + ggtitle("天凰国际按摩椅质检报告词频最高前30名分析") +
  geom_text(label = (tail(last, 30))$Freq, colour = "red", hjust = 0, vjust = 0.5)


# Bar chart of tokens ranked 30 to 60 by frequency
p1 <- ggplot(last[(nrow(last)-60):(nrow(last)-30),], aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5)
p1 + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") + ggtitle("天凰国际按摩椅质检报告词频第30到60名分析") +
  geom_text(label = (last[(nrow(last)-60):(nrow(last)-30),])$Freq, colour = "red", hjust = 0, vjust = 0.5)

# Bar chart of tokens ranked 60 to 90 by frequency
p2 <- ggplot(last[(nrow(last)-90):(nrow(last)-60),], aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5)
p2 + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") + ggtitle("天凰国际按摩椅质检报告词频第60到90名分析") +
  geom_text(label = (last[(nrow(last)-90):(nrow(last)-60),])$Freq, colour = "red", hjust = 0, vjust = 0.5)


cutter_words <- cutter <= "我爱北京天安门"   # segment a sample sentence with the current worker

# Keyword extraction uses inverse document frequency (IDF). The reference corpus can be
# switched to a custom corpus path, and usage is the same as for segmentation.
# The topn argument is the number of keywords to return.
cutter = worker(type = "keywords", topn = 2)
cutter_words <- cutter <= "我爱北京天安门"
cutter_words
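The same keyword worker can be pointed at the quality-record text prepared above. A minimal sketch, reusing the cleaned column result$QUALITYDESC_d from earlier; the topn value and record index are illustrative:

# Sketch: extract the top keywords from the first cleaned quality record
kw <- worker(type = "keywords", topn = 5)
kw <= result$QUALITYDESC_d[1]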
 
  

https://qinwf.shinyapps.io/jiebaR-shiny/    jiebaR online segmentation demo

Appendix: Python code for keyword extraction and the LDA model.

#encoding:utf-8
'''
Created on 2015-10-25

@author: Administrator
'''
import pandas as pd
import re
import jieba
import jieba.posseg as pseg
from gensim import corpora, models, similarities

df = pd.read_csv(u'C:\\Users\\Administrator\\Desktop\\质量记录.csv', encoding='gbk')
# Strip boilerplate phrases and any letters/digits from the complaint text
cont = df['QUALITYDESC'].map(lambda x: re.sub(ur'客户反应|客户|反映', '', x)).map(lambda x: re.sub(r'[a-zA-Z0-9\.]', '', x))
# Load the user-defined dictionary
jieba.load_userdict(u'C:\\Users\\Administrator\\Desktop\\分词.txt')

# Keep only nouns, verbs, and adjectives longer than one character
nwordall = []
for t in cont:
    words = pseg.cut(t)
    nword = []
    for w in words:
        if (w.flag == 'n' or w.flag == 'v' or w.flag == 'a') and len(w.word) > 1:
            nword.append(w.word)
    nwordall.append(nword)
# Build a dictionary from the filtered words (similar to table() in R or Counter in Python)
dictionary = corpora.Dictionary(nwordall)
# print dictionary.token2id
# Build the bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in nwordall]
# TF-IDF weighting
tfidf = models.TfidfModel(corpus)
# print tfidf.dfs
# print tfidf.idfs
corpus_tfidf = tfidf[corpus]

# 4. LDA topic model (can also be used for dimensionality reduction)
# Streaming LDA: process the corpus in chunks of 10000 records and extract 50 topics
lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50,
                               update_every=1, chunksize=10000, passes=1)
# Print the first 20 topics
for i in range(0, 20):
    print lda.print_topics(i)[0]

# LDA on the full corpus in batch mode, extracting 100 topics
# lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=100, update_every=0, passes=20)
# Use the trained model to infer topics for new documents
# doc_lda = lda[corpus_tfidf]

 
# 5. word2vec word embeddings: useful for word similarity, analogies, and word clustering
# sentences = models.word2vec.LineSentence(nwordall)
# size is the embedding dimension, window the context window, min_count drops words with frequency < 5, workers the thread count
model = models.word2vec.Word2Vec(nwordall, size=100, window=5, min_count=5, workers=4)
print model[u'指示灯']      # vector representation of a word
sim = model.most_similar(positive=[u'指示灯', u'灯不亮'])      # most similar words
for s in sim:
    print "word:%s,similar:%s " % (s[0], s[1])






