When a piece of software is open source, it tends to evolve quickly. The jiebaR segmentation package for R was released at the end of last year; I remember that back when I was in school the jieba package first appeared in Python, and I didn't expect it to make its way into R as well. A few days ago I took on a small side job and used this package to build something for a client, who in the end refused to pay (a shady deal, really). Still, it was a good chance to get familiar with the jiebaR package in R. Overall the package is quite powerful: Chinese word segmentation is accurate, it can extract keywords, and it is easy to pick up. Let's go straight to the code and compare it with the jieba package in Python:
library(jiebaRD)
library(jiebaR)
# Loading the package with library(jiebaR) does not start any segmentation engine;
# starting one is as simple as a single assignment statement.
x<-"众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter=worker()
cutter<=x
show_dictpath()   # show the dictionary path
edit_dict()       # edit the user dictionary; you can add your own words, then re-run cutter=worker() for the change to take effect
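Besides editing the packaged dictionary with edit_dict(), worker() can also be pointed at a user dictionary file directly via its user argument. A minimal sketch, assuming a hypothetical file user_dict.utf8 with one custom word per line:

# user_dict.utf8 is a plain UTF-8 file, one custom word per line (an optional POS tag may follow the word)
cutter_user <- worker(user = "user_dict.utf8")
cutter_user <= "众筹项目成功了,众筹绑卡成功了,一切都很顺利"   # words from the user dictionary are kept as single tokens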
cutter <= "D:\\Users\\xuguoxuan694\\Desktop\\新建文本文档.txt" show_dictpath() ### 可以显示默认词典路径 segment(code= x , jiebar = cutter) ##一样的功能 cutter=worker(type="tag")
# Install and load the required packages if they are missing
tryCatch(library(jiebaR), error = function(e){ install.packages("jiebaR") },
         finally = { tryCatch(library(jiebaR), error = function(e){ print("请重新安装jiebaR包") }) })
tryCatch(library(ggplot2), error = function(e){ install.packages("ggplot2") },
         finally = { tryCatch(library(ggplot2), error = function(e){ print("请重新安装ggplot2包") }) })
tryCatch(library(wordcloud), error = function(e){ install.packages("wordcloud") },
         finally = { tryCatch(library(wordcloud), error = function(e){ print("请重新安装wordcloud包") }) })

# Read the quality-inspection records
result <- read.csv("C:\\Users\\Administrator\\Desktop\\质量记录.csv")
head(result)
# edit_dict()
cutter <- worker()
x <- "众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter <= x

# Strip letters/digits and boilerplate phrases such as "客户反应", "客户", "反映"
result$QUALITYDESC_d <- sapply(result$QUALITYDESC,   function(x) gsub("[a-zA-Z0-9]", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, function(x) gsub("客户反应", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, function(x) gsub("客户", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, function(x) gsub("反映", "", x))

# Normalise near-synonyms, e.g. map "全身没气压" / "全身不充气" to one canonical phrase
clear_stopwords <- function(x){
  if(grepl("全身没气压", x)){ x <- gsub("全身没气压", "全身气压", x) }
  if(grepl("全身不充气", x)){ x <- gsub("全身不充气", "全身充气", x) }
  if(grepl("断了", x)){ x <- gsub("断了", "断裂", x) }
  if(grepl("响声", x)){ x <- gsub("响声", "异响", x) }
  x
}
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, clear_stopwords)

# Segment every record and pool the tokens
jieba_result <- c()
for(j in result$QUALITYDESC_d){
  jieba_result <- c(jieba_result, cutter <= j)
}

# Stop words to drop from the frequency table
stopwords1 <- c("不能","不","了","有","在","没","少","一个","都","也","时","来","用","会","上","后","是","腿","走","无","左","大","没有","就","到","右","坏","部","不会","两个")
stopword2 <- c(stopwords1,"加","一","小","个","才","去","能","对","只","还","和","需要","过","倒","的","跟","已","掉","让","可以","掉","停","拨","亮","一下","下")
stopword  <- c(stopword2,"其他","下去","时候","使用","问题","正常","部","一边","一直","工作","响","说","好","买","但是","一样","不行","时有","夹")

# Word frequencies without the stop words
jjj_result <- as.data.frame(table(jieba_result))
jj_result  <- jjj_result[!jjj_result$jieba_result %in% stopword, ]

# Word cloud of the remaining tokens
op <- par(bg = "lightyellow")
wordcloud(jj_result$jieba_result, jj_result$Freq, col = rainbow(length(jj_result$Freq)),
          scale = c(5, 1), min.freq = 4, max.words = Inf, random.order = FALSE)
par(op)

# Bar charts of the most frequent tokens
last <- jj_result[order(jj_result$Freq), ]

# Top 1-30
p <- ggplot(tail(last, 30), aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5) +
  geom_text(label = "", colour = "red", vjust = -1)
p + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  scale_fill_manual(values = c("green", "red")) + coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") +
  ggtitle("天凰国际按摩椅质检报告频率最高前30名分析报告分析") +
  geom_text(label = (tail(last, 30))$Freq, colour = "red", hjust = 0, vjust = 0.5,
            aes(colour = factor(jieba_result)))

# Top 30-60
p1 <- ggplot(last[(nrow(last)-60):(nrow(last)-30), ], aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5) +
  geom_text(label = "", colour = "red", vjust = -1)
p1 + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  scale_fill_manual(values = c("green", "red")) + coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") +
  ggtitle("天凰国际按摩椅质检报告频率最高前30到60名分析报告分析") +
  geom_text(label = (last[(nrow(last)-60):(nrow(last)-30), ])$Freq, colour = "red", hjust = 0, vjust = 0.5,
            aes(colour = factor(jieba_result)))

# Top 60-90
p2 <- ggplot(last[(nrow(last)-90):(nrow(last)-60), ], aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5) +
  geom_text(label = "", colour = "red", vjust = -1)
p2 + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  scale_fill_manual(values = c("green", "red")) + coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") +
  ggtitle("天凰国际按摩椅质检报告频率最高前60到90名分析报告分析") +
  geom_text(label = (last[(nrow(last)-90):(nrow(last)-60), ])$Freq, colour = "red", hjust = 0, vjust = 0.5,
            aes(colour = factor(jieba_result)))
cutter_words <- cutter <= "我爱北京天安门"   # tag the sentence with the tagging engine defined above

# Keyword extraction
# Keyword extraction is based on inverse document frequency (IDF). The IDF corpus can be
# switched to the path of a custom corpus, and usage is the same as for segmentation.
# The topn parameter sets the number of keywords to return.
cutter = worker(type = "keywords", topn = 2)
cutter_words <- cutter <= "我爱北京天安门"
cutter_words

You can try jiebaR segmentation online at https://qinwf.shinyapps.io/jiebaR-shiny/
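As mentioned above, the IDF corpus behind the keyword engine can be replaced with your own. A sketch, assuming my_idf.utf8 is a hypothetical IDF file in the same format as the one shipped with jiebaR:

kw <- worker(type = "keywords", topn = 5, idf = "my_idf.utf8")
kw <= "众筹项目成功了,众筹绑卡成功了,一切都很顺利"   # returns the top 5 keywords with their weights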
Attached below is the Python code for keyword extraction and an LDA topic model:
#encoding:utf-8
'''
Created on 2015-10-25
@author: Administrator
'''
import pandas as pd
import re
import jieba
import nltk
import jieba.posseg as pseg
from gensim import corpora, models, similarities

df = pd.read_csv(u'C:\\Users\\Administrator\\Desktop\\质量记录.csv', encoding='gbk')
# strip boilerplate phrases, letters and digits
cont = df['QUALITYDESC'].map(lambda x: re.sub(ur'客户反应|客户|反映', '', x)) \
                        .map(lambda x: re.sub(r'[a-zA-Z0-9\.]', '', x))

# load the user-defined dictionary
jieba.load_userdict(u'C:\\Users\\Administrator\\Desktop\\分词.txt')

# keep only nouns, verbs and adjectives longer than one character
nwordall = []
for t in cont:
    words = pseg.cut(t)
    nword = ['']
    for w in words:
        if (w.flag == 'n' or w.flag == 'v' or w.flag == 'a') and len(w.word) > 1:
            nword.append(w.word)
    nwordall.append(nword)

# build a dictionary from the selected words (similar to table() in R or collections.Counter)
dictionary = corpora.Dictionary(nwordall)
# print dictionary.token2id

# build the corpus
corpus = [dictionary.doc2bow(text) for text in nwordall]

# TF-IDF weighting
tfidf = models.TfidfModel(corpus)
# print tfidf.dfs
# print tfidf.idfs
corpus_tfidf = tfidf[corpus]

# 4. LDA topic model, can also be used for dimensionality reduction
# streamed LDA: chunks of 10000 documents, 50 topics
lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50,
                               update_every=1, chunksize=10000, passes=1)
# print the first 20 topics
for topic in lda.print_topics(20):
    print topic

# LDA over the full corpus, 100 topics
# lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=100, update_every=0, passes=20)
# predict topics of new documents with the trained model
# doc_lda = lda[corpus_tfidf]

# 5. word2vec word embeddings: word similarity, analogies, word clustering
# sentences = models.word2vec.LineSentence(nwordall)
# size is the embedding dimension, window the context window, min_count drops words with
# frequency below 5, workers is the number of threads
model = models.word2vec.Word2Vec(nwordall, size=100, window=5, min_count=5, workers=4)
print model[u'指示灯']                                        # vector representation
sim = model.most_similar(positive=[u'指示灯', u'灯不亮'])     # most similar words
for s in sim:
    print "word:%s,similar:%s " % (s[0], s[1])