# jiebaR - 中文分词 (Chinese word segmentation)

# jiebaR documentation: http://qinwenfeng.com/jiebaR/


library(jiebaR)
# Quick demo: create a segmentation worker with its default settings and
# segment one sample sentence. The call is left at top level so its result
# is auto-printed.
wkr <- worker()
segment(code = "今天天气好晴朗", jiebar = wkr)

library(jiebaR)
library(sqldf)

# Load the source table; its BAK_TXT column holds the text to be segmented.
TA <- read.csv("R/table-A.csv", header = TRUE, sep = ",")

# Coerce the text column to character BEFORE taking a copy of it. On
# R < 4.0 read.csv() returns strings as factors, and iterating over a factor
# with lapply() would hand factor elements (not strings) to segment().
TA$BAK_TXT <- as.character(TA$BAK_TXT)
txtdf <- TA$BAK_TXT

wkr <- worker()

# Segment every text and flatten all results into a single vector.
# Collecting the pieces in a list and flattening once avoids re-copying the
# whole accumulator on each iteration, which growing it with c() inside a
# loop does. With no input rows the result is NULL, same as an empty c().
words <- unlist(lapply(txtdf, function(txt) segment(txt, wkr)))

# jiebaR helper applied to the segmented words.
freqrs <- freq(words)

# Count how often each distinct word occurs.
rs <- table(words)
# Convert the counts to a data frame.
rsdf <- as.data.frame(rs)

# as.data.frame() on a table yields a factor column; work with plain strings.
rsdf$words <- as.character(rsdf$words)

# check encoding
# Encoding(rsdf$words)

# TRUE for every word that contains a lower-case letter, an upper-case
# letter, or a digit.
hasAlnum <- grepl("[a-zA-Z0-9]", rsdf$words)

# Row numbers of the flagged words (already unique and in ascending order).
rowNums <- which(hasAlnum)

# Select the rows whose word has no letter/digit. A logical mask is used
# instead of rsdf[-rowNums, ]: when nothing is flagged rowNums is empty, and
# indexing with an empty negative vector selects ZERO rows rather than all
# of them, which would silently discard every word.
chrs <- rsdf[!hasAlnum, ]

# check the length of string
# nchar(chrs$words)

nwords <- nrow(chrs)

# For each remaining word build one "Y"/"N" vector with an entry per row of
# TA: "Y" when the word occurs in that row's BAK_TXT, "N" otherwise.
#  - seq_len() makes this a no-op when there are no words (1:0 would iterate
#    over c(1, 0) and fail on an NA word).
#  - grepl() is vectorised over the text column, so no per-row loop is needed.
#  - fixed = TRUE matches the word literally; the words are tokens taken from
#    the text, not regular expressions.
flagCols <- lapply(seq_len(nwords), function(i) {
  ifelse(grepl(chrs$words[i], TA$BAK_TXT, fixed = TRUE), "Y", "N")
})
names(flagCols) <- chrs$words

# Append all indicator columns in a single cbind() instead of re-copying TA
# once per word. check.names = FALSE keeps each word as the column name
# exactly as given.
if (nwords > 0) {
  TA <- cbind(
    TA,
    data.frame(flagCols, check.names = FALSE, stringsAsFactors = FALSE)
  )
}

write.csv(TA, file = "rs-words.csv")

# Reference for table(): https://www.r-bloggers.com/r-function-of-the-day-table/

# 你可能感兴趣的:(R语言)  [site footer: "You may also be interested in: (R language)"]