3.指定基因集转换为gmt格式,文章标题词云

第一题:指定基因集转换为gmt格式
Jimmy给的答案已经很高效了:

library(clusterProfiler)
# Load the example gene clusters (gcSample, used here after attaching
# clusterProfiler) and list the cluster names.
data("gcSample")
names(gcSample)
# Gene sets to export and the destination file for the GMT output.
gs <- gcSample
file <- "sink-examp.txt"
# Write a named list of gene sets to a file in GMT layout.
#
# Each element of `gs` becomes one tab-separated line:
#   <set name> <TAB> tmp <TAB> <gene 1> <TAB> <gene 2> ...
# where "tmp" is a placeholder for the GMT description column.
#
# Args:
#   gs:   Named list; each element is a vector of gene identifiers.
#   file: Path (or connection) to write to; an existing file is overwritten.
#
# Returns:
#   NULL, invisibly. Called for its side effect of writing `file`.
write.gmt <- function(gs, file) {
  # Build every line first and write them in one call. This avoids sink(),
  # which redirects *all* console output globally and stays active if an
  # error interrupts the function before the matching sink() is reached.
  gmt_lines <- vapply(
    names(gs),
    function(set_name) {
      paste(c(set_name, "tmp", gs[[set_name]]), collapse = "\t")
    },
    character(1),
    USE.NAMES = FALSE
  )
  writeLines(gmt_lines, file)
  invisible(NULL)
}

# Export the example gene sets to the chosen output file.
write.gmt(gs = gs, file = file)

我写的版本无非是加了一行表头,并把 lapply 改成了 for 循环(注意:标准 GMT 格式本身没有表头行):

library(clusterProfiler)
# Load the example gene clusters, then show the whole list followed by
# just the cluster names.
data("gcSample")
gcSample
names(gcSample)
# Write a named list of gene sets to a tab-separated, GMT-style file,
# optionally preceded by a header row.
#
# Each element of `gs` becomes one line:
#   <set name> <TAB> Description <TAB> <gene 1> <TAB> <gene 2> ...
#
# Args:
#   gs:     Named list; each element is a vector of gene identifiers.
#   file:   Path (or connection) to write to; an existing file is overwritten.
#   header: If TRUE (the default, matching the original behaviour), the first
#           line is "GeneSet<TAB>Description<TAB>Genes". The GMT format itself
#           has no header row, so pass FALSE when the file will be fed to a
#           strict GMT parser.
#
# Returns:
#   NULL, invisibly. Called for its side effect of writing `file`.
write.gmt <- function(gs, file, header = TRUE) {
  # One tab-joined string per gene set. The previous cat(..., "\n") relied on
  # cat's default sep = " ", which left a trailing space on every line and so
  # corrupted the last gene identifier of each set.
  set_lines <- vapply(
    names(gs),
    function(set_name) {
      paste(c(set_name, "Description", gs[[set_name]]), collapse = "\t")
    },
    character(1),
    USE.NAMES = FALSE
  )
  if (isTRUE(header)) {
    header_line <- paste(c("GeneSet", "Description", "Genes"), collapse = "\t")
    set_lines <- c(header_line, set_lines)
  }
  # writeLines() replaces the sink()/sink() pair, which redirected all console
  # output and was never restored if an error occurred in between.
  writeLines(set_lines, file)
  invisible(NULL)
}
# Destination for the exported gene sets.
file <- "sink-examp.txt"
# Write the example clusters to that file.
write.gmt(gs = gcSample, file = file)

第二题:词云
我封装了一个函数,用于词云制作

#
library(rvest)
# Scrape PubMed search-result titles for a keyword and draw a word cloud.
#
# Args:
#   keyword: Search term to look up on PubMed (e.g. "immunotherapy"). It is
#            URL-encoded before being placed in the query string, so terms
#            containing spaces or reserved characters are handled.
#   n_pages: Number of result pages to fetch (pages 1 through n_pages).
#
# Returns:
#   Invisibly, a data frame with columns `word` and `freq`, sorted by
#   decreasing frequency. The function is mainly called for its plot.
#
# Side effects:
#   Performs one HTTP request per page, attaches tm, SnowballC, wordcloud and
#   RColorBrewer, prints the corpus via inspect(), resets the RNG seed to
#   1234, and draws on the active graphics device.
#
# Note: read_html(), html_nodes() and html_text() are expected to be
# available from rvest, which the surrounding script attaches.
WordMiner <- function(keyword = "miRNA", n_pages = 20) {
  # Required packages (install with install.packages() if missing):
  #   tm           - text mining
  #   SnowballC    - text stemming (only needed if stemming is enabled below)
  #   wordcloud    - word-cloud generator
  #   RColorBrewer - colour palettes
  library("tm")
  library("SnowballC")
  library("wordcloud")
  library("RColorBrewer")

  # Encode the term once so spaces and characters such as "&" cannot break
  # or truncate the query string.
  query <- utils::URLencode(keyword, reserved = TRUE)

  # seq_len() gives an empty sequence for n_pages = 0, whereas 1:n_pages
  # would iterate over c(1, 0).
  page_titles <- lapply(seq_len(n_pages), function(page) {
    url <- paste0("https://pubmed.ncbi.nlm.nih.gov/?term=", query,
                  "&page=", page)
    webpage <- read_html(url)
    html_text(html_nodes(webpage, "a.docsum-title"), trim = TRUE)
  })
  titles <- unlist(page_titles)

  # Fail with a clear message instead of an obscure error further down when
  # nothing was scraped (no hits, zero pages, or changed page markup).
  if (length(titles) == 0) {
    stop("No article titles were retrieved for keyword '", keyword, "'.",
         call. = FALSE)
  }

  # Load the titles as a corpus and show its content.
  docs <- Corpus(VectorSource(titles))
  inspect(docs)

  # Replace separator-like characters with spaces so the words on either
  # side are not glued together once punctuation is stripped.
  to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  for (pattern in c("/", "@", "\\|")) {
    docs <- tm_map(docs, to_space, pattern)
  }
  inspect(docs)

  # Clean the text.
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # Placeholder custom stop words; replace with your own character vector.
  docs <- tm_map(docs, removeWords, c("blabla1", "blabla2"))
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)
  # Text stemming is intentionally disabled:
  # docs <- tm_map(docs, stemDocument)

  # Build the term-document matrix and total each term across all titles.
  dtm <- TermDocumentMatrix(docs)
  m <- as.matrix(dtm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  # Fixed seed so the word placement is reproducible between runs.
  set.seed(1234)
  wordcloud(words = d$word, freq = d$freq, min.freq = 2,
            max.words = 200, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))

  # Hand the frequency table back to the caller; the original computed
  # head(d, 10) here, which had no effect inside a function.
  invisible(d)
}

调用函数

# Choose the search keyword and the number of result pages to mine.
WordMiner(keyword = "TP53", n_pages = 20)
# Evaluating the bare name prints the function's definition.
WordMiner

此外,附上我以前用 Power BI 制作的词云:
https://www.jianshu.com/p/d65bc194797f
https://www.jianshu.com/p/245f0c34691b
以及制作的app
https://app.powerbi.com/view?r=eyJrIjoiODMxNmY0MDAtZDg3YS00YWUwLWJlYjktMDA2YWM4MDY5YTdhIiwidCI6ImUyZmFkYTNhLWZiNjktNGJkZS1hZmE4LWNlM2M2YWU2YjkyYiIsImMiOjZ9

你可能感兴趣的:(3.指定基因集转换为gmt格式,文章标题词云)