处理芯片cel格式数据的全自动R代码

>source(”http://bioconductor.org/biocLite.R”)
>biocLite()
全自动。只需要设置工作路径以及filelist.txt文件即可。


filelist.txt,必须与CEL文件一起放置在工作目录内,文件以空格为间隔,分两列,分别是filename和factor 例


filename factor
MS1_(Mouse430_2).cel control
MS2_(Mouse430_2).cel control
MS3_(Mouse430_2).cel control
MS4_(Mouse430_2).cel beta-catenin
MS5_(Mouse430_2).cel beta-catenin
MS6_(Mouse430_2).cel beta-catenin
MS7_(Mouse430_2).cel TCF4
MS8_(Mouse430_2).cel TCF4
MS9_(Mouse430_2).cel TCF4
MS10_(Mouse430_2).cel Icat
MS11_(Mouse430_2).cel Icat
MS12_(Mouse430_2).cel Icat




source("http://pgfe.umassmed.edu/ou/bioconductor/affyChipAna.R")
setwd("~/Documents/Rscript/testData")
doAll()




更多参数:
dataInput(file=”filelist.txt”,sep=” “,rnames=”filename”)


getExpr(data)


QCreport(data,fastQCfile=”fastQC.pdf”,arrayQMfile=”QCreport”)


getDesign(data)


getContrastMatrix(data,design,constr=NULL)


doAnalyze(data,design,contrast.matrix)


outputData(data,fit2,contrast.matrix,times=2,cut.p.val=0.01,cut.fdr.val=NULL,outputFolder=”output”,all=FALSE,scale=”row”,cluster_rows=TRUE,cluster_cols=TRUE)


doAll(file=”filelist.txt”,workingdir=”.”,sep=” “,rnames=”filename”,fastQCfile=”fastQC.pdf”,arrayQMfile=”QCreport”,
constr=NULL,times=2,cut.p.val=0.01,cut.fdr.val=NULL,outputFolder=”output”,
all=FALSE,scale=”row”,doQC=TRUE,cluster_rows=TRUE,cluster_cols=TRUE)


可以指定design & contrast.matrix: design and contrast.matrix for limma, coef: report contrast, times: folder-change, cut.fdr.val: P.adj.Val cutoff value


source:
doAll原代码
# source("http://bioconductor.org/biocLite.R")
# biocLite(c("affy","annotate","annaffy","affyQCReport","arrayQualityMetrics","limma","pheatmap"))


dataInput<-function(file="filelist.txt",sep=" ",rnames="filename"){
require(limma,quietly=TRUE)
require(affy,quietly=TRUE)
require(annotate,quietly=TRUE)
targets<-readTargets(file=file,sep=sep,row.names=rnames) #limma
data <- ReadAffy(filenames=targets$filename) #affy
affydb<-annPkgName(data@annotation,type="db")
list("targets"=targets,"data"=data,"affydb"=affydb)
}
getExpr<-function(data){
require(affy,quietly=TRUE)
require(annotate,quietly=TRUE)
require(annaffy,quietly=TRUE)
require(data$affydb,character.only = TRUE,quietly=TRUE)
data$eset<-rma(data$data,verbose=FALSE)
try({
APInfo<-mas5calls(data$data,verbose=FALSE)
present.call<-exprs(APInfo)
})
eset.e<-exprs(data$eset)
if(exists("present.call")){
data$RPMA<-merge(eset.e,present.call,by="row.names")
colnames(data$RPMA)<-make.names(gsub("_\\(.*?\\)\\.CEL","",colnames(data$RPMA),ignore.case=TRUE))
colnames(data$RPMA)<-gsub("\\.y$",".present",colnames(data$RPMA))
colnames(data$RPMA)<-gsub("\\.x$",".rma",colnames(data$RPMA))
}else{
colnames(eset.e)<-paste(colnames(eset.e),"rma",sep=".")
Row.names=rownames(eset.e)
data$RPMA<-cbind(Row.names,eset.e)
data$RPMA<-data.frame(data$RPMA)
}
data$symbols<-as.character(aafSymbol(as.character(data$RPMA$Row.names),data$affydb))
names(data$symbols)<-as.character(data$RPMA$Row.names)
data$genes<-as.character(aafUniGene(as.character(data$RPMA$Row.names),data$affydb))
names(data$genes)<-as.character(data$RPMA$Row.names)
data
}
QCreport<-function(data,fastQCfile="fastQC.pdf",arrayQMfile="QCreport"){
require(affyQCReport,quietly=TRUE)
require(arrayQualityMetrics,quietly=TRUE)
QCReport(data$data,file=fastQCfile)
arrayQualityMetrics(expressionset = data$eset, outdir = paste(arrayQMfile,"RMA",sep="_"),force=T)
arrayQualityMetrics(expressionset = data$data, outdir = paste(arrayQMfile,"Raw",sep="_"),force=T,do.logtransform=T)
eset.e<-exprs(data$eset)
d<-dist(t(eset.e))
hc<-hclust(d,method="complete")
pdf("hclust.complete.pdf")
plot(hc)
dev.off()
cv.y<-numeric()
y.sd<-apply(eset.e,1,sd)
y.mean<-apply(eset.e,1,mean)
cv.y<-y.sd/y.mean
x<-eset.e
y2<-cbind(x,cv.y)
y2<-y2[y2[,ncol(y2)]>0.10,1:(ncol(y2)-1)]
d<-dist(t(y2))
hc<-hclust(d,"ave")
pdf("hclust.ave.pdf")
plot(hc)
dev.off()
}
getDesign<-function(data){
targets<-data$targets
g<-factor(targets[,2])
design<-model.matrix(~-1+g)
if((ncol(targets)>2)){
fo<-"~-1+g"
for(i in 3:ncol(targets)){
assign(paste("v",colnames(targets)[i],sep=""),factor(targets[,i]))
fo<-paste(fo,paste("v",colnames(targets)[i],sep=""),sep="+")
}
design<-model.matrix(as.formula(fo))
colnames(design)<-gsub("^v","",colnames(design))
}
colnames(design)<-gsub("^g","",colnames(design))
colnames(design)<-gsub("-","_",colnames(design))
colnames(design)<-make.names(colnames(design))
design
}
getContrastMatrix<-function(data,design,constr=NULL){
require(limma,quietly=TRUE)
if(is.null(constr)){
g<-factor(data$targets[,2])
design.col.names<-levels(g)
if(length(design.col.names)>2){
for(i in 1:(length(design.col.names)-1)){
for(j in (i+1):length(design.col.names)){
constr<-c(constr,paste(design.col.names[i],design.col.names[j],sep="-"))
}
}
}else{
constr<-paste(design.col.names[2],design.col.names[1],sep="-")
}
}
contrast.matrix<-makeContrasts(contrasts=constr,levels=design)
contrast.matrix
}
doAnalyze<-function(data,design,contrast.matrix){
require(limma,quietly=TRUE)
fit<-lmFit(data$eset,design)
fit1<-contrasts.fit(fit,contrast.matrix)
fit2<-eBayes(fit1)
fit2
}
outputData<-function(data,fit2,contrast.matrix,times=2,cut.p.val=0.01,cut.fdr.val=NULL,outputFolder="output",all=FALSE,scale="row",cluster_rows=TRUE,cluster_cols=TRUE){
require(limma,quietly=TRUE)
require(pheatmap,quietly=TRUE)
dir.create(outputFolder)
if(file.exists(outputFolder)) setwd(outputFolder)
results<-NULL
results.s<-NULL
constr<-colnames(contrast.matrix)
len<-length(constr)
times<-log(times,2)
symbols<-as.matrix(data$symbols)
colnames(symbols)<-"symbol"
genes<-as.matrix(data$genes)
colnames(genes)<-"gene"
for(i in 1:len){
results[[i]]<-topTable(fit2,coef=i,n=nrow(fit2))
results[[i]]$AveExpr<-NULL
try({
if(colnames(results[[i]])[1]!="ID") results[[i]] <- cbind(ID=rownames(results[[i]]), results[[i]])
index<-grepl("AFFX",results[[i]]$ID)
results[[i]]<-results[[i]][!index,]
})
if(colnames(results[[i]])[1]!="ID") stop("expect first column to be ID")
results.s[[i]]<-results[[i]]
colnames(results.s[[i]])[2:6]<-paste(constr[i],colnames(results.s[[i]])[2:6],sep=".")
}
merge.results.s<-results.s[[1]]
if(len>1){
for(i in 2:len){
merge.results.s<-merge(results.s[[i]],merge.results.s,by="ID")
}
}
all.results.s<-merge(merge.results.s,data$RPMA,by.x="ID",by.y="Row.names")
all.results.s<-merge(genes,all.results.s,by.x="row.names",by.y="ID")
all.results.s<-merge(symbols,all.results.s,by.x="row.names",by.y="Row.names")
colnames(all.results.s)[1]<-"ID"
write.csv(all.results.s,"all.results.csv",row.names=FALSE)
for(i in 1:len){
tmp<-results[[i]]
tmp<-merge(tmp,data$RPMA,by.x="ID",by.y="Row.names")
tmp<-merge(genes,tmp,by.x="row.names",by.y="ID")
tmp<-merge(symbols,tmp,by.x="row.names",by.y="Row.names")
colnames(tmp)[1]<-"ID"
tmp<-tmp[tmp$P.Value if(!is.null(cut.fdr.val)) tmp<-tmp[tmp$adj.P.Val up<-tmp[tmp$logFC>times,]
lo<-tmp[tmp$logFC<(-times),]
up<-up[order(-up$logFC),]
lo<-lo[order(lo$logFC),]
write.csv(up,paste("up",constr[i],"csv",sep="."),row.names=FALSE)
write.csv(lo,paste("lo",constr[i],"csv",sep="."),row.names=FALSE)
if(all) selected<-data$RPMA[data$RPMA$Row.names %in% c(up$ID,lo$ID),]
else selected<-rbind(up,lo)
if(nrow(selected)>2){
try({
colnames(selected)[1]<-"ID"
sname<-symbols[selected$ID,"symbol"]
sname[sname=="character(0)"]<-"---"
fmt<-paste("%-",max(nchar(sname)),"s",sep="")
bname<-sprintf(fmt,sname)
sname<-paste(bname,names(sname),sep=" ")
rownames(selected)<-sname
selected<-selected[,grepl(".rma$",colnames(selected))]
colnames(selected)<-gsub(".rma$","",colnames(selected))
if(all){
annotation<-data$targets[,2:ncol(data$targets)]
rownames(annotation)<-make.names(gsub("_\\(.*?\\)\\.CEL","",rownames(annotation),ignore.case=TRUE))
}else{
gp<-unlist(strsplit(constr[i],"-"))
gp<-gsub("[\\(\\)]","",gp)
targets<-data$targets[data$targets$factor %in% gp,]
selcol<-make.names(gsub("_\\(.*?\\)\\.CEL","",rownames(targets),ignore.case=TRUE))
selected<-selected[,selcol]
annotation<-targets[,2:ncol(targets),drop=FALSE]
rownames(annotation)<-selcol
}
pdf(paste("heatmap",constr[i],"pdf",sep="."),width=8+ncol(selected)/10,height=2+nrow(selected)/10)
pheatmap(selected, annotation=annotation, scale=scale, fontsize=9, fontsize_row=6, fontfamily="mono", cluster_rows=cluster_rows, cluster_cols=cluster_cols)
dev.off()
})
}
}
}


doAll<-function(file="filelist.txt",workingdir=".",sep=" ",rnames="filename",fastQCfile="fastQC.pdf",arrayQMfile="QCreport",
constr=NULL,times=2,cut.p.val=0.01,cut.fdr.val=NULL,outputFolder="output",
all=FALSE,scale="row",doQC=TRUE,cluster_rows=TRUE,cluster_cols=TRUE){
setwd(workingdir)
cat("Data input\n")
data<-dataInput(file,sep,rnames)
cat("Get Expression\n")
data<-getExpr(data)
if(doQC){
cat("Do QC\n")
QCreport(data,fastQCfile,arrayQMfile)
}
cat("Do limma analysis\n")
design<-getDesign(data)
contrast.matrix<-getContrastMatrix(data,design,constr)
fit2<-doAnalyze(data,design,contrast.matrix)
cat("Data output\n")
outputData(data,fit2,contrast.matrix,times,cut.p.val,cut.fdr.val,outputFolder,all,scale,cluster_rows,cluster_cols)
cat("..Done..\n")
}














#从GEO上下载表达谱数据
library(GEOquery)
gset<-getGEO("GSE*****",GSEMatrix=TRUE)
length(gset)
#load NCBI platform annotation
gpl<-annotation(gset)
platf<-getGEO(gpl,AnnotGPL=TRUE)
ncbifd <- data.frame(attr(dataTable(platf), "table"))
eset <- exprs(gset)
head(eset[,1:5])
          GSM1123782 GSM1123783 GSM1123784 GSM1123785 GSM1123786
1007_s_at    10.1689    10.5247    10.8179    10.3539    10.4964
1053_at       9.6002     7.9436     9.8653     9.9733    10.1960
117_at        5.6808     5.0301     3.7654     2.7751     2.8179
121_at        4.2268     4.6148     4.6147     4.4977     4.6147
1255_g_at     2.1869     2.1869     2.1869     2.1869     2.1869
1294_at       2.1874     2.1874     2.1874     2.1874     2.1874
> head(ncbifd[,1:5])
         ID                                  Gene.title Gene.symbol Gene.ID UniGene.title
1 1007_s_at discoidin domain receptor tyrosine kinase 1        DDR1     780          
2   1053_at replication factor C (activator 1) 2, 40kDa        RFC2    5982          
3    117_at        heat shock 70kDa protein 6 (HSP70B')       HSPA6    3310          
4    121_at                                paired box 8        PAX8    7849          
5 1255_g_at     guanylate cyclase activator 1A (retina)      GUCA1A    2978          
6   1294_at ubiquitin-like modifier activating enzyme 7        UBA7    7318          


#从GEO上下载cel文件
library(GEOquery)
> getGEOSuppFiles("GSE46106")
[1] "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE46nnn/GSE46106/suppl/"
trying URL 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE46nnn/GSE46106/suppl//GSE46106_RAW.tar'
ftp data connection made, file length 399247360 bytes
opened URL
===============================================
downloaded 380.8 Mb
 
trying URL 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE46nnn/GSE46106/suppl//filelist.txt'
ftp data connection made, file length 3308 bytes
opened URL
==================================================
downloaded 3308 bytes
> setwd("GSE46106/")
> dir()
[1] "filelist.txt"     "GSE46106_RAW.tar"
> untar("GSE46106_RAW.tar")
> files <- dir(pattern="gz$")
> sapply(files, gunzip)
GSM1123782_2147TG11_U133plus2.CEL.gz GSM1123783_2147TG15_U133plus2.CEL.gz  GSM1123784_2147TG1_U133plus2.CEL.gz 
                            33380229                             32529207                             33390958 
 GSM1123785_2147TG6_U133plus2.CEL.gz  GSM1123786_2277TG1_U133plus2.CEL.gz  GSM1123787_2277TG9_U133plus2.CEL.gz 
                            33210730                             33082826                             33531908 
GSM1123788_2665TG15_U133plus2.CEL.gz  GSM1123789_2665TG1_U133plus2.CEL.gz  GSM1123790_2665TG6_U133plus2.CEL.gz 
                            32232774                             33380782                             33432878 
 GSM1123791_3104TG1_U133plus2.CEL.gz  GSM1123792_3104TG3_U133plus2.CEL.gz  GSM1123793_3107TG1_U133plus2.CEL.gz 
                            32850644                             32588606                             32584342 
 GSM1123794_3107TG5_U133plus2.CEL.gz  GSM1123795_3143TG1_U133plus2.CEL.gz  GSM1123796_3143TG5_U133plus2.CEL.gz 
                            33358932                             32369391                             32806108 
 GSM1123797_3204TG1_U133plus2.CEL.gz  GSM1123798_3204TG5_U133plus2.CEL.gz  GSM1123799_3561TG1_U133plus2.CEL.gz 
                            32557782                             33016324                             32883029 
 GSM1123800_3561TG5_U133plus2.CEL.gz  GSM1123801_3611TG1_U133plus2.CEL.gz  GSM1123802_3611TG5_U133plus2.CEL.gz 
                            33454274                             32657648                             33126603 
 GSM1123803_3613TG1_U133plus2.CEL.gz  GSM1123804_3613TG5_U133plus2.CEL.gz  GSM1123805_3807TG1_U133plus2.CEL.gz 
                            32581902                             33193826                             32791917 
 GSM1123806_3807TG5_U133plus2.CEL.gz  GSM1123807_3824TG2_U133plus2.CEL.gz  GSM1123808_3887TG1_U133plus2.CEL.gz 
                            33266865                             33078422                             32479022 
 GSM1123809_3887TG5_U133plus2.CEL.gz  GSM1123810_3904TG1_U133plus2.CEL.gz  GSM1123811_3904TG5_U133plus2.CEL.gz 
                            33291287                             32645537                             33425796 
 GSM1123812_3936TG1_U133plus2.CEL.gz  GSM1123813_3936TG5_U133plus2.CEL.gz  GSM1123814_3963TG1_U133plus2.CEL.gz 
                            32939923                             33317196                             32725410 
 GSM1123815_3963TG5_U133plus2.CEL.gz  GSM1123816_4013TG1_U133plus2.CEL.gz  GSM1123817_4169TG2_U133plus2.CEL.gz 
                            33276082                             33151406                             33167461 
 GSM1123818_4175TG1_U133plus2.CEL.gz  GSM1123819_4272TG1_U133plus2.CEL.gz  GSM1123820_4400TG1_U133plus2.CEL.gz 
                            33139829                             33111635                             33206021 
 GSM1123821_4400TG5_U133plus2.CEL.gz  GSM1123822_4664TG1_U133plus2.CEL.gz  GSM1123823_4849TG1_U133plus2.CEL.gz 
                            33026806                             33364390                             33300210 
 GSM1123824_4888TG1_U133plus2.CEL.gz  GSM1123825_4913TG1_U133plus2.CEL.gz 
                            33299396                             33207528 
> filelist <- dir(pattern="CEL$")
> library(affy)
> library(annotate)
> data <- ReadAffy(filenames=filelist)
> affydb<-annPkgName(data@annotation,type="db")
> require(affydb, character.only=TRUE)
> eset<-rma(data,verbose=FALSE)
> eset.e <- exprs(eset)
> library(annaffy)
> symbols<-as.character(aafSymbol(as.character(rownames(eset)),affydb))
> genes<-as.character(aafUniGene(as.character(rownames(eset)),affydb))

你可能感兴趣的:(R,learning,data,mining)