上次用 salmon 做了拟南芥转录组分析(https://www.jianshu.com/p/af513c3906f1)。由于上游分析中 mapping 率只有约 15%,下游分析时只能勉强完成差异表达基因筛选和火山图,无法进行基因功能注释及后续分析,所以将上游分析流程重新走一遍。
与上次的不同之处:第一次上游分析使用的参考文件是:
Arabidopsis_thaliana.TAIR10.42.gff3.gz
Arabidopsis_thaliana.TAIR10.42.gtf.gz
Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz
Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz
这次将参考基因组及注释文件换为 Ensembl Plants release-28 的版本:
Arabidopsis_thaliana.TAIR10.28.cdna.all.fa.gz
Arabidopsis_thaliana.TAIR10.28.dna.genome.fa.gz
Arabidopsis_thaliana.TAIR10.28.gff3.gz
Arabidopsis_thaliana.TAIR10.28.gtf.gz
参考基因组下载
# Fetch the release-28 Arabidopsis reference files (cDNA, genome, GFF3, GTF)
# from Ensembl Plants. Each wget is detached with nohup and run in the
# background, so the four downloads proceed in parallel.
base=ftp://ftp.ensemblgenomes.org/pub/plants/release-28
for path in \
  fasta/arabidopsis_thaliana/cdna/Arabidopsis_thaliana.TAIR10.28.cdna.all.fa.gz \
  fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.28.dna.genome.fa.gz \
  gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.28.gff3.gz \
  gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.28.gtf.gz
do
  nohup wget "$base/$path" &
done
创建索引
# Build the salmon index from the release-28 cDNA FASTA (the file this re-run
# switches to), not the unversioned cDNA file used in the first analysis.
# -t: transcript FASTA; -i: output index directory (the flag is lowercase).
salmon index -t Arabidopsis_thaliana.TAIR10.28.cdna.all.fa.gz -i athal_index_salmon
定量分析
# Write the per-sample quantification script. The quoted 'EOF' delimiter keeps
# $quant, $index and ${sample} unexpanded until the script itself runs.
cat > quant_salmon2.sh <<'EOF'
quant=/Users/wudandan/project/rna/raw/quant_salmon
index=/Users/wudandan/project/rna/raw/athal_index_salmon
# sample.txt holds one sample ID per line; reads are expected as
# <sample>_1.fastq.gz / <sample>_2.fastq.gz in the current directory.
while read -r sample; do
  echo "processing sample ${sample}"
  # -l A asks salmon to infer the library type; all salmon output for the
  # sample goes to its own log file.
  salmon quant -i "$index" -l A \
    -1 "${sample}_1.fastq.gz" -2 "${sample}_2.fastq.gz" \
    -o "$quant/${sample}_quant" 1>"${sample}.log" 2>&1
done < sample.txt
EOF
# Run the script detached from the terminal.
nohup bash quant_salmon2.sh &
下游分析
# Locate every salmon quantification table (*.sf) below the working directory.
# The workspace is intentionally not wiped with rm(list = ls()): clearing the
# caller's objects is a side effect this step does not need.
options(stringsAsFactors = FALSE)
dir <- getwd()
# `pattern` is a regular expression, not a glob, so anchor on the ".sf"
# extension; full.names = TRUE returns paths already prefixed with `dir`.
files <- list.files(dir, pattern = "\\.sf$", recursive = TRUE, full.names = TRUE)
# Fail here with a clear message rather than later inside tximport.
if (length(files) == 0L) {
  stop("no salmon *.sf files found under ", dir, call. = FALSE)
}
stopifnot(all(file.exists(files)))
加载数据包出现报错
library(AnnotationHub)
# Connecting to the hub fails intermittently, sometimes many times in a row
# (possibly a network problem); the post's screenshot "ah1" shows the error
# and "ah2" a successful load. Retry until it succeeds.
ah <- AnnotationHub()
# Restrict the hub to Arabidopsis thaliana records.
ath <- query(ah, "Arabidopsis thaliana")
# Download the latest annotation IDs (record AH52247). This step can fail the
# same way as the hub connection above; retry a few times if it does.
ath_tx <- ath[["AH52247"]]
为得到注释包信息,可以查好后直接下载
# Build the transcript-to-gene map (tx2gene) used by tximport from the
# release-28 TxDb annotation package.
# Install only when the package is missing, and let BiocManager choose the
# Bioconductor release that matches the running R (pinning version = "3.8"
# fails on any R other than the one that release was built for).
if (!requireNamespace("TxDb.Athaliana.BioMart.plantsmart28", quietly = TRUE)) {
  BiocManager::install("TxDb.Athaliana.BioMart.plantsmart28")
}
library(TxDb.Athaliana.BioMart.plantsmart28)
ls("package:TxDb.Athaliana.BioMart.plantsmart28")
a <- TxDb.Athaliana.BioMart.plantsmart28
# Columns the annotation database can return.
columns(a)
# keys() lists the values of a column that can be used for lookups; not every
# column reported by columns() is a valid key type.
k <- keys(a, keytype = "GENEID")
# select() looks up the requested columns for the given keys. It is
# namespace-qualified so another attached package's select() cannot mask it.
df <- AnnotationDbi::select(a, keys = k, keytype = "GENEID", columns = "TXNAME")
# Expect two columns: "GENEID" and "TXNAME".
head(df)
# Transcript ID first, gene ID second; columns are picked by name so the
# result does not depend on the order select() returns them in.
tx2gene <- df[, c("TXNAME", "GENEID")]
head(tx2gene)
准备表达矩阵
library("tximport")
library("readr")
# Import the salmon quantifications and summarise transcripts to genes.
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)
names(txi)
# Label the count columns with the run accessions.
# NOTE(review): assumes `files` holds exactly these 16 runs in sorted order.
colnames(txi$counts) <- paste0("ERR1698", 194:209)
head(txi$counts)
# Integer count matrix for inspection. Estimated counts are fractional, so
# round first (as.integer alone truncates toward zero); changing the storage
# mode in place keeps the row and column names.
tmp <- txi$counts
exprSet <- round(tmp)
storage.mode(exprSet) <- "integer"
head(exprSet)
dim(exprSet)
样本信息
# Read the sample sheet: tab-separated, no header, sample ID then group.
sampleTable <- read.delim("sampleTable.txt", header = FALSE)
colnames(sampleTable) <- c("sample", "group_list")
# Use the sample ID as row name and keep only the group column.
rownames(sampleTable) <- sampleTable[, 1]
sampleTable <- sampleTable[, -1, drop = FALSE]
# Groups become "day<value>"; an explicit factor avoids DESeq2's implicit
# character-to-factor conversion.
sampleTable$group_list <- factor(paste0("day", sampleTable$group_list))
# Give the length and abundance matrices the same sample names as the counts.
colnames(txi$length) <- colnames(txi$counts)
colnames(txi$abundance) <- colnames(txi$counts)
# Put the sample sheet in the same order as the count columns, so a sheet
# listing the same samples in a different order cannot break or mislabel the
# design.
if (setequal(rownames(sampleTable), colnames(txi$counts))) {
  sampleTable <- sampleTable[colnames(txi$counts), , drop = FALSE]
} else {
  warning(
    "sample IDs in sampleTable.txt do not match the count columns; ",
    "assuming both are in the same order",
    call. = FALSE
  )
}
txi$counts[1:4, 1:4]
构建 DESeqDataSet 并进行差异分析
library("DESeq2")
# Build the DESeq2 data set from the tximport object, with the sample sheet
# as column data and the group as the only design factor.
dds <- DESeqDataSetFromTximport(
  txi,
  colData = sampleTable,
  design = ~ group_list
)
# Run the DESeq2 pipeline with its progress messages silenced.
dds2 <- suppressMessages(DESeq(dds))
标准化(rlog 转换)
# Regularised-log transform of the fitted data set; rlog() is the same
# function as rlogTransformation().
rld <- rlog(dds2)
# Matrix of transformed expression values (genes x samples).
exprSet_new <- assay(rld)
提取差异表达结果
# Extract the day1-vs-day0 comparison and annotate genes with symbols.
resultsNames(dds2)
# contrast = c(factor, numerator, denominator): fold changes are day1 / day0.
res0vs1 <- results(dds2, contrast = c("group_list", "day1", "day0"))
resOrdered1 <- as.data.frame(res0vs1[order(res0vs1$padj), ])
head(resOrdered1)
library("org.At.tair.db")
# NOTE(review): KEGG.db is not used in this section and has been retired from
# recent Bioconductor releases; if it fails to load it can likely be dropped.
library("KEGG.db")
library("clusterProfiler")
library("ggplot2")
resOrdered1$gene_id <- rownames(resOrdered1)
# TAIR gene ID -> symbol table (columns gene_id, symbol). A gene may carry
# several symbols; keep the first so the merge cannot duplicate a gene and
# inflate later up/down counts.
id2symbol <- toTable(org.At.tairSYMBOL)
id2symbol <- id2symbol[!duplicated(id2symbol$gene_id), ]
# Inner join: genes without a symbol are dropped here.
resOrdered1 <- merge(resOrdered1, id2symbol, by = "gene_id")
# merge() re-sorts rows by gene_id; restore the adjusted-p-value ordering.
resOrdered1 <- resOrdered1[order(resOrdered1$padj), ]
修改列名,这样后面的代码无需改动
# Rename to the column names the downstream code expects. Renaming by name
# (not by position) leaves the other columns untouched and cannot mislabel
# them if the column order ever differs.
DEG <- resOrdered1
names(DEG)[names(DEG) == "log2FoldChange"] <- "logFC"
# "P.Value" holds the ADJUSTED p-value (padj); the raw one stays in "pvalue".
names(DEG)[names(DEG) == "padj"] <- "P.Value"
DEG过滤,基因名去除空值
# Keep only rows that have a usable gene symbol.
DEG$symbol <- as.character(DEG$symbol)
# NOTE(review): nchar > 1 also drops one-character symbols, not just empty
# strings; confirm this is intended.
has_symbol <- !is.na(DEG$symbol) & nchar(DEG$symbol) > 1
DEG_filter <- DEG[has_symbol, ]
火山图
# Volcano plot of the filtered DEG table.
# The fold-change cutoff is deliberately 0, so every significant gene counts.
# A data-driven alternative is mean(abs(logFC)) + 2 * sd(abs(logFC)); it was
# computed and then immediately overwritten, so it is not evaluated here.
logFC_Cutof <- 0
# Significant = adjusted p < 0.05 and |logFC| above the cutoff. Genes with a
# missing p-value or fold change count as not significant rather than turning
# into NA categories.
is_sig <- DEG_filter$P.Value < 0.05 & abs(DEG_filter$logFC) > logFC_Cutof
is_sig[is.na(is_sig)] <- FALSE
# Fixed levels keep the category-to-colour mapping stable even if one
# category is empty.
DEG_filter$change <- factor(
  ifelse(is_sig, ifelse(DEG_filter$logFC > logFC_Cutof, "UP", "DOWN"), "NOT"),
  levels = c("DOWN", "NOT", "UP")
)
# Counting with sum() avoids the phantom NA rows that logical row-indexing
# would add to nrow().
this_tile <- paste0(
  "Cutoff for logFC is ", round(logFC_Cutof, 3),
  "\nThe number of up gene is ", sum(DEG_filter$change == "UP"),
  "\nThe number of down gene is ", sum(DEG_filter$change == "DOWN")
)
g_volcano <- ggplot(
  data = DEG_filter,
  aes(x = logFC, y = -log10(P.Value), color = change)
) +
  geom_point(alpha = 0.4, size = 1.75) +
  # Apply the theme to this plot only, without touching the global theme.
  theme_bw(base_size = 20) +
  xlab("log2 fold change") +
  # P.Value holds DESeq2's adjusted p-value, so label the axis accordingly.
  ylab("-log10 adjusted p-value") +
  ggtitle(this_tile) +
  theme(plot.title = element_text(size = 15, hjust = 0.5)) +
  # Named values bind each colour to its category instead of relying on order.
  scale_colour_manual(values = c(DOWN = "blue", NOT = "black", UP = "red"))
print(g_volcano)