RNA-seq分析流程

1、质量控制:

fastqc -o ./ -t 1 /public/home/zyhu/hfd/${i}_R1_001.fastq.gz /public/home/zyhu/hfd/${i}_R2_001.fastq.gz
合并质控结果
multiqc ./
Trimmomatic修剪去掉前端低质量碱基
java -jar \$EBROOTTRIMMOMATIC/trimmomatic-0.32.jar PE -phred33 ${i}_R1_001.fastq.gz ${i}_R2_001.fastq.gz \
${i}_R1_001_paired.fastq.gz ${i}_R1_001_unpaired.fastq.gz ${i}_R2_001_paired.fastq.gz ${i}_R2_001_unpaired.fastq.gz \
LEADING:3 TRAILING:3 HEADCROP:10 SLIDINGWINDOW:4:15 MINLEN:36

2、比对到参考基因组(hisat2)

hisat2 -p 4 --dta -x /public/home/zyhu/Mouse/reference/GRCm39 -1 /public/home/zyhu/Mouse_RNA-seq/02.trim/${i}_R1_001_paired.fastq.gz -2 /public/home/zyhu/Mouse_RNA-seq/02.trim/${i}_R2_001_paired.fastq.gz -S ${i}.sam

3、featureCounts计数(meta层面)

featureCounts -T 4 -a /public/home/zyhu/Mouse/annotation/Mus_musculus.GRCm39.103.gtf -o /public/home/zyhu/Mouse_RNA-seq/06.featureCounts/meta-featureCounts/featureCounts.txt -p -B -C -t exon -g gene_id *.bam
提取基因名和reads count
cut -f 1,7-41 featureCounts.txt|grep -v '#' > A_B_C_M.matrix.txt

4、差异分析

rm(list=ls())
options(stringsAsFactors = F)
library('rio')
library('ggplot2')
library('DESeq2')
library('pheatmap')
library('apeglm')
CountMatrix <- read.table("C:/Users/Cool-N/OneDrive/桌面/A_B_C_M.matrix.txt",header = T,row.names= 1)
colnames(CountMatrix) <- c("A_10_S37","A_1_S30","A_2_S31","A_3_S32","A_4_S33","A_6_S34","A_7_S35","A_9_S36","B_1_S38","B_2_S39","B_3_S40","B_4_S41","B_5_S42","B_6_S43","B_7_S44","B_9_S45","C_2_S46","C_3_S47","C_4_S48","C_5_S49","C_6_S50","C_7_S51","C_8_S52","C_9_S53","M_2_S54","M_3_S55","M_4_S56","M_5_S57","M_6_S58","M_7_S59","M_8_S60","M_9_S61")
countdata = CountMatrix[,c(1:8,17:24)]
#保留非全0的基因
countdata <- countdata[rowSums(countdata) > 0,]
condition = factor(c(rep("A", 8), rep("C", 8)), levels = c("A","C"))
coldata = data.frame(row.names = colnames(countdata), condition)
dds = DESeqDataSetFromMatrix(countdata, colData = coldata, design = ~ condition)

#PCA分析--其他包
# library(ggord)
# library(FactoMineR)
# dat <- as.data.frame(t(countdata))
# dat.pca <- PCA(dat, graph = FALSE)
# ggord(dat.pca,group_list,coord_fix = FALSE,
#      arrow = 0,vec_ext = 0,txt = NULL)
# ggsave("PCA.pdf")


#PCA--使用DESeq2的plotPCA包;样本数<30:rlog; >30:vst(归一化)
rld<-rlog(dds,blind=FALSE)
assay(rld) <- limma::removeBatchEffect(assay(rld), rld$condition)#去除批次效应,可选项
pdf("PCA_plot.pdf")
plotPCA(rld,intgroup = "condition")
dev.off()


#差异表达分析
dds = DESeq(dds)
#sizeFactors(dds)
res <- results(dds)
res <- res[order(res$padj),]
#table(res$padj<0.01)
#将DEG转换为数据框格式,并去掉含NA的行
DEG <- as.data.frame(res)
DEG <- na.omit(DEG)

#################火山图##############
library(ggrepel)
#确定差异表达倍数,设置动态阈值,也可以设置静态阈值,例如log2FoldChange=1或2
logFC_t <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)))
#取前两位小数
logFC_t <- round(logFC_t, 2)
#确定上下调基因
DEG$change = as.factor(ifelse(DEG$padj < 0.05 & abs(DEG$log2FoldChange) > logFC_t,
                              ifelse(DEG$log2FoldChange > logFC_t ,'UP','DOWN'),'STABLE'))
#table(DEG$change)
DEG$symbol <- rownames(DEG)
#设置阈值显示超过阈值的基因名
DEG$label <- ifelse(DEG$padj < 0.00001 & abs(DEG$log2FoldChange) >= 3,DEG$symbol,"")
#画火山图
pdf("volcano_plot.pdf")
ggplot(DEG, aes(x=log2FoldChange, y=-log10(padj),color=change)) + 
  geom_point(alpha=0.4, size=2) + 
  theme_classic() + 
  xlab("log2 fold change") +
  ylab("-log10 padj") +
  theme(plot.title = element_text(size=15,hjust = 0.5)) + 
  scale_colour_manual(values = c('blue','black','red')) +
  geom_hline(yintercept = -log10(0.05), lty = 2) +
  geom_vline(xintercept = c(-logFC_t, logFC_t), lty = 2) +
  geom_label_repel(data = DEG, aes(label = label),
                   size = 3,box.padding = unit(0.5, "lines"),
                   point.padding = unit(0.8, "lines"),
                   segment.color = "black",
                   show.legend = FALSE, max.overlaps = 10000)
dev.off()

#############    pheatmap热图    ############
dat <- countdata
FC <- DEG$log2FoldChange
names(FC) <- rownames(DEG)
#排序差异倍数,提取前100和后100的基因名
DEG_200 <- c(names(head(sort(FC),100)),names(tail(sort(FC),100)))
#将提取的基因归一化
dat <- t(scale(t(dat[DEG_200,])))
#添加注释条
group <- data.frame(group=condition)
rownames(group)=colnames(dat)
pdf("pheatmap_plot.pdf")
pheatmap(dat, cluster_cols = T,
         show_colnames =F,show_rownames = F,
         annotation_col=group,
         color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
dev.off()

#大于2的赋值为2
dat[dat > 2] <- 2
#低于2的赋值为-2
dat[dat < -2] <- -2
pdf("pheatmap_alter_plot.pdf")
pheatmap(dat, cluster_cols = T,
         show_colnames =F,show_rownames = F,
         annotation_col=group,
         color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
dev.off()

###########    KEGG富集分析     ##########
library('dplyr')
library(org.Mm.eg.db)#选取物种数据库
library(clusterProfiler)
#### 第一步,确定上下调基因,已经完成
#### 第二步,从org.Mm.eg.db提取ENSG的ID和GI号对应关系
# 如果ENSG的ID具有小数点,用下面两句代码舍掉小数点
# library(stringr)
# rownames(DEG) <- str_sub(rownames(DEG), start = 1, end = 15)

#DEG添加一列ENSEMBL
DEG$ENSEMBL <- rownames(DEG)
# bitr in clusterProfiler,将ENSEMBL的ID转换为GI号保存为df
df <- bitr(rownames(DEG),
           fromType = "ENSEMBL",
           toType = "ENTREZID",
           OrgDb = org.Mm.eg.db)

#### 第三步,将GI号合并到DEG数据框内
DEG <- inner_join(DEG, df, by="ENSEMBL")
#### 第四步,提取上调和下调的GI集
gene_up <- DEG[DEG$change == "UP", "ENTREZID"] 
gene_down <- DEG[DEG$change == "DOWN", "ENTREZID"]
gene_diff <- c(gene_up, gene_down)
gene_all <- as.character(DEG[,"ENTREZID"])

geneList <- DEG$log2FoldChange
names(geneList) <- DEG$ENTREZID
geneList <- sort(geneList, decreasing = T)
geneList <- geneList[geneList != 0]

## KEGG pathway analysis
kk.up <- enrichKEGG(gene          =  gene_up,
                    organism      =  "mmu",
                    universe      =  gene_all,
                    pvalueCutoff  =  0.8,
                    qvalueCutoff  =  0.8)
kk.down <- enrichKEGG(gene          =  gene_down,
                      organism      =  "mmu",
                      universe      =  gene_all,
                      pvalueCutoff  =  0.8,
                      qvalueCutoff  =  0.8)

kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg <- kegg_down_dt[kegg_down_dt$pvalue < 0.05,]
down_kegg$group = -1
up_kegg <- kegg_up_dt[kegg_up_dt$pvalue < 0.05,]
up_kegg$group = 1

dat <- rbind(up_kegg, down_kegg)
dat$pvalue <- -log10(dat$pvalue)
dat$pvalue <- dat$pvalue * dat$group

dat <- dat[order(dat$pvalue, decreasing = F),]
pdf("KEGG.pdf")
ggplot(dat, aes(x = reorder(Description, order(pvalue, decreasing=F)), y = pvalue, fill = group)) + 
  geom_bar(stat = "identity") + 
  scale_fill_gradient(low = "blue", high = "red", guide = FALSE) + 
  scale_x_discrete(name = "Pathway names") +
  scale_y_continuous(name = "log10P-value") +
  coord_flip() + theme_bw() + theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Pathway Enrichment")
dev.off()

############   GO富集分析   #########
ego_up <- enrichGO(gene          = gene_up,
                   OrgDb         = org.Mm.eg.db,
                   ont           = "ALL" ,
                   universe      = gene_all,
                   pAdjustMethod = "BH",
                   pvalueCutoff  = 0.99,
                   qvalueCutoff  = 0.99,
                   readable      = TRUE)
head(ego_up[,1:6])
ego_up_result <- ego_up@result[order(ego_up@result$pvalue),]
ego_up_sig <- rbind(head(subset(ego_up_result, ONTOLOGY == "BP"), 10),
                    head(subset(ego_up_result, ONTOLOGY == "CC"), 10),
                    head(subset(ego_up_result, ONTOLOGY == "MF"), 10))
ego_up_sig$pvalue <- -log10(ego_up_sig$pvalue)
ego_up_sig$Description <- factor(ego_up_sig$Description, levels = as.character(rev(ego_up_sig$Description)))
pdf("GO_up.pdf")
ggplot(ego_up_sig, aes(x = Description, y = pvalue, fill = ONTOLOGY)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  facet_grid(ONTOLOGY~., scales = "free") +
  xlab("Up regulated")
dev.off()

ego_down <- enrichGO(gene          = gene_down,
                     OrgDb         = org.Mm.eg.db,
                     ont           = "ALL" ,
                     universe      = gene_all,
                     pAdjustMethod = "BH",
                     pvalueCutoff  = 0.99,
                     qvalueCutoff  = 0.99,
                     readable      = TRUE)
head(ego_down[,1:6])
ego_down_result <- ego_down@result[order(ego_down@result$pvalue),]
ego_down_sig <- rbind(head(subset(ego_down_result, ONTOLOGY == "BP"), 10),
                      head(subset(ego_down_result, ONTOLOGY == "CC"), 10),
                      head(subset(ego_down_result, ONTOLOGY == "MF"), 10))
ego_down_sig$pvalue <- -log10(ego_down_sig$pvalue)
ego_down_sig$Description <- factor(ego_down_sig$Description, levels = as.character(rev(ego_down_sig$Description)))
pdf("GO_down.pdf")
ggplot(ego_down_sig, aes(x = Description, y = pvalue, fill = ONTOLOGY)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  facet_grid(ONTOLOGY~., scales = "free") +
  xlab("Down regulated")
dev.off()

你可能感兴趣的:(RNA-seq分析流程)