前言
因为以前的分析流程要在Shell 和 R 之间切换,然后可控性还差,有点烦,正好最近有点数据要分析,干脆就重新建立一个 all in one 的流程,方便以后使用。
运行环境
Ubuntu 18.04
R 3.6.1
用到的软件
fastp
bowtie2
hisat2
stringtie
DESeq2
Refference data
rRNA index:用bowtie2去除fastq文件中的rRNA
tRNA index:http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa
Hisat2 index:https://cloud.biohpc.swmed.edu/index.php/s/grch38_tran/download
GRCh38.96.gtf:ftp://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz
Homo_sapiens.GRCh38.dna.primary_assembly.fa:ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz
代码
############## Functions #####################
#Hisat2 参数
grch38_tran_index = "/home/zhou/RNASEQ/index/grch38_tran/genome_tran"
rRNA_index = "/home/zhou/RNASEQ/index/rRNA_index/human_rRNA"
tRNA_index = "/home/zhou/RNASEQ/index/tRNA_index_hg38/hg38_tRNA"
GRCh38.96.gtf = "/home/zhou/RNASEQ/index/grch38_tran/Homo_sapiens.GRCh38.96.gtf"
fasta = "/home/zhou/RNASEQ/index/grch38_tran/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
#fastp preprocess
##Single version
fastp_SE <- function(input_fastq = input_fastq) {
fastp_PATH = "/home/zhou/anaconda2/bin/fastp"
output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_fastpedited.fastq.gz")
fastp_cmd = paste(fastp_PATH, "-i", input_fastq, "-o", output_fastq, "-w 16", "-z 9")
system(fastp_cmd,wait = T)
return(output_fastq)
}
##Paired version
fastp_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R) {
fastp_PATH = "/home/zhou/anaconda2/bin/fastp"
output_fastq_F = paste0(gsub(pattern = "\\.fq\\.gz",replacement = "",x = input_fastq_F),"_fastpedited.fastq.gz")
output_fastq_R = paste0(gsub(pattern = "\\.fq\\.gz",replacement = "",x = input_fastq_R),"_fastpedited.fastq.gz")
fastp_cmd = paste(fastp_PATH, "-i", input_fastq_F, "-o", output_fastq_F, "-I", input_fastq_R, "-O", output_fastq_R, "-w 16", "-z 9")
system(fastp_cmd,wait = T)
return(c(output_fastq_F,output_fastq_R))
}
#rRNA remove
##Single version
remove_rRNA_SE <- function(input_fastq = input_fastq, index = index) {
bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_rmrRNA.fastq.gz")
bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-gz", output_fastq, "-U", input_fastq, "-p 20 -S tmp.sam")
system(bowtie2_cmd,wait = T)
system("rm tmp.sam")
return(output_fastq)
}
##Paired version
remove_rRNA_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R, index = index) {
bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmrRNA.fastq.gz")
output_fastq_F = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.1\\.gz",x = output_fastq)
output_fastq_R = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.2\\.gz",x = output_fastq)
output_fastq_F_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmrRNA.fastq.gz")
output_fastq_R_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_R),"_rmrRNA.fastq.gz")
bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-conc-gz", output_fastq, "-1", input_fastq_F, "-2", input_fastq_R, "-p 20 -S tmp.sam")
rename_cmd_F = paste("mv", output_fastq_F, output_fastq_F_reset)
rename_cmd_R = paste("mv", output_fastq_R, output_fastq_R_reset)
system(bowtie2_cmd,wait = T)
system("rm tmp.sam",wait = T)
system(rename_cmd_F,wait = T)
system(rename_cmd_R,wait = T)
return(c(output_fastq_F_reset,output_fastq_R_reset))
}
#tRNA remove
##Single version
remove_tRNA_SE <- function(input_fastq = input_fastq, index = index) {
bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_rmtRNA.fastq.gz")
bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-gz", output_fastq, "-U", input_fastq, "-p 20 -S tmp.sam")
system(bowtie2_cmd,wait = T)
system("rm tmp.sam")
return(output_fastq)
}
##Paired version
remove_tRNA_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R, index = index) {
bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmtRNA.fastq.gz")
output_fastq_F = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.1\\.gz",x = output_fastq)
output_fastq_R = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.2\\.gz",x = output_fastq)
output_fastq_F_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmtRNA.fastq.gz")
output_fastq_R_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_R),"_rmtRNA.fastq.gz")
bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-conc-gz", output_fastq, "-1", input_fastq_F, "-2", input_fastq_R, "-p 20 -S tmp.sam")
rename_cmd_F = paste("mv", output_fastq_F, output_fastq_F_reset)
rename_cmd_R = paste("mv", output_fastq_R, output_fastq_R_reset)
system(bowtie2_cmd,wait = T)
system("rm tmp.sam",wait = T)
system(rename_cmd_F,wait = T)
system(rename_cmd_R,wait = T)
return(c(output_fastq_F_reset,output_fastq_R_reset))
}
#hisat2
##Single version
hisat2_SE <- function(index = index, input_fastq = input_fastq, output_prefix = output_prefix) {
bam_filename = paste0(output_prefix,".bam")
hisat2_PATH = "/home/zhou/bin/hisat2"
hisat2_cmd = paste(hisat2_PATH, "--dta -x",index, "-p 20 -U", input_fastq, " | samtools sort -@ 20 -o", bam_filename)
system(hisat2_cmd,wait = T)
return(bam_filename)
}
##Paired version
hisat2_PE <- function(index = index, input_fastq_F = input_fastq_F,input_fastq_R = input_fastq_R, output_prefix = output_prefix) {
bam_filename = paste0(output_prefix,".bam")
hisat2_PATH = "/home/zhou/bin/hisat2"
hisat2_cmd = paste(hisat2_PATH, "--dta -x",index, "-p 20 -1", input_fastq_F, "-2", input_fastq_R, " | samtools sort -@ 20 -o", bam_filename)
system(hisat2_cmd,wait = T)
return(bam_filename)
}
#stringtie
stringtie <- function(input_bam = input_bam, gtf = gtf) {
output_prefix = gsub(pattern = "\\.bam",replacement = "",x = input_bam)
tab_filename = paste0(output_prefix,".tab")
gtf_filename = paste0(output_prefix,".gtf")
stringtie_PATH = "/home/zhou/bin/stringtie"
stringtie_cmd = paste(stringtie_PATH,"-p 20 -G", gtf, "-o", gtf_filename, "-e -l", output_prefix, input_bam)
system(stringtie_cmd,wait = T)
return(gtf_filename)
}
#generate gene\transcript count matrix
run.prepDE.py <- function(inputfile = inputfile) {
run.cmd = paste("python ~/Software/prepDE.py -i",inputfile)
system(run.cmd,wait = T)
}
################ test data ############################
setwd("~/ribosome/mTOR_Liver_cell/testdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1_test.fq.gz|_R2_test.fq.gz",replacement = ""))
for (i in sample_pattern) {
print(i)
### fastp preprocess
fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1_test.fq.gz"),input_fastq_R = paste0(i,"_R2_test.fq.gz"))
### remove rRNA
rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
### remove tRNA
rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
### hisat2 mapping
hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
### stringtie assemble
stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}
################ real data ############################
#rnaseq
setwd("~/ribosome/mTOR_Liver_cell/RNA-Rawdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1.fq.gz|_R2.fq.gz",replacement = ""))
for (i in sample_pattern) {
print(i)
### fastp preprocess
fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1.fq.gz"),input_fastq_R = paste0(i,"_R2.fq.gz"))
### remove rRNA
rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
### remove tRNA
rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
### hisat2 mapping
hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
### stringtie assemble
stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}
## generate counts matrix
mergelist = data.frame(sample_pattern,paste0("./",sample_pattern,".gtf"))
write.table(mergelist,file = "mergelist", sep = "\t",row.names = F,col.names = F,quote = F)
run.prepDE.py(inputfile = "mergelist")
## DEA
### load count_matrix
gene_matrix = read.csv(file = "gene_count_matrix.csv",header = T,row.names = 1)
library(DESeq2)
library(BiocParallel)
register(MulticoreParam(24))
database = gene_matrix
condition <- factor(c(rep("T",3),rep("UT",3)))
coldata <- data.frame(row.names = colnames(database), condition)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
dds <- DESeq(dds,parallel = T)
res <- results(dds)
summary(res)
table(res$padj<0.1)
res <- res[order(res$padj),]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
total.RNA.signresdata<-na.omit(resdata[resdata$padj<0.1 & abs(resdata$log2FoldChange) > 1,])
#riboseq
setwd("~/ribosome/mTOR_Liver_cell/RIBO-Rawdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1.fq.gz|_R2.fq.gz",replacement = ""))
for (i in sample_pattern) {
print(i)
### fastp preprocess
fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1.fq.gz"),input_fastq_R = paste0(i,"_R2.fq.gz"))
### remove rRNA
rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
### remove tRNA
rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
### hisat2 mapping
hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
### stringtie assemble
stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}
## generate counts matrix
mergelist = data.frame(sample_pattern,paste0("./",sample_pattern,".gtf"))
write.table(mergelist,file = "mergelist", sep = "\t",row.names = F,col.names = F,quote = F)
run.prepDE.py(inputfile = "mergelist")
## DEA
### load count_matrix
gene_matrix = read.csv(file = "gene_count_matrix.csv",header = T,row.names = 1)
library(DESeq2)
library(BiocParallel)
register(MulticoreParam(24))
database = gene_matrix
condition <- factor(c(rep("T",3),rep("UT",3)))
coldata <- data.frame(row.names = colnames(database), condition)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
dds <- DESeq(dds,parallel = T)
res <- results(dds)
summary(res)
table(res$padj<0.1)
res <- res[order(res$padj),]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
Ribo.RNA.signresdata<-na.omit(resdata[resdata$padj<0.1 & abs(resdata$log2FoldChange) > 1,])
结语
把所有功能都模块化了,真舒服啊!
此外,有些function可以增加参数来进一步增加自由度,比如说线程数,我懒得写了,其实可以慢慢完善的,以后看心情把,毕业季实在太忙了。。。