免疫组库分析实战/mixcr+vdjtools+R实现

使用《OncoImmunology》一篇paper中的数据进行实战


  • 数据获取地址:https://www.ncbi.nlm.nih.gov/bioproject/PRJEB33490
    关于免疫组库的背景,MIXCR和VDJTOOLS的使用可以参考之前的文章
    使用mixcr构建免疫组库及下游分析
    使用vdjtools进行免疫组库分析
    直接进入主题

1.下载数据

1.SRR_Acc_List.txt (所有测序数据的样本号,直接下载得到)
2.SraRunTable.txt (样本的分组信息,后面数据分析会用到)

#linux
prefetch -O ./rawdata --option-file ./SRR_Acc_List.txt #批量下载数据
cat SRR_Acc_List.txt | while read id; do fastq-dump --split-files --gzip -O ./rawdata/ ./rawdata/${id}/${id}.sra;done # sra to fastq
fastqc -o ./qc ./rawdata/*gz #质控

详细方法参考:RNA-seq转录组差异分析及可视化

2.使用MIXCR构建免疫组库

#只分析TRB
cat SRR_Acc_List.txt|while read name
do 
mixcr analyze amplicon \
-s hsa \
--starting-material dna \
--5-end v-primers \
--3-end c-primers \
--adapters adapters-present \
--only-productive \
--receptor-type trb \
--report ./reports/${name}.report \
./rawdata/${name}_1.fastq.gz ./rawdata/${name}_2.fastq.gz ./results/${name}
done

详细方法参考:使用mixcr构建免疫组库及下游分析

3.VDJTOOLS分析

这一步需要用到之前下载的SraRunTable.txt,处理得到想要的分组文件。

分组文件

前两列必须为“file_name”,"sample_id",后面列可根据需求进行分组
第一列是MIXCR恢复出来数据的路径,第二列为样本名称

vdjtools=~/project/biosoft/vdjtools/vdjtools-1.2.1/vdjtools-1.2.1.jar
mkdir ./results/TRB/VDJconvert/
mkdir ./results/vdjtools-results/
java -jar $vdjtools Convert -S mixcr -m ./results/TRB/metadata.txt ./results/TRB/VDJconvert/ 
java -jar $vdjtools CalcBasicStats -m ./results/TRB/VDJconvert/metadata.txt ./results/vdjtools-results/
java -jar $vdjtools CalcSegmentUsage -m ./results/TRB/VDJconvert/metadata.txt ./results/vdjtools-results/
java -jar $vdjtools RarefactionPlot -m ./results/TRB/VDJconvert/metadata.txt ./results/vdjtools-results/
java -jar $vdjtools CalcDiversityStats -m ./results/TRB/VDJconvert/metadata.txt ./results/vdjtools-results/

转换格式后在目标文件夹会生成一个新的metadata.txt,后续分析使用此分组文件。
共得到6个结果文件分别为basicstats.txtdiversity.strict.exact.txtdiversity.strict.resampled.txtrarefaction.strict.txtsegments.wt.J.txtsegments.wt.V.txt

4.使用R包进行可视化处理

4.1 clonality diversity richness

inputpath <- "./results/vdjtools-results/"
outputpath <- "./R-results/"
library(ggplot2)
library(reshape2)
library(ggpubr)
df <- read.csv(paste0(inputpath,"diversity.strict.exact.txt"), sep = "\t",stringsAsFactors = FALSE)
df$real_diversity <- log(df$shannonWienerIndex_mean) 
df$clonality <- 1 - df$normalizedShannonWienerIndex_mean 
work_df <- df
sample_name <- work_df$sample_id
Group <- work_df$sample_type
richness<- work_df$observedDiversity_mean
diversity <- work_df$real_diversity
clonality <- work_df$clonality
work_df <- data.frame(Group,richness,diversity,clonality)
work_df$Group <- factor(work_df$Group)  
work_res<- melt(work_df, id.vars = "Group")
work_richness <- work_res[which(work_res$variable=="richness"),]
work_diversity <- work_res[which(work_res$variable=="diversity"),]
work_clonality <- work_res[which(work_res$variable=="clonality"),]
p1 <- ggboxplot(work_diversity, x = "Group", y = "value",
                legend = "right",color = "black", palette = "jama",
                add = "jitter",add.params=list(color = "Group",size=3),
                title = "Diversity",  xlab = "",size = 0.5,width =0.5,
                ylab = "Shannon Diversity Index of \n TCR repertoire") +
  theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p1 + stat_compare_means(aes(group = work_df$Group), 
                        label.x = 2,label = "p.signif",
                        method = 'wilcox.test',hide.ns = TRUE)
ggsave(paste0(outputpath,"Diversity.png"))
dev.off()
p2 <- ggboxplot(work_richness, x = "Group", y = "value",
                legend = "right",color = "black", palette = "jama",
                add = "jitter",add.params=list(color = "Group",size=3),
                title = "Richness",  xlab = "",size = 0.5,width =0.5,
                ylab = "Richness of \n TCR repertoire")+
  theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p2 + stat_compare_means(aes(group =work_df$Group), 
                        label.x = 2,label = "p.signif",
                        method = 'wilcox.test',hide.ns = TRUE)
ggsave(paste0(outputpath,"Richness.png"))
dev.off()
p3 <- ggboxplot(work_clonality, x = "Group", y = "value",
                legend = "right",color = "black", palette = "jama",
                add = "jitter",add.params=list(color = "Group",size=3),
                title = "Clonality",  xlab = "",size = 0.5,width =0.5,
                ylab = "Clonality Index of \n TCR repertoire")+
  theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p3 + stat_compare_means(aes(group =work_df$Group),  
                        label = "p.signif",label.x = 2,
                        method = 'wilcox.test',hide.ns = TRUE)
ggsave(paste0(outputpath,"Clonality.png"))
dev.off()
Clonality

Diversity

Richness

4.2Gene Usage热图绘制

inputpath <- "./results/vdjtools-results/"
outputpath <- "./R-results/"
library(pheatmap)
file_input <- c(paste0(inputpath,"segments.wt.V.txt"),
                paste0(inputpath,"segments.wt.J.txt"))
file_output <- c(paste0(outputpath,"V_heatmap.png"),paste0(outputpath,"J_heatmap.png")) 
for (i in 1:length(file_input)){
  df <- read.csv(file_input[i],sep="\t",row.names = 1)
  annotation_col <- data.frame(df$sample_type)
  colnames(annotation_col) <- "Type"
  rownames(annotation_col) <- rownames(df)
  work_df <- df
  work_df <- work_df[,-1:-2] # check this and change
  work_df <- t(work_df)
  work_df <- work_df[rowSums(work_df)!=0,]
  pheatmap(work_df,filename = file_output[i],
           annotation_col = annotation_col,
           cluster_cols = FALSE, #height = 10,
           display_numbers = FALSE,labels_col ="" )# check this and change
}
V_heatmap

J_heatmap

4.3 VJ基因的差异分析及火山图绘制

mymetadata <- read.csv("./results/TRB/VDJconvert/metadata.txt",sep="\t",stringsAsFactors = F)
sample_name <- mymetadata$sample_id
vgene_matrix <- function(df){
  vgene_name <- c()
  for (i in 1:nrow(df)){
    if (df[i,"v"] %in% names(vgene_name)){
      vgene_name[df[i,"v"]] <- vgene_name[df[i,"v"]] + df[i,"count"]
    } else{
      vgene_name[df[i,"v"]] <- df[i,"count"]
    }
  }
  return(vgene_name)
}
jgene_matrix <- function(df){
  jgene_name <- c()
  for (i in 1:nrow(df)){
    if (df[i,"j"] %in% names(jgene_name)){
      jgene_name[df[i,"j"]] <- jgene_name[df[i,"j"]] + df[i,"count"]
    } else{
      jgene_name[df[i,"j"]] <- df[i,"count"]
    }
  }
  return(jgene_name)
}
VGeneMatrix <- data.frame()
JGeneMatrix <- data.frame()
matrix_V <- function(df,V,sample_name){
  for (i in 1:nrow(df)){
    if (df[i,"v"] %in% colnames(V)){
      V[is.na(V)] <- 0
      V[sample_name[n],df[i,"v"]] <- V[sample_name[n],df[i,"v"]] + df[i,"count"]
    } else{
      V[sample_name[n],df[i,"v"]] <- df[i,"count"]
    }
  }
  return(V)
}
matrix_J <- function(df,J,sample_name){
  for (i in 1:nrow(df)){
    if (df[i,"j"] %in% colnames(J)){
      J[is.na(J)] <- 0
      J[sample_name[n],df[i,"j"]] <- J[sample_name[n],df[i,"j"]] + df[i,"count"]
    } else{
      J[sample_name[n],df[i,"j"]] <- df[i,"count"]
    }
  }
  return(J)
}

for (n in 1:length(sample_name)){
  df <- read.csv(paste0("./results/TRB/VDJconvert/",sample_name[n],".txt"),sep="\t",stringsAsFactors = F)
  # assign(paste0(sample_name[n],"_Vgene"),vgene_matrix(df))
  # assign(paste0(sample_name[n],"_Jgene"),jgene_matrix(df)) 每个样本的表达情况
  VGeneMatrix <- matrix_V(df,VGeneMatrix,sample_name)
  JGeneMatrix <- matrix_J(df,JGeneMatrix,sample_name)
}
write.table(VGeneMatrix,file = paste0(outputpath,"matrix_V.txt"),sep="\t",quote = F)
write.table(JGeneMatrix,file = paste0(outputpath,"matrix_J.txt"),sep="\t",quote = F)

#########DESeq2#######

library(DESeq2)
library(ggpubr)
library(ggthemes)
mycounts <- t(JGeneMatrix)
mycounts <- mycounts[rowSums(mycounts)!=0,]
mymetas <- mymetadata[,2:3]
colnames(mycounts) == mymetas$sample_id
mymetas$sample_type <- factor(mymetas$sample_type)

dds <- DESeqDataSetFromMatrix(countData = mycounts, colData = mymetas,desig=~sample_type)
dds <- DESeq(dds)
res <- results(dds,contrast = c("sample_type","healthy","hematological_cancer"))
res <- data.frame(res)
DEG <- res[order(res$padj),] 


DEG$change =as.factor(ifelse(DEG$pvalue < 0.05 & DEG$log2FoldChange > 0.5,'UP',
                             
                             ifelse(DEG$pvalue < 0.05 & DEG$log2FoldChange < -0.5,'DOWN',
                                    
                                    "NOT")))
DEG$logp = -log10(DEG$pvalue)


table(DEG$change)

DEG <- DEG[order(DEG$pvalue),]
up_genes <- head(row.names(DEG)[which(DEG$change == "UP")],10)
down_genes <- head(row.names(DEG)[which(DEG$change =="DOWN")],10)
DEG_top <- c(as.character(up_genes),as.character(down_genes))
DEG$label  <- ""
DEG$label[match(DEG_top,row.names(DEG))] <- DEG_top

ggscatter(DEG, x = "log2FoldChange", y = "logp",
          color = "change",
          palette = c("#2f5688","#BBBBBB","#CC0000"),
          size = 6,
          label = DEG$label,
          font.label = 10,
          repel = T,
          xlab = "log2FoldChange",
          ylab ="-log10(p value)") + theme_base() +
  geom_hline(yintercept = -log10(0.05),linetype="dashed")+
  geom_vline(xintercept = c(-0.5,0.5),linetype="dashed")

ggsave("./results/J_gene_valcano.png",dpi=600)
dev.off()
V_gene_valcano

J_gene_valcano

你可能感兴趣的:(免疫组库分析实战/mixcr+vdjtools+R实现)