CUT&Tag数据分析笔记（2）

在开始笔记之前我只想吐槽一下，官网里这一部分的代码错的离谱！！！各种报错报的我怀疑人生。。。

第四节比对结果过滤和文件格式转换

（一）利用比对质量过滤比对上的【可选步骤】

有些项目可能需要对比对质量分数进行更严格的筛选。这里通过实例详细讨论了bowtie是如何分配质量分数的。

MAPQ(x) = -10 * log10(P(x is mapped wrongly)) = -10 * log10(p)

which ranges from 0 to 37, 40 or 42.

samtools view -q minQualityScore将消除所有低于用户定义的minQualityScore的比对结果。

#!/bin/bash
projPath="/gpfs/home/fangy04/cut_tag/alignment/"
minQualityScore=2

cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
samtools view -q $minQualityScore ${projPath}/${i}_bowtie2.sam >${projPath}/filtered_sam/${i}_bowtie2.qualityScore$minQualityScore.sam
done

（二）文件格式转换【必需步骤】

这一部分是为peak calling和可视化做准备所必需的，其中需要完成一些过滤和文件格式转换。

#!/bin/bash

projPath="/gpfs/home/fangy04/cut_tag/"

cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
## Filter and keep the mapped read pairs
samtools view -bS -F 0x04 $projPath/alignment/${name}_bowtie2.sam > $projPath/bam_files/${name}_bowtie2.mapped.bam

## Convert into bed file format
bedtools bamtobed -bedpe -i $projPath/bam_files/${name}_bowtie2.mapped.bam > $projPath/bed_files/${name}_bowtie2.bed

## Keep the read pairs that are on the same chromosome and fragment length less than 1000bp.
awk '$1==$4 && $6-$2 < 1000 {print $0}' $projPath/bed_files/${name}_bowtie2.bed > $projPath/bed_files/${name}_bowtie2.clean.bed

## Only extract the fragment related columns
cut -f 1,2,6 $projPath/bed_files/${name}_bowtie2.clean.bed | sort -k1,1 -k2,2n -k3,3n > $projPath/bed_files/${name}_bowtie2.fragments.bed
done

（三）评估重复样品的再现性（接上一篇笔记）

为了研究重复样品之间和跨条件下的重现性，基因组被分割成500 bp的bin，并在重复数据集之间计算每个bin中log2转换值的Pearson相关性。多个重复和IgG对照数据集显示在一个层次聚类相关矩阵中。

#!/bin/bash
projPath="/gpfs/home/fangy04/cut_tag"
binLen=500

cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
awk -v w=$binLen '{print $1, int(($2 + $3)/(2*w))*w + w/2}' $projPath/bed_files/${i}_bowtie2.fragments.bed | sort -k1,1V -k2,2n | uniq -c | awk -v OFS="\t" '{print $2, $3, $1}'| sort -k1,1V -k2,2n > $projPath/bed_files/${i}_bowtie2.fragmentsCount.bin$binLen.bed 
done

在R中可视化：

> library(magrittr)
> library(dplyr)
> library(tidyverse)
> library(corrplot)
> sampleList = c("SH_Hs_K27m3_rep1", "SH_Hs_K27m3_rep2", "SH_Hs_K4m3_rep1", "SH_Hs_K4m3_rep2", "SH_Hs_IgG_rep1","SH_Hs_IgG_rep2_R1","SH_Hs_IgG_rep2_R2")
histList = c("K27m3", "K4m3", "IgG")

> reprod = c()
> fragCount = NULL
> for(hist in sampleList){
  
  if(is.null(fragCount)){
    
    fragCount = read.table(paste0(hist, "_bowtie2.fragmentsCount.bin500.bed"), header = FALSE) 
    colnames(fragCount) = c("chrom", "bin", hist)
    
  }else{
    
    fragCountTmp = read.table(paste0(hist, "_bowtie2.fragmentsCount.bin500.bed"), header = FALSE)
    colnames(fragCountTmp) = c("chrom", "bin", hist)
    fragCount = full_join(fragCount, fragCountTmp, by = c("chrom", "bin"))
    
  }
}

> M = cor(fragCount %>% select(-c("chrom", "bin")) %>% log2(), use = "complete.obs") 

> corrplot(M, method = "color", outline = T, addgrid.col = "darkgray", order="hclust", addrect = 3, rect.col = "black", rect.lwd = 3,cl.pos = "b", tl.col = "indianred4", tl.cex = 1, cl.cex = 1, addCoef.col = "black", number.digits = 2, number.cex = 1, col = colorRampPalette(c("midnightblue","white","darkred"))(100))

第五节 Spike-in校准

这部分是可选的，但根据你的实验方法来决定是否进行这一步。我们已经在第3.1.2节展示了spike-in基因组的比对方式，并在第3.2.2节展示了spike-in比对总结。

潜在的假设是，在一系列样本中，原基因组比对上的片段与大肠杆菌基因组比对上的片段的比例是相同的，每个样本使用相同数量的细胞。由于这个假设，我们没有在实验之间或纯化的pA-Tn5批次之间进行标准化处理，因为这些批次的大肠杆菌DNA携带量可能非常不同。使用常数C来避免归一化数据中的小分数，我们定义一个scaling factor S为：

S = C / (fragments mapped to E. coli genome)

然后计算Normalized coverage:

Normalized coverage = (primary_genome_coverage) * S

这个常数是一个任意的乘数，通常是10000。产生的文件相对较小，如基因组coverage bedgraph文件。

#!/bin/bash

projPath="/gpfs/home/fangy04/cut_tag"
chromSize="/gpfs/home/fangy04/cut_tag/hg38.chrom.size"

cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do

seqDepthDouble=`samtools view -F 0x04 $projPath/alignment/${i}_spikein.sam | wc -l`
seqDepth=$((seqDepthDouble/2))

if [[ "$seqDepth" -gt "1" ]]; then
    scale_factor=`echo "10000 / $seqDepth" | bc -l`
    echo "Scaling factor for $histName is: $scale_factor!"
    bedtools genomecov -bg -scale $scale_factor -i $projPath/bed_files/${i}_bowtie2.fragments.bed -g $chromSize > $projPath/bedgraph/${i}_bowtie2.fragments.normalized.bedgraph
fi

done

（一）Scaling factor

> library(magrittr)
> library(dplyr)
> library(tidyverse)
> library(corrplot)
> sampleList = c("SH_Hs_K27m3_rep1", "SH_Hs_K27m3_rep2", "SH_Hs_K4m3_rep1", "SH_Hs_K4m3_rep2", "SH_Hs_IgG_rep1","SH_Hs_IgG_rep2_R1","SH_Hs_IgG_rep2_R2")
> histList = c("K27m3", "K4m3", "IgG")

> scaleFactor = c()
> multiplier = 10000
> for(hist in sampleList){
  spikeDepth = read.table(paste0(hist, "_bowtie2_spikeIn.seqDepth"), header = FALSE, fill = TRUE)$V1[1]
  
  histInfo = strsplit(hist, "_")[[1]]
  scaleFactor = data.frame(scaleFactor = multiplier/spikeDepth, Histone = histInfo[3], Replicate = histInfo[4])  %>% rbind(scaleFactor, .)
}
> scaleFactor$Histone = factor(scaleFactor$Histone, levels = histList)
> combine_scaleFactor <- left_join(alignDupSummary, scaleFactor, by = c("Histone", "Replicate"))
> combine_scaleFactor
Histone Replicate SequencingDepth MappedFragNum_hg38 AlignmentRate_hg38 MappedFragNum_spikeIn AlignmentRate_spikeIn DuplicationRate
1   K27m3      rep1         2984630            2860326             95.96%                   235                 0.01%         4.8544%
2   K27m3      rep2         2702260            2607267             96.57%                   487                 0.02%         1.0424%
3    K4m3      rep1         1581710            1494319             94.86%                   375                 0.02%         6.5808%
4    K4m3      rep2         1885056            1742643             92.85%                  4441                 0.24%         2.6899%
5     IgG      rep1         2127635            1749023             82.73%                 75767                 3.56%        81.4229%
6     IgG      rep2         2192908            1994753             91.17%                 79137                 3.61%        34.2772%
7     IgG      rep2         2192908            1994753             91.17%                 79137                 3.61%        34.2772%
8     IgG      rep2         1168663            1076138             92.16%                 42122                  3.6%        39.6811%
9     IgG      rep2         1168663            1076138             92.16%                 42122                  3.6%        39.6811%
  EstimatedLibrarySize UniqueFragNum scaleFactor
1             28538043     2725126.0  42.5531915
2            124296273     2582370.8  20.5338809
3             10894128     1401681.3  26.6666667
4             31948293     1703223.5   2.2517451
5               328542      326994.5   0.1319836
6              2202634     1313921.7   0.1263631
7              2202634     1313921.7   0.2374056
8               967425      649647.8   0.1263631
9               967425      649647.8   0.2374056

可视化：

## Generate sequencing depth boxplot
> fig6A = scaleFactor %>% ggplot(aes(x = Histone, y = scaleFactor, fill = Histone)) +
  geom_boxplot() +
  geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 20) +
  ylab("Spike-in Scalling Factor") +
  xlab("")

> normDepth = inner_join(scaleFactor, alignResult, by = c("Histone", "Replicate")) %>% mutate(normDepth = MappedFragNum_hg38 * scaleFactor)

> fig6B = normDepth %>% ggplot(aes(x = Histone, y = normDepth, fill = Histone)) +
  geom_boxplot() +
  geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 20) +
  ylab("Normalization Fragment Count") +
  xlab("") + 
  coord_cartesian(ylim = c(1000000, 130000000))
> ggarrange(fig6A, fig6B, ncol = 2, common.legend = TRUE, legend="bottom")

第六节 Peak Calling

（一）SEACR

对CUT&RUN的稀疏富集分析SEACR包的设计用于从非常低的背景(即没有read覆盖的区域)的染色质中富集区域并call peaks，这是典型的CUT&Tag染色质分析实验。SEACR需要来自paired-end测序的bedGraph文件作为输入，并将峰定义为不与IgG对照数据集中描绘的背景信号块重叠的连续碱基对覆盖块。SEACR既能有效地从因子结合位点calling窄峰，也能calling出某些组蛋白修饰特征的宽域。该方法的描述发表在2019年的Meers et al. 2019上，用户手册可在github上查阅。由于我们已经使用大肠杆菌read count对片段计数进行了规范化，所以我们将SEACR的规范化选项设置为“non”。否则，建议使用“norm”。

首先要先下载seacr.sh和Rscript，下载地址：https://github.com/FredHutch/SEACR/

另外，你需要把seacr.sh里的一部分修改掉，换成你的文件所在目录：

第三，你还要保证你的电脑里安装了R和bedtools，因为seacr.sh脚本要调用它们。

然后写脚本（这里脚本我做了改动，因为官网的代码实在是。。。无力吐槽了）：

#!/bin/bash

projPath="/gpfs/home/fangy04/cut_tag"
seacr="/gpfs/home/fangy04/cut_tag/seacr.sh"

cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
bash $seacr $projPath/bedgraph/${i}_bowtie2.fragments.normalized.bedgraph \
     $projPath/bedgraph/SH_Hs_IgG_rep1_bowtie2.fragments.normalized.bedgraph \ #这里和官网不一样，因为如果按照官网的代码是报错的，我查了seacr的官网，这里应该是IgG的bedgraph文件，所以我直接用的IgG_rep1的bedgraph文件
     non stringent $projPath/peak_calling/${i}_seacr_control.peaks

bash $seacr $projPath/bedgraph/${i}_bowtie2.fragments.normalized.bedgraph 0.01 non stringent $projPath/peak_calling/${i}_seacr_top0.01.peaks

done

然后生成这些文件：

$ ll
total 33024
-rw------- 1 fangy04 fangy04   163240 Dec 24 10:15 SH_Hs_IgG_rep1_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04  8463714 Dec 24 10:16 SH_Hs_IgG_rep2_R1_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04   620497 Dec 24 10:16 SH_Hs_IgG_rep2_R1_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04  2543921 Dec 24 10:17 SH_Hs_IgG_rep2_R2_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04   315210 Dec 24 10:17 SH_Hs_IgG_rep2_R2_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 11751678 Dec 24 10:17 SH_Hs_K27m3_rep1_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04   381246 Dec 24 10:18 SH_Hs_K27m3_rep1_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04  7449627 Dec 24 10:19 SH_Hs_K27m3_rep2_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04   604217 Dec 24 10:19 SH_Hs_K27m3_rep2_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04   329727 Dec 24 10:19 SH_Hs_K4m3_rep1_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04    58395 Dec 24 10:19 SH_Hs_K4m3_rep1_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04   516929 Dec 24 10:20 SH_Hs_K4m3_rep2_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04   234455 Dec 24 10:20 SH_Hs_K4m3_rep2_seacr_top0.01.peaks.stringent.bed

（1）Number of peaks called

然后在R里统计：

##=== R command ===## 
> sampleList = c("SH_Hs_K27m3_rep1", "SH_Hs_K27m3_rep2", "SH_Hs_K4m3_rep1", "SH_Hs_K4m3_rep2", "SH_Hs_IgG_rep1","SH_Hs_IgG_rep2_R1","SH_Hs_IgG_rep2_R2")
> peakN = c()
> peakWidth = c()
> peakType = c("control", "top0.01")
> for(hist in sampleList){
  histInfo = strsplit(hist, "_")[[1]]
  if(histInfo[3] != "IgG"){
    for(type in peakType){
      peakInfo = read.table(paste0(hist, "_seacr_", type, ".peaks.stringent.bed"), header = FALSE, fill = TRUE)  %>% mutate(width = abs(V3-V2))
      peakN = data.frame(peakN = nrow(peakInfo), peakType = type, Histone = histInfo[3], Replicate = histInfo[4]) %>% rbind(peakN, .)
      peakWidth = data.frame(width = peakInfo$width, peakType = type, Histone = histInfo[4], Replicate = histInfo[4])  %>% rbind(peakWidth, .)
    }
  }
}
> peakN %>% select(Histone, Replicate, peakType, peakN)
Histone Replicate peakType  peakN
1   K27m3      rep1  control 185890
2   K27m3      rep1  top0.01   5849
3   K27m3      rep2  control 117190
4   K27m3      rep2  top0.01   9619
5    K4m3      rep1  control   5048
6    K4m3      rep1  top0.01    880
7    K4m3      rep2  control   8187
8    K4m3      rep2  top0.01   3736

（2）在生物学重复里peaks的再现性

对重复数据集的peak calling与定义可重复峰进行比较。选取峰的前1%(按每个区块的总信号进行排序)作为高置信度点。

这一块实际上我是比较困惑的。首先就是官网的代码，如果你完全按照代码来，最后算出来的peakReprodRate（可重现峰比例）和peakreprod（可重现峰数量）都是NA，所以下面一块的代码我改动较多，同学可以去官网运行他的代码体验一下。第二就是如果想看不同生物学重复之间的再现性，是不是只需要看rep2对于rep1的再现性呢？因为我改动代码后，所有rep1的peakReprodRate都是100%，而所有rep2都是不到100%的一个数值。如果有同学也练习过这个流程欢迎讨论~

> library(GenomicRanges)
> library(magrittr)
> library(dplyr)

> histL = c("SH_Hs_K27m3", "SH_Hs_K4m3")
> repL = c("rep1","rep2")
> peakType = c("control", "top0.01")
> peakOverlap = c()
> for(type in peakType){
    for(hist in histL){
    overlap.gr = GRanges()
      for(rep in repL){
        peakInfo = read.table(paste0(hist, "_", rep, "_seacr_", type, ".peaks.stringent.bed"), header = FALSE, fill = TRUE)
        peakInfo.gr = GRanges(peakInfo$V1, IRanges(start = peakInfo$V2, end = peakInfo$V3), strand = "*")
        if(length(overlap.gr) >0){
          overlap.gr = overlap.gr[findOverlaps(overlap.gr, peakInfo.gr)@from]
        }else{
          overlap.gr = peakInfo.gr
      }
      peakOverlap = data.frame(peakReprod = length(overlap.gr), Histone = hist, Replicate = rep, peakType = type) %>% rbind(peakOverlap, .)     
    }
  }
}

> peakOverlap = peakOverlap[c(1,5,2,6,3,7,4,8),]#必须
> peakReprod = data.frame(peakOverlap[,c(2,3,4,1)], peakN = peakN$peakN) %>% mutate(peakReprodRate = peakReprod/peakN * 100)
> peakReprod

Histone Replicate peakType peakReprod  peakN peakReprodRate
1 SH_Hs_K27m3      rep1  control     185890 185890      100.00000
2 SH_Hs_K27m3      rep1  top0.01       5849   5849      100.00000
3 SH_Hs_K27m3      rep2  control     106767 117190       91.10590
4 SH_Hs_K27m3      rep2  top0.01       5271   9619       54.79780
5  SH_Hs_K4m3      rep1  control       5048   5048      100.00000
6  SH_Hs_K4m3      rep1  top0.01        880    880      100.00000
7  SH_Hs_K4m3      rep2  control       5046   8187       61.63430
8  SH_Hs_K4m3      rep2  top0.01        875   3736       23.42077

（3）FRagment proportion in Peaks regions (FRiPs)

我们计算峰中的reads (FRiPs)作为信噪比的度量，并将其与IgG数据集中的FRiPs进行对比以作说明。虽然CUT&Tag的测序深度通常只有1- 500万reads，但该方法的低背景导致高FRiP分数。

> library(chromVAR)
> library(GenomicRanges)
> library(magrittr)
> library(dplyr)
> bamDir = paste0(projPath, "F:/cut_tag/bam_files")#bam文件的目录
> inPeakData = c()
> histL = c("SH_Hs_K27m3", "SH_Hs_K4m3")
> repL = c("rep1","rep2")
## overlap with bam file to get count
> for(hist in histL){
    for(rep in repL){
      peakRes = read.table(paste0(hist, "_", rep, "_seacr_control.peaks.stringent.bed"), header = FALSE, fill = TRUE)
      peak.gr = GRanges(seqnames = peakRes$V1, IRanges(start = peakRes$V2, end = peakRes$V3), strand = "*")
      bamFile = paste0(bamDir, "/", hist, "_", rep, "_bowtie2.mapped.bam")
      fragment_counts <- getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
      inPeakN = counts(fragment_counts)[,1] %>% sum
      inPeakData = rbind(inPeakData, data.frame(inPeakN = inPeakN, Histone = hist, Replicate = rep))
    }
  }

> frip = data.frame(alignResult[c(1:4),]) %>% mutate(FragInPeakNum = inPeakData$inPeakN) %>% mutate(FRiPs = inPeakN/MappedFragNum_hg38 * 100)

（4）可视化peak number, peak width, peak reproducibility and FRiPs.

> library(ggplot2)
> library(ggpubr)
> library(ggridges)
> library(viridis)
> fig7A = peakN %>% ggplot(aes(x = Histone, y = peakN, fill = Histone)) +
  geom_boxplot() +
  geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
  facet_grid(~peakType) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 18) +
  ggpubr::rotate_x_text(angle = 30) +
  ylab("Number of Peaks") +
  xlab("")

> fig7B = peakWidth %>% ggplot(aes(x = Histone, y = width, fill = Histone)) +
  geom_violin() +
  facet_grid(Replicate~peakType) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  scale_y_continuous(trans = "log", breaks = c(400, 3000, 22000)) +
  theme_bw(base_size = 18) +
  ggpubr::rotate_x_text(angle = 30) +
  ylab("Width of Peaks") +
  xlab("")

> fig7C = peakReprod %>% ggplot(aes(x = Histone, y = peakReprodRate, fill = Histone, label = round(peakReprodRate, 2))) +
  geom_bar(stat = "identity") +
  geom_text(vjust = 0.1) +
  facet_grid(Replicate~peakType) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 18) +
  ggpubr::rotate_x_text(angle = 30) +
  ylab("% of Peaks Reproduced") +
  xlab("")


> fig7D = frip %>% ggplot(aes(x = Histone, y = FRiPs, fill = Histone, label = round(FRiPs, 2))) +
  geom_boxplot() +
  geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 18) +
  ylab("% of Fragments in Peaks") +
  xlab("")

> ggarrange(fig7A, fig7B, fig7C, fig7D, ncol = 2, nrow=2, common.legend = TRUE, legend="bottom")