Zebrafish CUT&Tag analysis: a fully automated script


# Note: edit the filenames.txt file so that it lists the sample names analyzed in this experiment
## Bowtie2 alignment to the zebrafish genome
# Build the zebrafish genome index with bowtie2
# Download the genome from UCSC. Be sure to get it from UCSC, because the R annotation package used in the final annotation step expects UCSC-style naming; Ensembl genomes will not work here. Learned the hard way.
nohup wget http://hgdownload.soe.ucsc.edu/goldenPath/danRer11/bigZips/danRer11.fa.gz
# Build the index: bowtie2-build [options]* <reference_in> <bt2_index_base>
bowtie2-build --threads 8 danRer11.fa.gz danRer11_UCSC      # run with 8 threads (didn't seem to speed things up much)
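# Quick optional check: a successful build leaves six index files
ls danRer11_UCSC*.bt2   # expect .1/.2/.3/.4.bt2 plus .rev.1/.rev.2.bt2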
mkdir -p sam/bowtie2_summary

projPath="/home/lyo/WQBioinfo"
dos2unix filenames.txt

cat filenames.txt | while read i
do
 # -x must point at a bowtie2 index whose chromosome names are UCSC-style (see the note above)
 bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x /home/lyo/index/GRCz11Genome/GRCz11 -1 ${i}_1.fq.gz -2 ${i}_2.fq.gz -S ./sam/${i}.sam &> ./sam/bowtie2_summary/${i}_bowtie2.txt
done
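# Before moving on it helps to glance at the alignment rates bowtie2 reported.
# A minimal sketch over the same sample list, parsing the logs written above:
cat filenames.txt | while read i
do
 echo -n "${i}: "
 grep "overall alignment rate" ./sam/bowtie2_summary/${i}_bowtie2.txt
done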

# Align to the E. coli spike-in genome and record the spike-in sequencing depth per sample
cat filenames.txt | while read i
do
 bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x /home/lyo/index/ecoli/ecoli -1 ${i}_1.fq.gz -2 ${i}_2.fq.gz -S $projPath/sam/${i}_spikein.sam &> $projPath/sam/bowtie2_summary/${i}_bowtie2_spikein.txt
 seqDepthDouble=`samtools view -F 0x04 $projPath/sam/${i}_spikein.sam | wc -l`
 seqDepth=$((seqDepthDouble/2))   # two SAM lines per mapped pair, hence the /2
 echo $seqDepth > $projPath/sam/bowtie2_summary/${i}_bowtie2_spikeIn.seqDepth

done
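# To eyeball the spike-in depths across all samples afterwards (grep "" prefixes each line with its filename):
grep "" $projPath/sam/bowtie2_summary/*_bowtie2_spikeIn.seqDepth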


####### Assess the mapped fragment size distribution
mkdir -p ./sam/fragmentLen

cd sam
cp ../filenames.txt .   # the loops below run from sam/, so bring the sample list along
dos2unix filenames.txt

cat filenames.txt | while read i
do
 # SAM column 9 is the template length (TLEN); take its absolute value, tally each size,
 # and halve the count since every pair contributes two SAM lines
 samtools view -F 0x04 ${i}.sam | awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' | sort | uniq -c | awk -v OFS="\t" '{print $2, $1/2}' > ./fragmentLen/${i}_fragmentLen.txt

done
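# Quick sanity check on a distribution: for CUT&Tag you expect enrichment around nucleosomal
# sizes (~180 bp and multiples). A sketch for one sample, using K27ac as an example:
sort -k2,2nr ./fragmentLen/K27ac_fragmentLen.txt | head   # ten most frequent fragment sizes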

## Filter and keep the mapped paired-end reads
# Convert SAM to BAM, dropping unmapped reads (-F 0x04)
cat filenames.txt | while read i
do
 samtools view -bS -F 0x04 ${i}.sam > ${i}.bam
done
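# Optional: confirm each BAM kept a sensible number of reads
cat filenames.txt | while read i
do
 echo "== ${i} =="
 samtools flagstat ${i}.bam
done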

## Convert BAM to bigWig
mkdir -p bw
cat filenames.txt | while read i
do
 samtools sort ./${i}.bam -o ./${i}.sorted.bam
 samtools index ./${i}.sorted.bam
 bamCoverage -b ./${i}.sorted.bam -o ./bw/${i}.bw
done
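# If you want tracks roughly comparable across samples without the spike-in calibration below,
# bamCoverage can normalize during conversion. A hedged variant (the .cpm.bw names are just a suggestion):
cat filenames.txt | while read i
do
 bamCoverage -b ./${i}.sorted.bam -o ./bw/${i}.cpm.bw --normalizeUsing CPM --binSize 10 -p 8
done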


#################### Remove duplicates: only the Input is deduplicated; the other samples are not ###########################
# (run this section from $projPath, one level above sam/)
# Sort by coordinate
picard SortSam I=./sam/Input.sam O=./sam/Input.sorted.sam SORT_ORDER=coordinate
# Mark duplicates
mkdir -p sam/picard_summary   # must exist beforehand, or Picard errors out
picard MarkDuplicates \
 I=./sam/Input.sorted.sam \
 O=./sam/Input.sorted.dupMarked.sam \
 M=./sam/picard_summary/Input_picard.dupMark.txt
## Remove the duplicate reads; only the Input/IgG sample is deduplicated
picard MarkDuplicates I=./sam/Input.sorted.dupMarked.sam \
O=./sam/Input.sorted.rmDup.sam \
REMOVE_DUPLICATES=true \
M=./sam/picard_summary/Input_picard.rmDup.txt
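# Optional: peek at the duplication rate Picard recorded (the PERCENT_DUPLICATION column of the metrics block):
grep -A 2 "^## METRICS" ./sam/picard_summary/Input_picard.dupMark.txt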

# Process the deduplicated Input separately (back inside the sam/ directory)
samtools view -F 0x04 Input.sam | awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' | sort | uniq -c | awk -v OFS="\t" '{print $2, $1/2}' > ./fragmentLen/Input_fragmentLen.txt
samtools view -bS -F 0x04 Input.sorted.rmDup.sam > Input.sorted.rmDup.mapped.bam
mkdir -p bed   # first use of the bed/ directory
# bamtobed -bedpe needs mates adjacent, and Picard left the file coordinate-sorted, so re-sort by read name first
samtools sort -n Input.sorted.rmDup.mapped.bam -o Input.sorted.rmDup.nsorted.bam
bedtools bamtobed -bedpe -i Input.sorted.rmDup.nsorted.bam > ./bed/Input.sorted.rmDup.mapped.bed
awk '$1==$4 && $6-$2 < 1000 {print $0}' ./bed/Input.sorted.rmDup.mapped.bed > ./bed/Input.sorted.rmDup.mapped.clean.bed
cut -f 1,2,6 ./bed/Input.sorted.rmDup.mapped.clean.bed | sort -k1,1 -k2,2n -k3,3n > ./bed/Input.fragments.bed

# Bin fragment midpoints into 500 bp windows and count fragments per bin
binLen=500
awk -v w=$binLen '{print $1, int(($2 + $3)/(2*w))*w + w/2}' ./bed/Input.fragments.bed |\
sort -k1,1V -k2,2n |\
uniq -c |\
awk -v OFS="\t" '{print $2, $3, $1}' |\
sort -k1,1V -k2,2n  >./bed/Input.fragmentsCount.bin$binLen.bed

# The steps below prepare the input files for peak calling


#bam to bed
mkdir -p bed
cat filenames.txt | while read i
do
 bedtools bamtobed -bedpe -i ${i}.bam  > ./bed/${i}.bed
done

## Keep read pairs whose mates map to the same chromosome with fragment length < 1000 bp,
## then extract only the fragment columns (chrom, start, end)
mkdir -p bed
cat filenames.txt | while read i
do
 awk '$1==$4 && $6-$2 < 1000 {print $0}' ./bed/${i}.bed > ./bed/${i}_1000.bed 
 cut -f 1,2,6 ./bed/${i}_1000.bed | sort -k1,1 -k2,2n -k3,3n  > ./bed/${i}.fragments.bed
done

# Assess reproducibility: count fragments in genome-wide bins (compare these counts across samples/replicates)
binLen=500
cat filenames.txt | while read i
do 
awk -v w=$binLen '{print $1, int(($2 + $3)/(2*w))*w + w/2}' ./bed/${i}.fragments.bed | sort -k1,1V -k2,2n | uniq -c | awk -v OFS="\t" '{print $2, $3, $1}' | sort -k1,1V -k2,2n  >./bed/${i}.fragmentsCount.bin$binLen.bed
done
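# These bin counts are what you correlate between samples to judge reproducibility.
# A minimal sketch of a Pearson correlation on log counts over shared bins, using two of the
# samples from this project (K9ac vs K9cr) as an example:
join <(awk '{print $1":"$2, $3}' ./bed/K9ac.fragmentsCount.bin500.bed | sort -k1,1) \
     <(awk '{print $1":"$2, $3}' ./bed/K9cr.fragmentsCount.bin500.bed | sort -k1,1) |
awk '{x=log($2+1); y=log($3+1); n++; sx+=x; sy+=y; sxx+=x*x; syy+=y*y; sxy+=x*y}
     END{print "Pearson r =", (n*sxy-sx*sy)/sqrt((n*sxx-sx*sx)*(n*syy-sy*sy))}'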


# Spike-in calibration (optional)
# Notice: chromSize must be the chrom.sizes file for the right species

chromSize="/home/lyo/index/CUTtagIndex/danRer11.chrom.sizes" # danRer11.chrom.sizes was downloaded from UCSC; generating it myself failed
cat filenames.txt | while read i
do 
seqDepthDouble=`samtools view -F 0x04 ./${i}_spikein.sam | wc -l`
seqDepth=$((seqDepthDouble/2))
if [[ "$seqDepth" -gt "1" ]]; then
    
    mkdir -p $projPath/sam/bedgraph
    scale_factor=`echo "10000 / $seqDepth" | bc -l`
    echo "Scaling factor for ${i} is: $scale_factor!"
    bedtools genomecov -bg -scale $scale_factor -i $projPath/sam/bed/${i}.fragments.bed -g $chromSize > $projPath/sam/bedgraph/${i}.fragments.normalized.bedgraph
    
fi
done
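# Worked example of the scaling: a spike-in depth of 20000 pairs gives scale_factor = 10000/20000 = 0.5,
# i.e. each fragment contributes 0.5x to the bedgraph, which makes samples with different
# spike-in depths comparable:
echo "10000 / 20000" | bc -l   # prints .50000000000000000000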
# You can also skip the calibration and compute raw coverage directly; every condition here is a single sample anyway.
bedtools genomecov -bg -i /home/lyo/WQBioinfo/sam/bed/K27ac.fragments.bed -g /home/lyo/index/CUTtagIndex/danRer11.chrom.sizes > /home/lyo/WQBioinfo/sam/bedgraph/K27ac.fragments.normalized.bedgraph

### SEACR peak calling # note where the Input bedgraph goes in the argument list
mkdir -p SEACRpeak/stringent
seacr="/home/lyo/app/SEACR_1.3.sh"
cat filenames.txt | while read i
do
bash $seacr $projPath/sam/bedgraph/${i}.fragments.normalized.bedgraph \
     $projPath/sam/bedgraph/Input.fragments.normalized.bedgraph \
     non stringent $projPath/sam/SEACRpeak/stringent/${i}_seacr_control.peaks
bash $seacr $projPath/sam/bedgraph/${i}.fragments.normalized.bedgraph 0.01 non stringent $projPath/sam/SEACRpeak/stringent/${i}_seacr_top0.01.peaks
done
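# Optional QC on the calls: peak count plus the fraction of fragments in peaks (FRiP).
# A sketch for one sample (K27ac; SEACR appends .stringent.bed to the output prefix):
peakFile=$projPath/sam/SEACRpeak/stringent/K27ac_seacr_control.peaks.stringent.bed
wc -l < $peakFile   # number of peaks
total=$(wc -l < $projPath/sam/bed/K27ac.fragments.bed)
inPeak=$(bedtools intersect -a $projPath/sam/bed/K27ac.fragments.bed -b $peakFile -u | wc -l)
echo "FRiP = $(echo "$inPeak / $total" | bc -l)"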

mkdir -p SEACRpeak/relaxed
seacr="/home/lyo/app/SEACR_1.3.sh"
cat filenames.txt | while read i
do

bash $seacr $projPath/sam/bedgraph/${i}.fragments.normalized.bedgraph \
     $projPath/sam/bedgraph/Input.fragments.normalized.bedgraph \
     non relaxed $projPath/sam/SEACRpeak/relaxed/${i}_seacr_control.peaks
bash $seacr $projPath/sam/bedgraph/${i}.fragments.normalized.bedgraph 0.01 non relaxed $projPath/sam/SEACRpeak/relaxed/${i}_seacr_top0.01.peaks
done

# Heatmaps with deepTools (computeMatrix + plotHeatmap)
# I was stuck on this step for a long time; beware
# Final working version of the zebrafish gene BED file. Earlier runs kept erroring out, and it was all down to this BED file; getting the BED right is the key fix.
# Reference (Google to the rescue): http://xuchunhui.top/2020/11/05/%E4%B8%BAdeepTools%E7%9A%84computeMatrix%E5%88%9B%E5%BB%BAgene.bed%E6%96%87%E4%BB%B6/
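# If you need to rebuild gene.bed yourself, a minimal sketch following the idea in that post,
# assuming a GTF that has "gene" feature lines (genes.gtf is a hypothetical path; make sure its
# chromosome names match the UCSC-style names used in the bigWigs):
awk -v OFS="\t" '$3=="gene" {
    match($0, /gene_id "[^"]+"/);
    id = substr($0, RSTART+9, RLENGTH-10);   # the value inside gene_id "..."
    print $1, $4-1, $5, id, 0, $7            # BED6: chrom, 0-based start, end, name, score, strand
}' genes.gtf > gene.bed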
computeMatrix reference-point --referencePoint TSS -p 8 -a 3000 -b 3000 -R /home/lyo/index/GRCz11_cM_bed_gtf/gene.bed -S /home/lyo/WQBioinfo/sam/bw/K27ac.bw /home/lyo/WQBioinfo/sam/bw/K9ac.bw /home/lyo/WQBioinfo/sam/bw/K9cr.bw --skipZeros -o /home/lyo/WQBioinfo/heatmap/K27ac_K9ac_cr_geneBed_TSS.gz
# The first attempt used https://github.com/saketkc/gencode_regions/blob/master/data/GRCz11/v96/gene.bed.gz
plotHeatmap -m K27ac_K9ac_cr_geneBed_TSS.gz -o K27ac_K9ac_cr_geneBed_TSS.pdf --colorMap Blues --sortUsingSamples 1
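# Optional: the same matrix also feeds deepTools plotProfile for an average-signal view
plotProfile -m K27ac_K9ac_cr_geneBed_TSS.gz -o K27ac_K9ac_cr_geneBed_TSS.profile.pdf --perGroup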

