# 准备:
测序数据:fastq格式;RBFOX2 HepG2测序数据 - RBFOX2
STAR的基因组索引:UCSC下载fasta格式
STAR的重复元件索引:RepBase下载fasta格式
barcodes文件:fasta格式
- PE: yeolabbarcodes_20170101.fasta
- SE: a_adapters.fasta 或者 InvRNA*_adapters.fasta
chrom.sizes文件:UCSC下载,tab分割,第一列染色体名,第二列染色体长度,hg19 chrom.sizes
# 流程概述:
eclipdemux用于PE测序的样本拆分和提取UMI;umi_tools用于SE测序的UMI提取
cutadapt修剪adapters
STAR比对到重复元件和筛选
比对筛选之后的测序数据到基因组
去除PCR产生的重复:SE测序使用umi_tools,PE测序使用自定义脚本barcodecollapsepe.py
(paired-end only) Merges multiple inline barcodes and filters R1 (uses only R2 for peak calling)
Calls enriched peak regions (peak clusters) with CLIPPER
Uses size-matched input sample to normalize and calculate fold-change enrichment within enriched peak regions with custom perl scripts (overlap_peakfi_with_bam_PE.pl, peakscompress.pl)
## 1. eclipdemux用于PE测序的样本拆分和提取UMI;umi_tools用于SE测序的UMI提取
# SE: extract the UMI from each read with umi_tools.
# --bc-pattern NNNNNNNNNN declares a 10-nt UMI; --log receives the run metrics.
# Every option sits on its own continuation line: a backslash followed by a
# space in the middle of a line escapes the space instead of continuing the
# command, which corrupts the option that follows it.
umi_tools extract \
  --random-seed 1 \
  --bc-pattern NNNNNNNNNN \
  --log EXAMPLE_SE.rep1_clip.---.--.metrics \
  --stdin file_R1.fastq.gz \
  --stdout EXAMPLE_SE.rep1.umi.r1.fq
# PE: demultiplex the sample by inline barcode and extract the UMI.
# The two expected barcode IDs (C01, D8f) are looked up in the barcodes FASTA.
# NOTE(review): the metrics file is named rep1_clip while --newname is
# rep2_clip (and all downstream files use rep2_clip) — confirm this is intended.
eclipdemux \
  --metrics EXAMPLE_PE.rep1_clip.---.--.metrics \
  --expectedbarcodeida C01 \
  --expectedbarcodeidb D8f \
  --fastq_1 file_R1.fastq.gz \
  --fastq_2 file_R2.fastq.gz \
  --newname rep2_clip \
  --dataset EXAMPLE_PE \
  --barcodesfile yeolabbarcodes_20170101.fasta \
  --length 5
## 2. Cutadapt 去除adapters
因为可能发生两次adapter连接事件,所以进行两次Cutadapt剪切
Cutadapt round 1:
# Cutadapt round 1 (paired-end): first adapter-trimming pass.
#   -a / -g : 3' / 5' adapters searched in read 1
#   -A      : 3' adapters searched in read 2
#   -O 1    : minimum adapter overlap of 1 nt
#   -m 18   : discard reads shorter than 18 nt after trimming
#   -o / -p : trimmed read 1 / read 2 output files
# The two positional arguments at the end are the read 1 and read 2 inputs.
cutadapt \
  -f fastq \
  --match-read-wildcards \
  --times 1 \
  -e 0.1 \
  -O 1 \
  --quality-cutoff 6 \
  -m 18 \
  -a NNNNNAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
  -g CTTCCGATCTACAAGTT \
  -g CTTCCGATCTTGGTCCT \
  -A AACTTGTAGATCGGA \
  -A AGGACCAAGATCGGA \
  -A ACTTGTAGATCGGAA \
  -A GGACCAAGATCGGAA \
  -A CTTGTAGATCGGAAG \
  -A GACCAAGATCGGAAG \
  -A TTGTAGATCGGAAGA \
  -A ACCAAGATCGGAAGA \
  -A TGTAGATCGGAAGAG \
  -A CCAAGATCGGAAGAG \
  -A GTAGATCGGAAGAGC \
  -A CAAGATCGGAAGAGC \
  -A TAGATCGGAAGAGCG \
  -A AAGATCGGAAGAGCG \
  -A AGATCGGAAGAGCGT \
  -A GATCGGAAGAGCGTC \
  -A ATCGGAAGAGCGTCG \
  -A TCGGAAGAGCGTCGT \
  -A CGGAAGAGCGTCGTG \
  -A GGAAGAGCGTCGTGT \
  -o EXAMPLE_PE.rep2_clip.C01.r1.fqTr.fq \
  -p EXAMPLE_PE.rep2_clip.C01.r2.fqTr.fq \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.gz \
  EXAMPLE_PE.rep2_clip.C01.r2.fq.gz
Fastqc round 1
# FastQC on both round-1 trimmed files, one invocation per file.
# The output-directory flag is a plain ASCII "-o" (an en-dash "–o" is not
# recognised as an option).
fastqc -t 2 --extract -k 7 EXAMPLE_PE.rep2_clip.C01.r1.fqTr.fq -o .
fastqc -t 2 --extract -k 7 EXAMPLE_PE.rep2_clip.C01.r2.fqTr.fq -o .
Cutadapt round 2
# Cutadapt round 2 (paired-end): second trimming pass over the round-1 output.
# Only the read 2 3' adapters (-A) are searched, with a stricter minimum
# overlap (-O 5) than in round 1.
#   -o / -p : trimmed read 1 / read 2 output files
# The two positional arguments at the end are the round-1 read 1 / read 2 files.
cutadapt \
  -f fastq \
  --match-read-wildcards \
  --times 1 \
  -e 0.1 \
  -O 5 \
  --quality-cutoff 6 \
  -m 18 \
  -A AACTTGTAGATCGGA \
  -A AGGACCAAGATCGGA \
  -A ACTTGTAGATCGGAA \
  -A GGACCAAGATCGGAA \
  -A CTTGTAGATCGGAAG \
  -A GACCAAGATCGGAAG \
  -A TTGTAGATCGGAAGA \
  -A ACCAAGATCGGAAGA \
  -A TGTAGATCGGAAGAG \
  -A CCAAGATCGGAAGAG \
  -A GTAGATCGGAAGAGC \
  -A CAAGATCGGAAGAGC \
  -A TAGATCGGAAGAGCG \
  -A AAGATCGGAAGAGCG \
  -A AGATCGGAAGAGCGT \
  -A GATCGGAAGAGCGTC \
  -A ATCGGAAGAGCGTCG \
  -A TCGGAAGAGCGTCGT \
  -A CGGAAGAGCGTCGTG \
  -A GGAAGAGCGTCGTGT \
  -o EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.fq \
  -p EXAMPLE_PE.rep2_clip.C01.r2.fqTrTr.fq \
  EXAMPLE_PE.rep2_clip.C01.r1.fqTr.fq \
  EXAMPLE_PE.rep2_clip.C01.r2.fqTr.fq
Fastqc round 2
# FastQC on both round-2 trimmed files, one invocation per file.
# The output-directory flag is a plain ASCII "-o" (an en-dash "–o" is not
# recognised as an option).
fastqc -t 2 --extract -k 7 EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.fq -o .
fastqc -t 2 --extract -k 7 EXAMPLE_PE.rep2_clip.C01.r2.fqTrTr.fq -o .
Fastq-sort
# Sort each twice-trimmed FASTQ by read ID (--id); the sorted stream is
# redirected into a new *.sorted.fq file for read 1 and read 2 respectively.
fastq-sort --id \
  EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.fq \
  > EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.sorted.fq
fastq-sort --id \
  EXAMPLE_PE.rep2_clip.C01.r2.fqTrTr.fq \
  > EXAMPLE_PE.rep2_clip.C01.r2.fqTrTr.sorted.fq
## 3. STAR比对到重复元件和筛选
# STAR rmRep: align the sorted, trimmed read pairs against the repeat-element
# index (homo_sapiens_repbase_v2).
# --outReadsUnmapped Fastx writes the reads that did NOT map to repeats to
# <prefix>Unmapped.out.mate1/mate2; those files feed the genome-mapping step.
# --outFileNamePrefix and --outSAMtype are separate options and each needs its
# own continuation line.
STAR \
  --runMode alignReads \
  --runThreadN 8 \
  --genomeDir homo_sapiens_repbase_v2 \
  --genomeLoad NoSharedMemory \
  --alignEndsType EndToEnd \
  --outSAMunmapped Within \
  --outFilterMultimapNmax 30 \
  --outFilterMultimapScoreRange 1 \
  --outFileNamePrefix EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.sorted.STAR \
  --outSAMtype BAM Unsorted \
  --outFilterType BySJout \
  --outBAMcompression 10 \
  --outReadsUnmapped Fastx \
  --outFilterScoreMin 10 \
  --outSAMattrRGline ID:foo \
  --outSAMattributes All \
  --outSAMmode Full \
  --outStd Log \
  --readFilesIn EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.sorted.fq EXAMPLE_PE.rep2_clip.C01.r2.fqTrTr.sorted.fq
# Rename the STAR repeat-mapping outputs: the aligned BAM and the two
# unmapped-mate FASTQ files. Each mv needs source and destination in the same
# command, so the long first rename uses an explicit line continuation.
mv EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.sorted.STARAligned.out.bam \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.repeat-mapped.bam
mv EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.sorted.STARUnmapped.out.mate1 \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.repeat-unmapped.fq
mv EXAMPLE_PE.rep2_clip.C01.r1.fqTrTr.sorted.STARUnmapped.out.mate2 \
  EXAMPLE_PE.rep2_clip.C01.r2.fq.repeat-unmapped.fq
# Fastq-sort: sort the repeat-unmapped reads by read ID, one command per mate.
# (Fused on a single line, the second command's words become extra arguments
# of the first and only the last redirection receives the output.)
fastq-sort --id EXAMPLE_PE.rep2_clip.C01.r1.fq.repeat-unmapped.fq > EXAMPLE_PE.rep2_clip.C01.r1.fq.repeat-unmapped.sorted.fq
fastq-sort --id EXAMPLE_PE.rep2_clip.C01.r2.fq.repeat-unmapped.fq > EXAMPLE_PE.rep2_clip.C01.r2.fq.repeat-unmapped.sorted.fq
## 4. 比对筛选之后的测序数据到基因组
# STAR genome mapping: takes the repeat-unmapped, sorted reads from STAR rmRep
# and maps them to the human genome index (/stage/hg19_star_sjdb).
# --outFilterMultimapNmax 1 keeps only reads that map to a single location.
# Both mate files are passed to --readFilesIn as separate arguments, each on
# its own continuation line.
STAR \
  --runMode alignReads \
  --runThreadN 8 \
  --genomeDir /stage/hg19_star_sjdb \
  --genomeLoad NoSharedMemory \
  --readFilesIn \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.repeat-unmapped.sorted.fq \
  EXAMPLE_PE.rep2_clip.C01.r2.fq.repeat-unmapped.sorted.fq \
  --outSAMunmapped Within \
  --outFilterMultimapNmax 1 \
  --outFilterMultimapScoreRange 1 \
  --outFileNamePrefix EXAMPLE_PE.rep2_clip.C01.r1.fq.repeat-unmapped.sorted.STAR \
  --outSAMattributes All \
  --outSAMtype BAM Unsorted \
  --outFilterType BySJout \
  --outReadsUnmapped Fastx \
  --outFilterScoreMin 10 \
  --outSAMattrRGline ID:foo \
  --outStd Log \
  --alignEndsType EndToEnd \
  --outBAMcompression 10 \
  --outSAMmode Full
# Re-name BAM: rename genome-mapped outputs
mv EXAMPLE_PE.rep2_clip.C01.r1.fq.repeat-unmapped.sorted.STARAligned.out.bam EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mapped.bam
# Name sort BAM: sort output from STAR by name (-n) to ensure read pairs are adjacent.
# The command must sit on its own line, outside the comment, with its input
# BAM as the final argument.
samtools sort -n -o EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.bam \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mapped.bam
## 5. 去除PCR产生的重复:SE测序使用umi_tools,PE测序使用自定义脚本barcodecollapsepe.py
# Barcode_collapse_pe (PE): takes output from STAR genome mapping
# (the name-sorted BAM) and removes PCR duplicates.
#   -b : input BAM
#   -o : de-duplicated output BAM
#   -m : metrics file
barcodecollapsepe.py \
  -o EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDup.bam \
  -m EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDup.metrics \
  -b EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.bam
# Position sort BAM: Takes output from barcode collapse PE (or from SE namesort bam).
# Sorts resulting bam file for use downstream.
# File names contain no spaces ("genome-mappedSo"); a stray space would split
# each name into two separate arguments.
samtools sort -o EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.bam \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDup.bam
# Barcode_collapse_se (SE): takes output from STAR genome mapping. Use
# umi_tools dedup to identify the extracted random-mer from the previous step
# and perform PCR duplicate removal.
#   -I : input BAM (short form of --stdin)
#   -S : de-duplicated output BAM (short form of --stdout)
umi_tools dedup \
  --random-seed 1 \
  -I EXAMPLE_SE.rep1_clip.umi.r1.fq.genome-mappedSoSo.bam \
  --method unique \
  --output-stats EXAMPLE_SE.rep1_clip.umi.r1.fq.genome-mappedSoSo.txt \
  -S EXAMPLE_SE.rep1_clip.umi.r1.fq.genome-mappedSoSo.rmDup.bam
# Samtools index: build the BAM index for the position-sorted, de-duplicated
# BAM so that downstream tools can use it.
samtools index \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.bam
## 6. (paired-end only) Merges multiple inline barcodes and filters R1 (uses only R2 for peak calling)
# Samtools merge (PE only): Takes inputs from multiple final bam files. Merges
# the two technical replicates (inline barcodes C01 and D8f) for further
# downstream analysis. First argument is the merged output, the rest are inputs.
samtools merge EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.bam \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.bam \
  EXAMPLE_PE.rep2_clip.D8f.r1.fq.genome-mappedSo.rmDupSo.bam
# Samtools index: Takes the merged bam, makes bam index for use downstream.
samtools index EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.bam
## 7.Calls enriched peak regions (peak clusters) with CLIPPER
# Samtools view (PE only): Takes output from samtools merge. Only outputs the
# second read in each pair for use with a single stranded peak caller. This is
# the final bam file to perform analysis on.
# -f 128: keep only read 2 of each read pair; -b: write BAM output.
samtools view -f 128 -b \
  -o EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.bam \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.bam
# Make normalized read density bigwig files (one per strand) from the final
# read-2 BAM. Use --direction f for SE clip as reads are not reversed.
# --genome takes the chrom.sizes file.
makebigwigfiles \
  --bw_pos EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.norm.pos.bw \
  --bw_neg EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.norm.neg.bw \
  --bam EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.bam \
  --genome hg19.chrom.sizes \
  --direction r
# Clipper: Takes results from samtools view (the read-2 BAM). Calls peaks on
# that file and writes the peak clusters as a BED file (--outfile).
clipper \
  --species hg19 \
  --bam EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.bam \
  --save-pickle \
  --outfile EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.peakClusters.bed
## 8. Uses size-matched input sample to normalize and calculate fold-change enrichment within enriched peak regions with custom perl scripts (overlap_peakfi_with_bam_PE.pl, peakscompress.pl)
# Input normalization: Compares the number of reads within the IP sample to
# the number of reads within the size-matched INPUT sample across
# Clipper-called peak clusters. This step is performed both within this
# pipeline as well as within the merge_peaks pipeline using the same perl scripts.
# First, count mapped reads (-c: count, -F 4: exclude unmapped) in the IP and
# INPUT BAMs; each count goes to its own text file.
samtools view -cF 4 EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.bam > ip_mapped_readnum.txt
samtools view -cF 4 EXAMPLE_PE.rep2_input.NIL.r1.fq.genome-mappedSo.rmDupSo.r2.bam > input_mapped_readnum.txt
# Normalize the CLIPPER peaks against the size-matched input.
# Positional arguments, in order: IP BAM, INPUT BAM, peak-cluster BED,
# IP mapped-read count file, INPUT mapped-read count file, output BED.
overlap_peakfi_with_bam_PE.pl \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.bam \
  EXAMPLE_PE.rep2_input.NIL.r1.fq.genome-mappedSo.rmDupSo.r2.bam \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.peakClusters.bed \
  ip_mapped_readnum.txt \
  input_mapped_readnum.txt \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.peakClusters.normed.bed
# Compress the input-normalized peaks.
# Positional arguments: normalized BED (input), compressed BED (output).
perl compress_l2foldenrpeakfi_for_replicate_overlapping_bedformat.pl \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.peakClusters.normed.bed \
  EXAMPLE_PE.rep2_clip.C01.r1.fq.genome-mappedSo.rmDupSo.merged.r2.peakClusters.normed.compressed.bed
原文
eCLIP pipeline
eCLIP_analysisSOP_v2.2