gatk 分析流程 2020-06-02

#!/bin/sh

#PBS -N DL_FQ
#PBS -l nodes=1:ppn=16
#PBS -o /home/omics2017/hx_du/test1.log
#PBS -e /home/omics2017/hx_du/test1.err
#PBS -q batch
#PBS -l walltime=120:00:00

wkdir=/bios-store2/hx_du/fq_td/HBB300K_200501/200501/PE

src=/home/omics2017/hx_du/program
ref=/bios-store2/hx_du/NIPT/reference
bedfile=/bios-store2/hx_du/HBB300K_200327/test20200331/all_target_segments_covered_by_probes_HBB_v1_hg19_191108105530.bed

cd ${wkdir}


samples=$(for i in $(ls *_R1*.fastq.gz);do echo "${i%%_*}";done)

for ID in $samples
do
    echo $ID


    if [ -f ${ID}*R2*.fastq.gz ];then
        echo ${ID} "R2 exist"
        java -jar $src/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads 16 $wkdir/${ID}*R1*.fastq.gz $wkdir/${ID}*R2*.fastq.gz ${ID}_1_paired.fq.gz ${ID}_1_unpaired.fq.gz ${ID}_2_paired.fq.gz ${ID}_2_unpaired.fq.gz LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36


        # PE ===========================
        $src/bwa-0.7.17/bwa mem -t 16 -M $ref/genome.fa ${ID}_1_paired.fq.gz ${ID}_2_paired.fq.gz > ${ID}_pe.sam
        $src/bwa-0.7.17/bwa mem -t 16 -M $ref/genome.fa ${ID}_1_unpaired.fq.gz > ${ID}_s1.sam
        $src/bwa-0.7.17/bwa mem -t 16 -M $ref/genome.fa ${ID}_2_unpaired.fq.gz > ${ID}_s2.sam

        picard MergeSamFiles \
            TMP_DIR=${wkdir}/tmp \
            INPUT=${ID}_pe.sam \
            INPUT=${ID}_s1.sam \
            INPUT=${ID}_s2.sam \
            OUTPUT=${ID}.bam
        rm ${ID}*.sam
        # #===========================

    else
        echo ${ID} "R2 noexist"

        java -jar $src/Trimmomatic-0.38/trimmomatic-0.38.jar SE -threads 16 $wkdir/${ID}*R1*.fastq.gz ${ID}.trimmed.fq.gz LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
        $src/bwa-0.7.17/bwa mem -t 16 -M $ref/genome.fa ${ID}.trimmed.fq.gz > ${ID}_s1.sam
        picard MergeSamFiles \
            INPUT=${ID}_s1.sam \
            OUTPUT=${ID}.bam
        rm ${ID}*.sam
    fi
    
    picard AddOrReplaceReadGroups \
        TMP_DIR=${wkdir}/tmp \
        INPUT=${ID}.bam \
        OUTPUT=${ID}.add.bam \
        RGID=group1 RGLB=lib1 RGPL=illumina RGPU=unit1 RGSM=${ID}

    picard SortSam \
        TMP_DIR=${wkdir}/tmp \
        INPUT=${ID}.add.bam \
        OUTPUT=${ID}.add.sort.bam \
        SORT_ORDER=coordinate

    picard MarkDuplicates \
        INPUT=${ID}.add.sort.bam \
        OUTPUT=${ID}.add.dedup.bam \
        METRICS_FILE=metrics.txt \
        REMOVE_DUPLICATES=true

    picard BuildBamIndex \
        TMP_DIR=${wkdir}/tmp \
        INPUT=${ID}.add.dedup.bam

    # # Not essential ===========================
    gatk BaseRecalibrator \
        -R $ref/genome.fa \
        -I ${ID}.add.dedup.bam \
        --known-sites $ref/hg19/dbsnp_138.hg19.vcf \
        --known-sites $ref/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf \
        -O ${ID}.recal.grp \
        -L $bedfile

    gatk ApplyBQSR \
        -R $ref/genome.fa \
        -I ${ID}.add.dedup.bam \
        -bqsr ${ID}.recal.grp \
        -O ${ID}.recal.bam \
        -L $bedfile
    # # ===========================

    # # Singel Sample ===========================
    gatk HaplotypeCaller \
        -R $ref/genome.fa \
        -I ${ID}.recal.bam \
        -O ${ID}.raw.g.vcf \
        -ERC GVCF \
        -L $bedfile
    # # =========================================   

    # # Multiple Sample ===========================
    gatk GenotypeGVCFs \
        -R $ref/genome.fa \
        -V ${ID}.raw.g.vcf \
        --include-non-variant-sites true \
        -O ${ID}_genotype.vcf \
        -L $bedfile



    gatk SelectVariants \
        -R $ref/genome.fa \
        -V ${ID}_genotype.vcf \
        --select-type-to-include SNP \
        --select-type-to-include NO_VARIATION \
        -O ${ID}_raw_snps.vcf \
        -L $bedfile
    #===========================

    gatk VariantFiltration \
        -V ${ID}_raw_snps.vcf \
        --filter-expression "QD < 2.0 || FS > 200.0 || SOR > 10.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || DP < 10" \
        --filter-name "Filter" \
        -O ${ID}.filter.vcf

    cat ${ID}.filter.vcf | grep -v '\./\.' > ${ID}.filter.gt.vcf


done

你可能感兴趣的:(gatk 分析流程 2020-06-02)