call SNP from wheat RNA_seq

首先输入文件格式如下下图。


S2.png

代码如下,这个流程是根据sentieon提供的流程稍加改动而来,这里不再对每一步进行解释,请见谅,另外注意参考基因组要使用染色体分开的那个,直接整条染色体>500Mb,软件不支持:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'wheatomics'

import subprocess

with open('input.txt', 'r') as f:
    for line in f:
        line = line.strip().split()
        if len(line) > 4:
            sra1, day1, rg1, sra2, day2, rg2 = line
            print sra1, sra2
            proc = subprocess.Popen(['fastq-dump', '--split-3', '--helicos',  sra1 + '.sra'], shell=False)
            proc.wait()
            proc = subprocess.Popen(['fastq-dump', '--split-3', '--helicos', sra2 + '.sra'], shell=False)
            proc.wait()
            proc = subprocess.Popen('cat ' + sra2 + '_1.fastq' + ' >> ' + sra1 + '_1.fastq', shell=True)
            proc.wait()
            proc = subprocess.Popen('cat ' + sra2 + '_2.fastq' + ' >> ' + sra1 + '_2.fastq', shell=True)
            proc.wait()
            proc = subprocess.Popen(['shred', '-u', '-z', sra2 + '_1.fastq'], shell=False)
            proc.wait()
            proc = subprocess.Popen(['shred', '-u', '-z', sra2 + '_2.fastq'], shell=False)
            proc.wait()
        else:
            sra1, day1, rg1 = line
            proc = subprocess.Popen(['fastq-dump', '--split-3', '--helicos', sra1 + '.sra'], shell=False)
            proc.wait()
        proc = subprocess.Popen(
            ['STAR', '--twopassMode', 'Basic', '--genomeDir', '/data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/',
             '--runThreadN', '10', '--limitSjdbInsertNsj', '5000000', '--outSAMtype', 'BAM', 'SortedByCoordinate', '--twopass1readsN', '-1',
             '--sjdbOverhang', '100', '--readFilesIn', sra1 + '_1.fastq', sra1 + '_2.fastq',
             '--outSAMattrRGline', 'ID:' + rg1, 'SM:' + rg1, 'PL:ILLUMINA'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['mv', 'Aligned.sortedByCoord.out.bam', 'sorted.bam'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['shred', '-u', '-z', sra1 + '_1.fastq'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['shred', '-u', '-z', sra1 + '_2.fastq'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['sentieon', 'util', 'index', 'sorted.bam'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['sentieon', 'driver', '-r', '/data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/IWGSC_v1.0_part.fasta',
                                 '-t', '10', '-i', 'sorted.bam', '--algo', 'MeanQualityByCycle', 'mq_metrics.txt',
                                 '--algo', 'QualDistribution', 'qd_metrics.txt', '--algo', 'GCBias', '--summary', 'gc_summary.txt',
                                 'gc_metrics.txt', '--algo', 'AlignmentStat', '--adapter_seq', "''", 'aln_metrics.txt',
                                 '--algo', 'InsertSizeMetricAlgo', 'is_metrics.txt'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['sentieon', 'plot', 'metrics', '-o', rg1 + '-metrics-report.pdf', 'gc=gc_metrics.txt',
                                 'qd=qd_metrics.txt', 'mq=mq_metrics.txt', 'isize=is_metrics.txt'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['sentieon', 'driver', '-t', '10', '-i', 'sorted.bam', '--algo', 'LocusCollector',
                                 '--fun', 'score_info', 'score.txt'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['sentieon', 'driver', '-t', '10', '-i', 'sorted.bam', '--algo', 'Dedup', '--rmdup',
                                 '--score_info', 'score.txt', '--metrics', 'dedup_metrics.txt', 'deduped.bam'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['sentieon', 'driver', '-r', '/data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/IWGSC_v1.0_part.fasta',
                                 '-t', '10', '-i', 'deduped.bam', '--algo', 'RNASplitReadsAtJunction', '--reassign_mapq',
                                 '255:60', 'splitted.bam'], shell=False)
        proc.wait()
        proc = subprocess.Popen(['sentieon', 'driver', '-r', '/data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/IWGSC_v1.0_part.fasta',
                                 '-t', '10', '-i', 'splitted.bam', '--algo', 'Haplotyper',
                                 '--trim_soft_clip', '--emit_mode', 'gvcf', '--emit_conf', '20', '--call_conf', '20',
                                  rg1 + 'g.vcf.gz'], shell=False)
        proc.wait()

最后合并gvcf生成vcf
sentieon driver -r /data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/IWGSC_v1.0_part.fasta -t 10 --algo GVCFtyper MS1_rna_seq.vcf *g.vcf.gz

你可能感兴趣的:(call SNP from wheat RNA_seq)