julia 获取某个染色体上的BAM

@ZHANG先森_5850

上篇文章被问到:Julia 如何只计算某条染色体上的插入序列长度?其实这个问题的核心是如何获取某条染色体上的 BAM record。这里我们结合 BGZFStreams 来读取 index,直接跳到特定染色体的起始位置,并提取该染色体的 BAM 记录(注意:输入的 BAM 文件必须按坐标排序)。代码如下:

using XAM
using BGZFStreams
using BioGenerics

"""
    get_chunks(reader::BAM.Reader, chrom::String)

Return `(offset, reads)` for the reference sequence named `chrom`.

`offset` is the first entry of the linear index stored for that reference and
`reads` is the sum of the mapped and unmapped read counts recorded in the
index metadata for it. The pair is only meaningful as "start position and
number of records" when the BAM file is coordinate sorted.

# Throws
- `ArgumentError` if `chrom` is not among the reader's reference names, if the
  reader has no index attached, or if the index holds no entries for `chrom`.
"""
function get_chunks(reader::BAM.Reader, chrom::String)
    id = findfirst(isequal(chrom), reader.refseqnames)
    # `findfirst` yields `nothing` for an unknown name; fail with a clear message
    # instead of an "invalid index" error further down.
    if id === nothing
        throw(ArgumentError("reference sequence \"$chrom\" not found in the BAM header"))
    end
    if reader.index === nothing
        throw(ArgumentError("the BAM reader has no index; open it with `index=...`"))
    end

    entry = reader.index.index.data[id]
    linear = entry[2]  ## position information
    meta = entry[3]    ## total read numbers in each index
    # A reference without any indexed reads has nothing to seek to or count.
    if meta === nothing || isempty(linear)
        throw(ArgumentError("no reads are indexed for reference sequence \"$chrom\""))
    end

    offset = linear[1]
    reads = meta.n_mapped + meta.n_unmapped
    ## start position and read count, so the input bam file should be coordinate sorted!!
    return (offset, reads)
end

"""
    get_chr_bams(reader::BAM.Reader, chrom::String, IO::BAM.Writer)

Copy the records belonging to `chrom` from `reader` to the writer `IO`.

The start offset and record count come from `get_chunks`; the reader's stream
is moved to that offset and records are written until the count is reached or
the reader is exhausted.
"""
function get_chr_bams(reader::BAM.Reader, chrom::String, IO::BAM.Writer)
    first_offset, total = get_chunks(reader, chrom)
    # Reposition the underlying stream at the start offset reported for `chrom`.
    seek(reader.stream, first_offset)
    for (nth, rec) in enumerate(reader)
        # Stop once the recorded number of reads has been written.
        nth > total && break
        write(IO, rec)
    end
    return nothing
end

"""
    main(bamfile::String, chrom::String, out::String)

Extract the records of reference sequence `chrom` from `bamfile` into a new
BAM file at `out`, reusing the input header.

The index is read from `bamfile` with `.bai` appended. Both the reader and the
writer are closed even when extraction fails, so no file handle is leaked and
the output stream is always finalized.
"""
function main(bamfile::String, chrom::String, out::String)
    reader = open(BAM.Reader, bamfile, index=string(bamfile, ".bai"))
    try
        h = BioGenerics.header(reader)
        writer = BAM.Writer(BGZFStream(out, "w"), h)
        try
            get_chr_bams(reader, chrom, writer)
        finally
            # Always close the writer so buffered output is flushed to disk.
            close(writer)
        end
    finally
        close(reader)
    end
    return nothing
end

# Example run: extract the chr2 records of the input BAM into a new BAM file.
main(
    "16T.cs.rmdup.sort.bam",  # input BAM; its index is looked up at this path + ".bai"
    "chr2",                   # reference sequence name to extract
    "julia_test_chr2.bam",    # output BAM path
)

你可能感兴趣的:(julia 获取某个染色体上的BAM)