Adam学习13之Fasta/Fastq/SAM/BAM文件格式数据读取

0.代码(读取方法):

package org.bdgenomics.adamLocal.algorithms.test

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.bdgenomics.adam.rdd.ADAMContext
import htsjdk.samtools.ValidationStringency
//import scala.collection.parallel.Foreach

/**
 * Demo driver that reads one small hs38DH sample through Spark/ADAM in three
 * formats (FASTA, FASTQ and SAM) and prints what was loaded to stdout.
 *
 * The job runs with a "local" master and uses hard-coded relative input paths.
 */
object hs38DHL1Test1 {

  /**
   * Entry point: loads each input file and prints its contents.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("readFileFromFaFq").setMaster("local")
    val sc = new SparkContext(conf)

    // Stop the context even when a load or print step throws, so it is never leaked.
    try {
      val ac = new ADAMContext(sc)
      val fileFa = "file/adam/hs38DH/hs38DHL1/hs38DHL1.fa"
      //    val fileFq = "file/adam/hs38DH/hs38DHSE1L100F1.fq"
      val fileFq = "file/adam/hs38DH/hs38DHL1/hs38DHL1F1Len10.fq"
      val fileSam = "file/adam/hs38DH/hs38DHL1/hs38DHL1F1Len10.sam"

      // Raw text-line count of the FASTA file. The printed "fq0" label is kept
      // unchanged so the program's output stays identical.
      val faLines = sc.textFile(fileFa)
      println("fq0.count:" + faLines.count())

      // load Fasta
      println(fileFa + ":")
      val fa1 = ac.loadFasta(fileFa, 10000)
      val fa1Sequences = fa1.map(_.getFragmentSequence()).collect
      println("fa1Segquence.length:" + fa1Sequences.length)
      fa1Sequences.foreach(println)
      println()
      fa1.foreach(println)

      // load Fastq
      println()
      println(fileFq + ":")
      val fq1 = ac.loadFastq(fileFq, None, None, ValidationStringency.STRICT)
      val fq1Sequences = fq1.map(_.getSequence()).collect
      println("fq1Segquence.length:" + fq1Sequences.length)
      fq1Sequences.foreach(println)
      println()
      fq1.foreach(println)

      // load SAM
      println("SAM:")
      val sam1 = ac.loadAlignments(fileSam)
      sam1.foreach(println)
    } finally {
      sc.stop()
    }
  }
}


1.Fasta:

// Load the FASTA file through the ADAMContext. The second argument (10000) is
// presumably the maximum fragment length — confirm against the ADAMContext API.
ac.loadFasta(fileFa, 10000)

源文件:

>chrUn_KN707606v1_decoy  AC:KN707606.1  gi:734691250  LN:2200  rl:unplaced  M5:20c768ac79ca38077e5012ee0e5f8333  AS:hs38d1
ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtg
ttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgag
ccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTA
CCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGC
ACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGC
CCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGG
GGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggt
tatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaa
caattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccagg
agttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaa
aaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAA
GGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttct
tctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGT
GCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTG
CTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAA
CCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCT
ACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAA
ACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaat
agtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagat
tacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtc
tcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacC
AACTTCTCTTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGG
CTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcc
tttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCT
TCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTAT
CCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtct
cgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctc
ccgcctcagcctcccgagtagctgggactacaggcactcg


读取结果:

{"contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "description": "AC:KN707606.1  gi:734691250  LN:2200  rl:unplaced  M5:20c768ac79ca38077e5012ee0e5f8333  AS:hs38d1", "fragmentSequence": "ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtgttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgagccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTACCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGCACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGCCCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGGGGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggttatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaacaattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccaggagttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaaaaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAAGGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttcttctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGTGCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTGCTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAACCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCTACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAAACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaatagtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagattacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtctcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacCAACTTCTC
TTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGGCTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcctttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCTTCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTATCCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtctcgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctcccgcctcagcctcccgagtagctgggactacaggcactcg", "fragmentNumber": 0, "fragmentStartPosition": 0, "fragmentLength": 2200, "numberOfFragmentsInContig": 1}

2.Fastq:

 // Load the FASTQ file through the ADAMContext. The two None arguments are presumably
 // "no second (paired) file" and "no record group", and STRICT the validation
 // stringency — confirm against the ADAMContext.loadFastq API.
 val fq1 = ac.loadFastq(fileFq, None, None, ValidationStringency.STRICT)

源数据:

@chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0/1
CTCCTCGCCA
+
2222222222

读取结果:

{"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}


3.SAM:

 // Load the SAM file with the generic alignment loader and print every record.
 println("SAM:");
    val sam1=ac.loadAlignments(fileSam)
    sam1.foreach(println)

源数据:


@SQ	SN:chrUn_KN707606v1_decoy	LN:2200
@PG	ID:bwa	PN:bwa	VN:0.7.12-r1039	CL:bwa samse -f hs38DHL1F1Len10.sam hs38DHL1.fa hs38DHL1F1Len10.sai hs38DHL1F1Len10.fq
chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0	0	chrUn_KN707606v1_decoy	1204	37	10M	*	0	0	CTCCTCGCCA	2222222222	XT:A:U	NM:i:0	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:10

读取结果:

{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}

4.BAM:

 // NOTE: fileBam is not defined in the full listing above; it must hold the path of the
 // BAM file (presumably the BAM counterpart of fileSam — confirm the path).
 println("BAM:")
    // Method 1: the generic alignment loader, the same call used for the SAM file.
    println("methods 1:")
    val bam1 = ac.loadAlignments(fileBam)
    bam1.foreach(println)
    // Method 2: the BAM-specific loader.
    println("methods 2:")
    val bam2 = ac.loadBam(fileBam)
    // Print the records; without this the "methods 2:" output shown below is never produced.
    bam2.foreach(println)


BAM 是 SAM 的二进制格式,用文本编辑器直接打开只会显示乱码,因此这里不贴出源数据。


读取结果:

BAM:
methods 1:
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
methods 2:
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}

BAM 与 SAM 的读取结果完全相同,因为两者保存的内容本来就一样,只是存储格式不同。



你可能感兴趣的:(Adam学习13之Fasta/Fastq/SAM/BAM文件格式数据读取)