0.代码(读取方法):
package org.bdgenomics.adamLocal.algorithms.test import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.bdgenomics.adam.rdd.ADAMContext import htsjdk.samtools.ValidationStringency //import scala.collection.parallel.Foreach object hs38DHL1Test1 { def main(args: Array[String]) { val conf = new SparkConf().setAppName("readFileFromFaFq").setMaster("local") val sc = new SparkContext(conf) val ac = new ADAMContext(sc) val fileFa = "file/adam/hs38DH/hs38DHL1/hs38DHL1.fa" // val fileFq = "file/adam/hs38DH/hs38DHSE1L100F1.fq" val fileFq = "file/adam/hs38DH/hs38DHL1/hs38DHL1F1Len10.fq" val fileSam = "file/adam/hs38DH/hs38DHL1/hs38DHL1F1Len10.sam" val fq0 = sc.textFile(fileFa) println("fq0.count:" + fq0.count); //load Fasta println(fileFa + ":"); val fa1 = ac.loadFasta(fileFa, 10000) var fa1Segquence = fa1.map(_.getFragmentSequence()).collect // fq1Segquence.foreach(println) println("fa1Segquence.length:" + fa1Segquence.length); for (i <- 0 until fa1Segquence.length) { println(fa1Segquence(i)); } println(); fa1.foreach(println) //load Fastq println(); println(fileFq + ":"); val fq1 = ac.loadFastq(fileFq, None, None, ValidationStringency.STRICT) var fq1Segquence = fq1.map(_.getSequence()).collect // fq1Segquence.foreach(println) println("fq1Segquence.length:" + fq1Segquence.length); for (i <- 0 until fq1Segquence.length) { println(fq1Segquence(i)); } println(); fq1.foreach(println) println("SAM:"); val sam1=ac.loadAlignments(fileSam) sam1.foreach(println) sc.stop } }
1.Fasta:
ac.loadFasta(fileFa, 10000)
源文件:
>chrUn_KN707606v1_decoy AC:KN707606.1 gi:734691250 LN:2200 rl:unplaced M5:20c768ac79ca38077e5012ee0e5f8333 AS:hs38d1 ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtg ttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgag ccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTA CCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGC ACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGC CCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGG GGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggt tatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaa caattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccagg agttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaa aaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAA GGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttct tctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGT GCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTG CTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAA CCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCT ACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAA ACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaat agtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagat tacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtc tcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacC AACTTCTCTTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGG CTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcc tttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCT TCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTAT CCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtct cgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctc ccgcctcagcctcccgagtagctgggactacaggcactcg
{"contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "description": "AC:KN707606.1 gi:734691250 LN:2200 rl:unplaced M5:20c768ac79ca38077e5012ee0e5f8333 AS:hs38d1", "fragmentSequence": "ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtgttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgagccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTACCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGCACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGCCCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGGGGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggttatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaacaattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccaggagttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaaaaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAAGGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttcttctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGTGCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTGCTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAACCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCTACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAAACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaatagtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagattacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtctcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacCAACTTCTCTTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGGCTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcctttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCTTCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTATCCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtctcgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctcccgcctcagcctcccgagtagctgggactacaggcactcg", "fragmentNumber": 0, "fragmentStartPosition": 0, "fragmentLength": 2200, "numberOfFragmentsInContig": 1}
val fq1 = ac.loadFastq(fileFq, None, None, ValidationStringency.STRICT)
@chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0/1 CTCCTCGCCA + 2222222222
{"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
println("SAM:"); val sam1=ac.loadAlignments(fileSam) sam1.foreach(println)
@SQ SN:chrUn_KN707606v1_decoy LN:2200 @PG ID:bwa PN:bwa VN:0.7.12-r1039 CL:bwa samse -f hs38DHL1F1Len10.sam hs38DHL1.fa hs38DHL1F1Len10.sai hs38DHL1F1Len10.fq chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0 0 chrUn_KN707606v1_decoy 1204 37 10M * 0 0 CTCCTCGCCA 2222222222 XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:10
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
println("BAM:"); println("methods 1:"); val bam1 = ac.loadAlignments(fileBam) bam1.foreach(println) println("methods 2:"); val bam2 = ac.loadBam(fileBam)
BAM是SAM的二进制,直接打开乱码
读取结果:
BAM: methods 1: {"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} methods 2: {"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}