Adam学习之6源码解读kmer.scala
代码:
package testAdam

import org.apache.spark._
import org.bdgenomics.adam.rdd.ADAMContext
import org.bdgenomics.adam.projections.{AlignmentRecordField, Projection}

/**
 * Counts the 21-mers found in the read sequences of an ADAM alignment file
 * and prints the ten most frequent ones as `(count, kmer)` pairs.
 */
object kmer {

  /** Length of the k-mers that are generated and counted. */
  private val KmerLength = 21

  /**
   * Input that is read when no path is given on the command line.
   * `Master` is a placeholder host name and has to be replaced by the real
   * host name or IP address of the HDFS name node.
   */
  private val DefaultInputPath = "hdfs://Master:9000/xubo/adam/output/small.adam"

  /**
   * Runs the k-mer count on a local Spark master.
   *
   * @param args optional; when present, `args(0)` is used as the input path
   *             instead of [[DefaultInputPath]]
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test Adam kmer").setMaster("local")
    // val conf = new SparkConf().setAppName("test Adam kmer")
    val sc = new SparkContext(conf)
    try {
      val ac = new ADAMContext(sc)
      val inputPath = args.headOption.getOrElse(DefaultInputPath)

      // Load alignments from disk.
      // Other inputs that were tried:
      //   "/data/NA21144.chrom11.ILLUMINA.adam"
      //   "/xubo/adam/output/small.adam"
      val reads = ac.loadAlignments(
        inputPath,
        projection = Some(
          Projection(
            AlignmentRecordField.sequence,
            AlignmentRecordField.readMapped,
            AlignmentRecordField.mapq)))

      // Copy the length into a local value so the closure below does not
      // have to reference a member of the enclosing object.
      val k = KmerLength

      // Generate, count and sort 21-mers.
      val kmers = reads
        .flatMap { read =>
          // Option(...) skips records without a sequence instead of failing
          // with a NullPointerException.
          Option(read.getSequence).iterator
            .flatMap(_.sliding(k))
            // sliding() emits one shorter window for a sequence with fewer
            // than k bases; such a fragment is not a k-mer, so drop it.
            .filter(_.length == k)
            .map(window => (window, 1L))
        }
        .reduceByKey(_ + _)
        .map(_.swap)
        .sortByKey(ascending = false)

      // Print the top 10 most common 21-mers.
      kmers.take(10).foreach(println)
    } finally {
      // Release the Spark resources even when loading or counting fails.
      sc.stop()
    }
  }
}
<strong>Master需要改成真实IP</strong>
源码解读:
1.loadAlignments:loadAlignments是ADAMContext的函数
// Fields requested from the alignment records; loadAlignments forwards this
// projection only when the path is loaded as Parquet AlignmentRecords.
val requestedFields = Projection(
  AlignmentRecordField.sequence,
  AlignmentRecordField.readMapped,
  AlignmentRecordField.mapq)

// Load the reads through the ADAMContext using the projection above.
val reads = ac.loadAlignments(
  "hdfs://Master:9000/xubo/adam/output/small.adam",
  projection = Some(requestedFields))
/**
 * Loads alignment records from `filePath`, selecting the loader from the
 * suffix of the path. The suffixes are checked in this order:
 * `.sam`/`.bam`, `.ifq`, `.fq`/`.fastq`, `.fa`/`.fasta`, `contig.adam`;
 * any other path is loaded as Parquet of AlignmentRecords.
 *
 * @param filePath       path of the input to load
 * @param projection     forwarded only to the Parquet AlignmentRecord loader;
 *                       every other branch ignores it
 * @param filePath2Opt   forwarded only to the unpaired FASTQ loader
 * @param recordGroupOpt forwarded only to the unpaired FASTQ loader
 * @param stringency     forwarded only to the unpaired FASTQ loader
 * @return the loaded records as an RDD of AlignmentRecord
 */
def loadAlignments(
  filePath: String,
  projection: Option[Schema] = None,
  filePath2Opt: Option[String] = None,
  recordGroupOpt: Option[String] = None,
  stringency: ValidationStringency = ValidationStringency.STRICT): RDD[AlignmentRecord] = LoadAlignmentRecords.time {

  // True when the path ends with at least one of the given suffixes;
  // the suffixes are tested left to right.
  def pathEndsWithAny(suffixes: String*): Boolean =
    suffixes.exists(suffix => filePath.endsWith(suffix))

  filePath match {
    case _ if pathEndsWithAny(".sam", ".bam") =>
      log.info("Loading " + filePath + " as SAM/BAM and converting to AlignmentRecords. Projection is ignored.")
      loadBam(filePath)

    case _ if pathEndsWithAny(".ifq") =>
      log.info("Loading " + filePath + " as interleaved FASTQ and converting to AlignmentRecords. Projection is ignored.")
      loadInterleavedFastq(filePath)

    case _ if pathEndsWithAny(".fq", ".fastq") =>
      log.info("Loading " + filePath + " as unpaired FASTQ and converting to AlignmentRecords. Projection is ignored.")
      loadFastq(filePath, filePath2Opt, recordGroupOpt, stringency)

    case _ if pathEndsWithAny(".fa", ".fasta") =>
      log.info("Loading " + filePath + " as FASTA and converting to AlignmentRecords. Projection is ignored.")
      // Kept local to this branch, exactly as in the original if/else chain.
      import ADAMContext._
      // The FASTA input is loaded as fragments of length 10000 before the
      // conversion to reads.
      loadFasta(filePath, fragmentLength = 10000).toReads

    case _ if pathEndsWithAny("contig.adam") =>
      log.info("Loading " + filePath + " as Parquet of NucleotideContigFragment and converting to AlignmentRecords. Projection is ignored.")
      loadParquet[NucleotideContigFragment](filePath).toReads

    case _ =>
      log.info("Loading " + filePath + " as Parquet of AlignmentRecords.")
      loadParquetAlignments(filePath, None, projection)
  }
}
scala> reads.foreach(println) {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "CTTTATTTTTATTTTTAAGGTTTTTTTTGTTTGTTTGTTTTGAGATGGAGTCTCGCTCCACCGCCCAGACTGGAG", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, 
"recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "TGTATCTTCCTCCCCTGCTGTATGTTTCCTGCCCTCAAACATCACACTCCACGTTCTTCAGCTTTAGGACTTGGA", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "TTTAATAAATGTTGATTGTCCTATTTAATTATTCTCAACTTTCCGATTTTATTTCCCATGTAACAGTGTTGTTTT", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": 
null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "TAAAATGCCCCCATCTTCCCAGAGCTGCCAGCCCTCACAATGCCAACAGCTAAATGTACCCAAGTGTTACTGAAC", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 24, "readName": null, "sequence": "TACAGGCACCCACCATCATGCCCAGCTAATTTTTGTATTTTTGTAGAAACGGGGTTTCACCATGTTGGCCCAGCT", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, 
"failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "GCTCACTGCAGCCTCAACCTCCTGGGCCCAAGTGATTTCATCTTATTTTTGGAAAAAAAAACAAACTAAACCAAA", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 28, "readName": null, "sequence": 
"TTTCTTTTTCTTTCTTTCTTTCTTTCTTTCTTTTTCTTTCTTTCTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCT", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "TCATGTAGCATGCATATGGCTAACGGCAAAGTGAGGGAGGAATAATTATAGTAATAATCACAGTGATGACGTGGA", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": 
null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "GCTCAGGCCTTGCAAGAATCTCTACTGCCCAACAAGTCCCTACAAGATGGCATTTAAAAGCAGTCCCTCACGCAC", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "CCTAGAGAAGCTCCCACTAGGGCTGCAGTCAATTCCCAGGTCTTAGGTGCTGAGCAGTGGGAGGTGGTGGCCATG", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, 
"recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "AAATAAAGTTTGGCTTTCAGTTGTAACTTTGAATATCTTTATCACAGTTATTTAAAGCCTTTAAAAAGCTTTAAT", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "TGTGTAACTAACATAATTGGCACTGTCCCTGTAAATTCAAATTGGATATCCTCCCAAATTTTATTTAAGCAATTG", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": 
false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "TTTATTTTTTGAGCATGAAAGTAATATATGCTCAGTGTAAACAATTAGGTCATTATAAATATATTTAACAGGAAT", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 35, "readName": null, "sequence": "CTCAGGTGATCCACCCGCCTCGGCCTCCCAAAGTGCTGGGACTACAGGCATGAGGCACCGCGCCTGGCCAGGACT", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, 
"basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "GACAAGATAGTACTTGAGCTAAGCCTTGCAGGTTGAGTAGGATTATTCTAGTGGAATTTAGGGAAACGATGTGCA", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, 
"oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "CTACTCTCATTGACTGTTCAATGCCTATACAAGTAAAACTTTACCAGCACCCAAGTCAAAAAGAAAAAAAAGGGG", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "CTCATTCTCTCTCCTGCTGCACTGTGAAGAGGTGCCTGTTGCCAAGAGTATAAGTTTCCTGAGGCCTCCCAGGCC", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, 
"recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 60, "readName": null, "sequence": "AAATTAAACAGCTCGTTTAACTGATAATCCATACTATATTTGAGTAGGGCTGTCACATGGTTGGAACCTCCGGTT", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null} {"readNum": 0, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": 40, "readName": null, "sequence": "AGACTGGGTCTCACTATGTTGCCTAGGCTGGTCTCAAACTCCTGGGCTCAAGTGATCCATCTCTGCCTTCCAAAG", "qual": null, "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": false, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, 
"recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
2.getSequence:从第1步(loadAlignments)得到的数据中读取出sequence字段。getSequence的源码还没看到:它在Avro中,相关源码还没下载。
scala> val a0=reads.map(_.getSequence) a0: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[9] at map at <console>:27 scala> a0.foreach(println) GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA CTTTATTTTTATTTTTAAGGTTTTTTTTGTTTGTTTGTTTTGAGATGGAGTCTCGCTCCACCGCCCAGACTGGAG TGTATCTTCCTCCCCTGCTGTATGTTTCCTGCCCTCAAACATCACACTCCACGTTCTTCAGCTTTAGGACTTGGA TTTAATAAATGTTGATTGTCCTATTTAATTATTCTCAACTTTCCGATTTTATTTCCCATGTAACAGTGTTGTTTT TAAAATGCCCCCATCTTCCCAGAGCTGCCAGCCCTCACAATGCCAACAGCTAAATGTACCCAAGTGTTACTGAAC TACAGGCACCCACCATCATGCCCAGCTAATTTTTGTATTTTTGTAGAAACGGGGTTTCACCATGTTGGCCCAGCT GCTCACTGCAGCCTCAACCTCCTGGGCCCAAGTGATTTCATCTTATTTTTGGAAAAAAAAACAAACTAAACCAAA TTTCTTTTTCTTTCTTTCTTTCTTTCTTTCTTTTTCTTTCTTTCTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCT TCATGTAGCATGCATATGGCTAACGGCAAAGTGAGGGAGGAATAATTATAGTAATAATCACAGTGATGACGTGGA GCTCAGGCCTTGCAAGAATCTCTACTGCCCAACAAGTCCCTACAAGATGGCATTTAAAAGCAGTCCCTCACGCAC CCTAGAGAAGCTCCCACTAGGGCTGCAGTCAATTCCCAGGTCTTAGGTGCTGAGCAGTGGGAGGTGGTGGCCATG AAATAAAGTTTGGCTTTCAGTTGTAACTTTGAATATCTTTATCACAGTTATTTAAAGCCTTTAAAAAGCTTTAAT TGTGTAACTAACATAATTGGCACTGTCCCTGTAAATTCAAATTGGATATCCTCCCAAATTTTATTTAAGCAATTG TTTATTTTTTGAGCATGAAAGTAATATATGCTCAGTGTAAACAATTAGGTCATTATAAATATATTTAACAGGAAT CTCAGGTGATCCACCCGCCTCGGCCTCCCAAAGTGCTGGGACTACAGGCATGAGGCACCGCGCCTGGCCAGGACT GACAAGATAGTACTTGAGCTAAGCCTTGCAGGTTGAGTAGGATTATTCTAGTGGAATTTAGGGAAACGATGTGCA CTACTCTCATTGACTGTTCAATGCCTATACAAGTAAAACTTTACCAGCACCCAAGTCAAAAAGAAAAAAAAGGGG CTCATTCTCTCTCCTGCTGCACTGTGAAGAGGTGCCTGTTGCCAAGAGTATAAGTTTCCTGAGGCCTCCCAGGCC AAATTAAACAGCTCGTTTAACTGATAATCCATACTATATTTGAGTAGGGCTGTCACATGGTTGGAACCTCCGGTT AGACTGGGTCTCACTATGTTGCCTAGGCTGGTCTCAAACTCCTGGGCTCAAGTGATCCATCTCTGCCTTCCAAAG
scala> val a1=reads.map(_.getSequence.sliding(21)) a1: org.apache.spark.rdd.RDD[Iterator[String]] = MapPartitionsRDD[10] at map at <console>:27 scala> for(i<-a1){ while(i.hasNext){print(i.next()+" ")} ;println()} GTATAAGAGCAGCCTTATTCC TATAAGAGCAGCCTTATTCCT ATAAGAGCAGCCTTATTCCTA TAAGAGCAGCCTTATTCCTAT AAGAGCAGCCTTATTCCTATT AGAGCAGCCTTATTCCTATTT GAGCAGCCTTATTCCTATTTA AGCAGCCTTATTCCTATTTAT GCAGCCTTATTCCTATTTATA CAGCCTTATTCCTATTTATAA AGCCTTATTCCTATTTATAAT GCCTTATTCCTATTTATAATC CCTTATTCCTATTTATAATCA CTTATTCCTATTTATAATCAG TTATTCCTATTTATAATCAGG TATTCCTATTTATAATCAGGG ATTCCTATTTATAATCAGGGT TTCCTATTTATAATCAGGGTG TCCTATTTATAATCAGGGTGA CCTATTTATAATCAGGGTGAA CTATTTATAATCAGGGTGAAA TATTTATAATCAGGGTGAAAC ATTTATAATCAGGGTGAAACA TTTATAATCAGGGTGAAACAC TTATAATCAGGGTGAAACACC TATAATCAGGGTGAAACACCT ATAATCAGGGTGAAACACCTG TAATCAGGGTGAAACACCTGT AATCAGGGTGAAACACCTGTG ATCAGGGTGAAACACCTGTGC TCAGGGTGAAACACCTGTGCC CAGGGTGAAACACCTGTGCCA AGGGTGAAACACCTGTGCCAA GGGTGAAACACCTGTGCCAAT GGTGAAACACCTGTGCCAATG GTGAAACACCTGTGCCAATGC TGAAACACCTGTGCCAATGCC GAAACACCTGTGCCAATGCCA AAACACCTGTGCCAATGCCAA AACACCTGTGCCAATGCCAAG ACACCTGTGCCAATGCCAAGA CACCTGTGCCAATGCCAAGAC ACCTGTGCCAATGCCAAGACA CCTGTGCCAATGCCAAGACAG CTGTGCCAATGCCAAGACAGG TGTGCCAATGCCAAGACAGGG GTGCCAATGCCAAGACAGGGG TGCCAATGCCAAGACAGGGGT GCCAATGCCAAGACAGGGGTG CCAATGCCAAGACAGGGGTGC CAATGCCAAGACAGGGGTGCC AATGCCAAGACAGGGGTGCCA ATGCCAAGACAGGGGTGCCAA TGCCAAGACAGGGGTGCCAAG GCCAAGACAGGGGTGCCAAGA CTTTATTTTTATTTTTAAGGT TTTATTTTTATTTTTAAGGTT TTATTTTTATTTTTAAGGTTT TATTTTTATTTTTAAGGTTTT ATTTTTATTTTTAAGGTTTTT TTTTTATTTTTAAGGTTTTTT TTTTATTTTTAAGGTTTTTTT TTTATTTTTAAGGTTTTTTTT TTATTTTTAAGGTTTTTTTTG TATTTTTAAGGTTTTTTTTGT ATTTTTAAGGTTTTTTTTGTT TTTTTAAGGTTTTTTTTGTTT TTTTAAGGTTTTTTTTGTTTG TTTAAGGTTTTTTTTGTTTGT TTAAGGTTTTTTTTGTTTGTT TAAGGTTTTTTTTGTTTGTTT AAGGTTTTTTTTGTTTGTTTG AGGTTTTTTTTGTTTGTTTGT GGTTTTTTTTGTTTGTTTGTT GTTTTTTTTGTTTGTTTGTTT TTTTTTTTGTTTGTTTGTTTT TTTTTTTGTTTGTTTGTTTTG TTTTTTGTTTGTTTGTTTTGA TTTTTGTTTGTTTGTTTTGAG TTTTGTTTGTTTGTTTTGAGA TTTGTTTGTTTGTTTTGAGAT 
TTGTTTGTTTGTTTTGAGATG TGTTTGTTTGTTTTGAGATGG GTTTGTTTGTTTTGAGATGGA TTTGTTTGTTTTGAGATGGAG TTGTTTGTTTTGAGATGGAGT TGTTTGTTTTGAGATGGAGTC GTTTGTTTTGAGATGGAGTCT TTTGTTTTGAGATGGAGTCTC TTGTTTTGAGATGGAGTCTCG TGTTTTGAGATGGAGTCTCGC GTTTTGAGATGGAGTCTCGCT TTTTGAGATGGAGTCTCGCTC TTTGAGATGGAGTCTCGCTCC TTGAGATGGAGTCTCGCTCCA TGAGATGGAGTCTCGCTCCAC GAGATGGAGTCTCGCTCCACC AGATGGAGTCTCGCTCCACCG GATGGAGTCTCGCTCCACCGC ATGGAGTCTCGCTCCACCGCC TGGAGTCTCGCTCCACCGCCC GGAGTCTCGCTCCACCGCCCA GAGTCTCGCTCCACCGCCCAG AGTCTCGCTCCACCGCCCAGA GTCTCGCTCCACCGCCCAGAC TCTCGCTCCACCGCCCAGACT CTCGCTCCACCGCCCAGACTG TCGCTCCACCGCCCAGACTGG CGCTCCACCGCCCAGACTGGA GCTCCACCGCCCAGACTGGAG TGTATCTTCCTCCCCTGCTGT GTATCTTCCTCCCCTGCTGTA TATCTTCCTCCCCTGCTGTAT ATCTTCCTCCCCTGCTGTATG TCTTCCTCCCCTGCTGTATGT CTTCCTCCCCTGCTGTATGTT TTCCTCCCCTGCTGTATGTTT TCCTCCCCTGCTGTATGTTTC CCTCCCCTGCTGTATGTTTCC CTCCCCTGCTGTATGTTTCCT TCCCCTGCTGTATGTTTCCTG CCCCTGCTGTATGTTTCCTGC CCCTGCTGTATGTTTCCTGCC CCTGCTGTATGTTTCCTGCCC CTGCTGTATGTTTCCTGCCCT TGCTGTATGTTTCCTGCCCTC GCTGTATGTTTCCTGCCCTCA CTGTATGTTTCCTGCCCTCAA TGTATGTTTCCTGCCCTCAAA GTATGTTTCCTGCCCTCAAAC TATGTTTCCTGCCCTCAAACA ATGTTTCCTGCCCTCAAACAT TGTTTCCTGCCCTCAAACATC GTTTCCTGCCCTCAAACATCA TTTCCTGCCCTCAAACATCAC TTCCTGCCCTCAAACATCACA TCCTGCCCTCAAACATCACAC CCTGCCCTCAAACATCACACT CTGCCCTCAAACATCACACTC TGCCCTCAAACATCACACTCC GCCCTCAAACATCACACTCCA CCCTCAAACATCACACTCCAC CCTCAAACATCACACTCCACG CTCAAACATCACACTCCACGT TCAAACATCACACTCCACGTT CAAACATCACACTCCACGTTC AAACATCACACTCCACGTTCT AACATCACACTCCACGTTCTT ACATCACACTCCACGTTCTTC CATCACACTCCACGTTCTTCA ATCACACTCCACGTTCTTCAG TCACACTCCACGTTCTTCAGC CACACTCCACGTTCTTCAGCT ACACTCCACGTTCTTCAGCTT CACTCCACGTTCTTCAGCTTT ACTCCACGTTCTTCAGCTTTA CTCCACGTTCTTCAGCTTTAG TCCACGTTCTTCAGCTTTAGG CCACGTTCTTCAGCTTTAGGA CACGTTCTTCAGCTTTAGGAC ACGTTCTTCAGCTTTAGGACT CGTTCTTCAGCTTTAGGACTT GTTCTTCAGCTTTAGGACTTG TTCTTCAGCTTTAGGACTTGG TCTTCAGCTTTAGGACTTGGA TTTAATAAATGTTGATTGTCC TTAATAAATGTTGATTGTCCT TAATAAATGTTGATTGTCCTA AATAAATGTTGATTGTCCTAT ATAAATGTTGATTGTCCTATT TAAATGTTGATTGTCCTATTT 
AAATGTTGATTGTCCTATTTA AATGTTGATTGTCCTATTTAA ATGTTGATTGTCCTATTTAAT TGTTGATTGTCCTATTTAATT GTTGATTGTCCTATTTAATTA TTGATTGTCCTATTTAATTAT TGATTGTCCTATTTAATTATT GATTGTCCTATTTAATTATTC ATTGTCCTATTTAATTATTCT TTGTCCTATTTAATTATTCTC TGTCCTATTTAATTATTCTCA GTCCTATTTAATTATTCTCAA TCCTATTTAATTATTCTCAAC CCTATTTAATTATTCTCAACT CTATTTAATTATTCTCAACTT TATTTAATTATTCTCAACTTT ATTTAATTATTCTCAACTTTC TTTAATTATTCTCAACTTTCC TTAATTATTCTCAACTTTCCG TAATTATTCTCAACTTTCCGA AATTATTCTCAACTTTCCGAT ATTATTCTCAACTTTCCGATT TTATTCTCAACTTTCCGATTT TATTCTCAACTTTCCGATTTT ATTCTCAACTTTCCGATTTTA TTCTCAACTTTCCGATTTTAT TCTCAACTTTCCGATTTTATT CTCAACTTTCCGATTTTATTT TCAACTTTCCGATTTTATTTC CAACTTTCCGATTTTATTTCC AACTTTCCGATTTTATTTCCC ACTTTCCGATTTTATTTCCCA CTTTCCGATTTTATTTCCCAT TTTCCGATTTTATTTCCCATG TTCCGATTTTATTTCCCATGT TCCGATTTTATTTCCCATGTA CCGATTTTATTTCCCATGTAA CGATTTTATTTCCCATGTAAC GATTTTATTTCCCATGTAACA ATTTTATTTCCCATGTAACAG TTTTATTTCCCATGTAACAGT TTTATTTCCCATGTAACAGTG TTATTTCCCATGTAACAGTGT TATTTCCCATGTAACAGTGTT ATTTCCCATGTAACAGTGTTG TTTCCCATGTAACAGTGTTGT TTCCCATGTAACAGTGTTGTT TCCCATGTAACAGTGTTGTTT CCCATGTAACAGTGTTGTTTT TAAAATGCCCCCATCTTCCCA AAAATGCCCCCATCTTCCCAG AAATGCCCCCATCTTCCCAGA AATGCCCCCATCTTCCCAGAG ATGCCCCCATCTTCCCAGAGC TGCCCCCATCTTCCCAGAGCT GCCCCCATCTTCCCAGAGCTG CCCCCATCTTCCCAGAGCTGC CCCCATCTTCCCAGAGCTGCC CCCATCTTCCCAGAGCTGCCA CCATCTTCCCAGAGCTGCCAG CATCTTCCCAGAGCTGCCAGC ATCTTCCCAGAGCTGCCAGCC TCTTCCCAGAGCTGCCAGCCC CTTCCCAGAGCTGCCAGCCCT TTCCCAGAGCTGCCAGCCCTC TCCCAGAGCTGCCAGCCCTCA CCCAGAGCTGCCAGCCCTCAC CCAGAGCTGCCAGCCCTCACA CAGAGCTGCCAGCCCTCACAA AGAGCTGCCAGCCCTCACAAT GAGCTGCCAGCCCTCACAATG AGCTGCCAGCCCTCACAATGC GCTGCCAGCCCTCACAATGCC CTGCCAGCCCTCACAATGCCA TGCCAGCCCTCACAATGCCAA GCCAGCCCTCACAATGCCAAC CCAGCCCTCACAATGCCAACA CAGCCCTCACAATGCCAACAG AGCCCTCACAATGCCAACAGC GCCCTCACAATGCCAACAGCT CCCTCACAATGCCAACAGCTA CCTCACAATGCCAACAGCTAA CTCACAATGCCAACAGCTAAA TCACAATGCCAACAGCTAAAT CACAATGCCAACAGCTAAATG ACAATGCCAACAGCTAAATGT CAATGCCAACAGCTAAATGTA AATGCCAACAGCTAAATGTAC ATGCCAACAGCTAAATGTACC TGCCAACAGCTAAATGTACCC 
GCCAACAGCTAAATGTACCCA CCAACAGCTAAATGTACCCAA CAACAGCTAAATGTACCCAAG AACAGCTAAATGTACCCAAGT ACAGCTAAATGTACCCAAGTG CAGCTAAATGTACCCAAGTGT AGCTAAATGTACCCAAGTGTT GCTAAATGTACCCAAGTGTTA CTAAATGTACCCAAGTGTTAC TAAATGTACCCAAGTGTTACT AAATGTACCCAAGTGTTACTG AATGTACCCAAGTGTTACTGA ATGTACCCAAGTGTTACTGAA TGTACCCAAGTGTTACTGAAC TACAGGCACCCACCATCATGC ACAGGCACCCACCATCATGCC CAGGCACCCACCATCATGCCC AGGCACCCACCATCATGCCCA GGCACCCACCATCATGCCCAG GCACCCACCATCATGCCCAGC CACCCACCATCATGCCCAGCT ACCCACCATCATGCCCAGCTA CCCACCATCATGCCCAGCTAA CCACCATCATGCCCAGCTAAT CACCATCATGCCCAGCTAATT ACCATCATGCCCAGCTAATTT CCATCATGCCCAGCTAATTTT CATCATGCCCAGCTAATTTTT ATCATGCCCAGCTAATTTTTG TCATGCCCAGCTAATTTTTGT CATGCCCAGCTAATTTTTGTA ATGCCCAGCTAATTTTTGTAT TGCCCAGCTAATTTTTGTATT GCCCAGCTAATTTTTGTATTT CCCAGCTAATTTTTGTATTTT CCAGCTAATTTTTGTATTTTT CAGCTAATTTTTGTATTTTTG AGCTAATTTTTGTATTTTTGT GCTAATTTTTGTATTTTTGTA CTAATTTTTGTATTTTTGTAG TAATTTTTGTATTTTTGTAGA AATTTTTGTATTTTTGTAGAA ATTTTTGTATTTTTGTAGAAA TTTTTGTATTTTTGTAGAAAC TTTTGTATTTTTGTAGAAACG TTTGTATTTTTGTAGAAACGG TTGTATTTTTGTAGAAACGGG TGTATTTTTGTAGAAACGGGG GTATTTTTGTAGAAACGGGGT TATTTTTGTAGAAACGGGGTT ATTTTTGTAGAAACGGGGTTT TTTTTGTAGAAACGGGGTTTC TTTTGTAGAAACGGGGTTTCA TTTGTAGAAACGGGGTTTCAC TTGTAGAAACGGGGTTTCACC TGTAGAAACGGGGTTTCACCA GTAGAAACGGGGTTTCACCAT TAGAAACGGGGTTTCACCATG AGAAACGGGGTTTCACCATGT GAAACGGGGTTTCACCATGTT AAACGGGGTTTCACCATGTTG AACGGGGTTTCACCATGTTGG ACGGGGTTTCACCATGTTGGC CGGGGTTTCACCATGTTGGCC GGGGTTTCACCATGTTGGCCC GGGTTTCACCATGTTGGCCCA GGTTTCACCATGTTGGCCCAG GTTTCACCATGTTGGCCCAGC TTTCACCATGTTGGCCCAGCT GCTCACTGCAGCCTCAACCTC CTCACTGCAGCCTCAACCTCC TCACTGCAGCCTCAACCTCCT CACTGCAGCCTCAACCTCCTG ACTGCAGCCTCAACCTCCTGG CTGCAGCCTCAACCTCCTGGG TGCAGCCTCAACCTCCTGGGC GCAGCCTCAACCTCCTGGGCC CAGCCTCAACCTCCTGGGCCC AGCCTCAACCTCCTGGGCCCA GCCTCAACCTCCTGGGCCCAA CCTCAACCTCCTGGGCCCAAG CTCAACCTCCTGGGCCCAAGT TCAACCTCCTGGGCCCAAGTG CAACCTCCTGGGCCCAAGTGA AACCTCCTGGGCCCAAGTGAT ACCTCCTGGGCCCAAGTGATT CCTCCTGGGCCCAAGTGATTT CTCCTGGGCCCAAGTGATTTC TCCTGGGCCCAAGTGATTTCA CCTGGGCCCAAGTGATTTCAT 
CTGGGCCCAAGTGATTTCATC TGGGCCCAAGTGATTTCATCT GGGCCCAAGTGATTTCATCTT GGCCCAAGTGATTTCATCTTA GCCCAAGTGATTTCATCTTAT CCCAAGTGATTTCATCTTATT CCAAGTGATTTCATCTTATTT CAAGTGATTTCATCTTATTTT AAGTGATTTCATCTTATTTTT AGTGATTTCATCTTATTTTTG GTGATTTCATCTTATTTTTGG TGATTTCATCTTATTTTTGGA GATTTCATCTTATTTTTGGAA ATTTCATCTTATTTTTGGAAA TTTCATCTTATTTTTGGAAAA TTCATCTTATTTTTGGAAAAA TCATCTTATTTTTGGAAAAAA CATCTTATTTTTGGAAAAAAA ATCTTATTTTTGGAAAAAAAA TCTTATTTTTGGAAAAAAAAA CTTATTTTTGGAAAAAAAAAC TTATTTTTGGAAAAAAAAACA TATTTTTGGAAAAAAAAACAA ATTTTTGGAAAAAAAAACAAA TTTTTGGAAAAAAAAACAAAC TTTTGGAAAAAAAAACAAACT TTTGGAAAAAAAAACAAACTA TTGGAAAAAAAAACAAACTAA TGGAAAAAAAAACAAACTAAA GGAAAAAAAAACAAACTAAAC GAAAAAAAAACAAACTAAACC AAAAAAAAACAAACTAAACCA AAAAAAAACAAACTAAACCAA AAAAAAACAAACTAAACCAAA TTTCTTTTTCTTTCTTTCTTT TTCTTTTTCTTTCTTTCTTTC TCTTTTTCTTTCTTTCTTTCT CTTTTTCTTTCTTTCTTTCTT TTTTTCTTTCTTTCTTTCTTT TTTTCTTTCTTTCTTTCTTTC TTTCTTTCTTTCTTTCTTTCT TTCTTTCTTTCTTTCTTTCTT TCTTTCTTTCTTTCTTTCTTT CTTTCTTTCTTTCTTTCTTTC TTTCTTTCTTTCTTTCTTTCT TTCTTTCTTTCTTTCTTTCTT TCTTTCTTTCTTTCTTTCTTT CTTTCTTTCTTTCTTTCTTTT TTTCTTTCTTTCTTTCTTTTT TTCTTTCTTTCTTTCTTTTTC TCTTTCTTTCTTTCTTTTTCT CTTTCTTTCTTTCTTTTTCTT TTTCTTTCTTTCTTTTTCTTT TTCTTTCTTTCTTTTTCTTTC TCTTTCTTTCTTTTTCTTTCT CTTTCTTTCTTTTTCTTTCTT TTTCTTTCTTTTTCTTTCTTT TTCTTTCTTTTTCTTTCTTTC TCTTTCTTTTTCTTTCTTTCT CTTTCTTTTTCTTTCTTTCTT TTTCTTTTTCTTTCTTTCTTT TTCTTTTTCTTTCTTTCTTTC TCTTTTTCTTTCTTTCTTTCT CTTTTTCTTTCTTTCTTTCTC TTTTTCTTTCTTTCTTTCTCT TTTTCTTTCTTTCTTTCTCTT TTTCTTTCTTTCTTTCTCTTT TTCTTTCTTTCTTTCTCTTTC TCTTTCTTTCTTTCTCTTTCT CTTTCTTTCTTTCTCTTTCTT TTTCTTTCTTTCTCTTTCTTT TTCTTTCTTTCTCTTTCTTTC TCTTTCTTTCTCTTTCTTTCT CTTTCTTTCTCTTTCTTTCTT TTTCTTTCTCTTTCTTTCTTT TTCTTTCTCTTTCTTTCTTTC TCTTTCTCTTTCTTTCTTTCT CTTTCTCTTTCTTTCTTTCTT TTTCTCTTTCTTTCTTTCTTT TTCTCTTTCTTTCTTTCTTTC TCTCTTTCTTTCTTTCTTTCT CTCTTTCTTTCTTTCTTTCTT TCTTTCTTTCTTTCTTTCTTT CTTTCTTTCTTTCTTTCTTTC TTTCTTTCTTTCTTTCTTTCT TTCTTTCTTTCTTTCTTTCTT TCTTTCTTTCTTTCTTTCTTT CTTTCTTTCTTTCTTTCTTTC TTTCTTTCTTTCTTTCTTTCT TCATGTAGCATGCATATGGCT 
CATGTAGCATGCATATGGCTA ATGTAGCATGCATATGGCTAA TGTAGCATGCATATGGCTAAC GTAGCATGCATATGGCTAACG TAGCATGCATATGGCTAACGG AGCATGCATATGGCTAACGGC GCATGCATATGGCTAACGGCA CATGCATATGGCTAACGGCAA ATGCATATGGCTAACGGCAAA TGCATATGGCTAACGGCAAAG GCATATGGCTAACGGCAAAGT CATATGGCTAACGGCAAAGTG ATATGGCTAACGGCAAAGTGA TATGGCTAACGGCAAAGTGAG ATGGCTAACGGCAAAGTGAGG TGGCTAACGGCAAAGTGAGGG GGCTAACGGCAAAGTGAGGGA GCTAACGGCAAAGTGAGGGAG CTAACGGCAAAGTGAGGGAGG TAACGGCAAAGTGAGGGAGGA AACGGCAAAGTGAGGGAGGAA ACGGCAAAGTGAGGGAGGAAT CGGCAAAGTGAGGGAGGAATA GGCAAAGTGAGGGAGGAATAA GCAAAGTGAGGGAGGAATAAT CAAAGTGAGGGAGGAATAATT AAAGTGAGGGAGGAATAATTA AAGTGAGGGAGGAATAATTAT AGTGAGGGAGGAATAATTATA GTGAGGGAGGAATAATTATAG TGAGGGAGGAATAATTATAGT GAGGGAGGAATAATTATAGTA AGGGAGGAATAATTATAGTAA GGGAGGAATAATTATAGTAAT GGAGGAATAATTATAGTAATA GAGGAATAATTATAGTAATAA AGGAATAATTATAGTAATAAT GGAATAATTATAGTAATAATC GAATAATTATAGTAATAATCA AATAATTATAGTAATAATCAC ATAATTATAGTAATAATCACA TAATTATAGTAATAATCACAG AATTATAGTAATAATCACAGT ATTATAGTAATAATCACAGTG TTATAGTAATAATCACAGTGA TATAGTAATAATCACAGTGAT ATAGTAATAATCACAGTGATG TAGTAATAATCACAGTGATGA AGTAATAATCACAGTGATGAC GTAATAATCACAGTGATGACG TAATAATCACAGTGATGACGT AATAATCACAGTGATGACGTG ATAATCACAGTGATGACGTGG TAATCACAGTGATGACGTGGA GCTCAGGCCTTGCAAGAATCT CTCAGGCCTTGCAAGAATCTC TCAGGCCTTGCAAGAATCTCT CAGGCCTTGCAAGAATCTCTA AGGCCTTGCAAGAATCTCTAC GGCCTTGCAAGAATCTCTACT GCCTTGCAAGAATCTCTACTG CCTTGCAAGAATCTCTACTGC CTTGCAAGAATCTCTACTGCC TTGCAAGAATCTCTACTGCCC TGCAAGAATCTCTACTGCCCA GCAAGAATCTCTACTGCCCAA CAAGAATCTCTACTGCCCAAC AAGAATCTCTACTGCCCAACA AGAATCTCTACTGCCCAACAA GAATCTCTACTGCCCAACAAG AATCTCTACTGCCCAACAAGT ATCTCTACTGCCCAACAAGTC TCTCTACTGCCCAACAAGTCC CTCTACTGCCCAACAAGTCCC TCTACTGCCCAACAAGTCCCT CTACTGCCCAACAAGTCCCTA TACTGCCCAACAAGTCCCTAC ACTGCCCAACAAGTCCCTACA CTGCCCAACAAGTCCCTACAA TGCCCAACAAGTCCCTACAAG GCCCAACAAGTCCCTACAAGA CCCAACAAGTCCCTACAAGAT CCAACAAGTCCCTACAAGATG CAACAAGTCCCTACAAGATGG AACAAGTCCCTACAAGATGGC ACAAGTCCCTACAAGATGGCA CAAGTCCCTACAAGATGGCAT AAGTCCCTACAAGATGGCATT AGTCCCTACAAGATGGCATTT GTCCCTACAAGATGGCATTTA 
TCCCTACAAGATGGCATTTAA CCCTACAAGATGGCATTTAAA CCTACAAGATGGCATTTAAAA CTACAAGATGGCATTTAAAAG TACAAGATGGCATTTAAAAGC ACAAGATGGCATTTAAAAGCA CAAGATGGCATTTAAAAGCAG AAGATGGCATTTAAAAGCAGT AGATGGCATTTAAAAGCAGTC GATGGCATTTAAAAGCAGTCC ATGGCATTTAAAAGCAGTCCC TGGCATTTAAAAGCAGTCCCT GGCATTTAAAAGCAGTCCCTC GCATTTAAAAGCAGTCCCTCA CATTTAAAAGCAGTCCCTCAC ATTTAAAAGCAGTCCCTCACG TTTAAAAGCAGTCCCTCACGC TTAAAAGCAGTCCCTCACGCA TAAAAGCAGTCCCTCACGCAC CCTAGAGAAGCTCCCACTAGG CTAGAGAAGCTCCCACTAGGG TAGAGAAGCTCCCACTAGGGC AGAGAAGCTCCCACTAGGGCT GAGAAGCTCCCACTAGGGCTG AGAAGCTCCCACTAGGGCTGC GAAGCTCCCACTAGGGCTGCA AAGCTCCCACTAGGGCTGCAG AGCTCCCACTAGGGCTGCAGT GCTCCCACTAGGGCTGCAGTC CTCCCACTAGGGCTGCAGTCA TCCCACTAGGGCTGCAGTCAA CCCACTAGGGCTGCAGTCAAT CCACTAGGGCTGCAGTCAATT CACTAGGGCTGCAGTCAATTC ACTAGGGCTGCAGTCAATTCC CTAGGGCTGCAGTCAATTCCC TAGGGCTGCAGTCAATTCCCA AGGGCTGCAGTCAATTCCCAG GGGCTGCAGTCAATTCCCAGG GGCTGCAGTCAATTCCCAGGT GCTGCAGTCAATTCCCAGGTC CTGCAGTCAATTCCCAGGTCT TGCAGTCAATTCCCAGGTCTT GCAGTCAATTCCCAGGTCTTA CAGTCAATTCCCAGGTCTTAG AGTCAATTCCCAGGTCTTAGG GTCAATTCCCAGGTCTTAGGT TCAATTCCCAGGTCTTAGGTG CAATTCCCAGGTCTTAGGTGC AATTCCCAGGTCTTAGGTGCT ATTCCCAGGTCTTAGGTGCTG TTCCCAGGTCTTAGGTGCTGA TCCCAGGTCTTAGGTGCTGAG CCCAGGTCTTAGGTGCTGAGC CCAGGTCTTAGGTGCTGAGCA CAGGTCTTAGGTGCTGAGCAG AGGTCTTAGGTGCTGAGCAGT GGTCTTAGGTGCTGAGCAGTG GTCTTAGGTGCTGAGCAGTGG TCTTAGGTGCTGAGCAGTGGG CTTAGGTGCTGAGCAGTGGGA TTAGGTGCTGAGCAGTGGGAG TAGGTGCTGAGCAGTGGGAGG AGGTGCTGAGCAGTGGGAGGT GGTGCTGAGCAGTGGGAGGTG GTGCTGAGCAGTGGGAGGTGG TGCTGAGCAGTGGGAGGTGGT GCTGAGCAGTGGGAGGTGGTG CTGAGCAGTGGGAGGTGGTGG TGAGCAGTGGGAGGTGGTGGC GAGCAGTGGGAGGTGGTGGCC AGCAGTGGGAGGTGGTGGCCA GCAGTGGGAGGTGGTGGCCAT CAGTGGGAGGTGGTGGCCATG AAATAAAGTTTGGCTTTCAGT AATAAAGTTTGGCTTTCAGTT ATAAAGTTTGGCTTTCAGTTG TAAAGTTTGGCTTTCAGTTGT AAAGTTTGGCTTTCAGTTGTA AAGTTTGGCTTTCAGTTGTAA AGTTTGGCTTTCAGTTGTAAC GTTTGGCTTTCAGTTGTAACT TTTGGCTTTCAGTTGTAACTT TTGGCTTTCAGTTGTAACTTT TGGCTTTCAGTTGTAACTTTG GGCTTTCAGTTGTAACTTTGA GCTTTCAGTTGTAACTTTGAA CTTTCAGTTGTAACTTTGAAT TTTCAGTTGTAACTTTGAATA TTCAGTTGTAACTTTGAATAT 
TCAGTTGTAACTTTGAATATC CAGTTGTAACTTTGAATATCT AGTTGTAACTTTGAATATCTT GTTGTAACTTTGAATATCTTT TTGTAACTTTGAATATCTTTA TGTAACTTTGAATATCTTTAT GTAACTTTGAATATCTTTATC TAACTTTGAATATCTTTATCA AACTTTGAATATCTTTATCAC ACTTTGAATATCTTTATCACA CTTTGAATATCTTTATCACAG TTTGAATATCTTTATCACAGT TTGAATATCTTTATCACAGTT TGAATATCTTTATCACAGTTA GAATATCTTTATCACAGTTAT AATATCTTTATCACAGTTATT ATATCTTTATCACAGTTATTT TATCTTTATCACAGTTATTTA ATCTTTATCACAGTTATTTAA TCTTTATCACAGTTATTTAAA CTTTATCACAGTTATTTAAAG TTTATCACAGTTATTTAAAGC TTATCACAGTTATTTAAAGCC TATCACAGTTATTTAAAGCCT ATCACAGTTATTTAAAGCCTT TCACAGTTATTTAAAGCCTTT CACAGTTATTTAAAGCCTTTA ACAGTTATTTAAAGCCTTTAA CAGTTATTTAAAGCCTTTAAA AGTTATTTAAAGCCTTTAAAA GTTATTTAAAGCCTTTAAAAA TTATTTAAAGCCTTTAAAAAG TATTTAAAGCCTTTAAAAAGC ATTTAAAGCCTTTAAAAAGCT TTTAAAGCCTTTAAAAAGCTT TTAAAGCCTTTAAAAAGCTTT TAAAGCCTTTAAAAAGCTTTA AAAGCCTTTAAAAAGCTTTAA AAGCCTTTAAAAAGCTTTAAT TGTGTAACTAACATAATTGGC GTGTAACTAACATAATTGGCA TGTAACTAACATAATTGGCAC GTAACTAACATAATTGGCACT TAACTAACATAATTGGCACTG AACTAACATAATTGGCACTGT ACTAACATAATTGGCACTGTC CTAACATAATTGGCACTGTCC TAACATAATTGGCACTGTCCC AACATAATTGGCACTGTCCCT ACATAATTGGCACTGTCCCTG CATAATTGGCACTGTCCCTGT ATAATTGGCACTGTCCCTGTA TAATTGGCACTGTCCCTGTAA AATTGGCACTGTCCCTGTAAA ATTGGCACTGTCCCTGTAAAT TTGGCACTGTCCCTGTAAATT TGGCACTGTCCCTGTAAATTC GGCACTGTCCCTGTAAATTCA GCACTGTCCCTGTAAATTCAA CACTGTCCCTGTAAATTCAAA ACTGTCCCTGTAAATTCAAAT CTGTCCCTGTAAATTCAAATT TGTCCCTGTAAATTCAAATTG GTCCCTGTAAATTCAAATTGG TCCCTGTAAATTCAAATTGGA CCCTGTAAATTCAAATTGGAT CCTGTAAATTCAAATTGGATA CTGTAAATTCAAATTGGATAT TGTAAATTCAAATTGGATATC GTAAATTCAAATTGGATATCC TAAATTCAAATTGGATATCCT AAATTCAAATTGGATATCCTC AATTCAAATTGGATATCCTCC ATTCAAATTGGATATCCTCCC TTCAAATTGGATATCCTCCCA TCAAATTGGATATCCTCCCAA CAAATTGGATATCCTCCCAAA AAATTGGATATCCTCCCAAAT AATTGGATATCCTCCCAAATT ATTGGATATCCTCCCAAATTT TTGGATATCCTCCCAAATTTT TGGATATCCTCCCAAATTTTA GGATATCCTCCCAAATTTTAT GATATCCTCCCAAATTTTATT ATATCCTCCCAAATTTTATTT TATCCTCCCAAATTTTATTTA ATCCTCCCAAATTTTATTTAA TCCTCCCAAATTTTATTTAAG CCTCCCAAATTTTATTTAAGC CTCCCAAATTTTATTTAAGCA 
TCCCAAATTTTATTTAAGCAA CCCAAATTTTATTTAAGCAAT CCAAATTTTATTTAAGCAATT CAAATTTTATTTAAGCAATTG TTTATTTTTTGAGCATGAAAG TTATTTTTTGAGCATGAAAGT TATTTTTTGAGCATGAAAGTA ATTTTTTGAGCATGAAAGTAA TTTTTTGAGCATGAAAGTAAT TTTTTGAGCATGAAAGTAATA TTTTGAGCATGAAAGTAATAT TTTGAGCATGAAAGTAATATA TTGAGCATGAAAGTAATATAT TGAGCATGAAAGTAATATATG GAGCATGAAAGTAATATATGC AGCATGAAAGTAATATATGCT GCATGAAAGTAATATATGCTC CATGAAAGTAATATATGCTCA ATGAAAGTAATATATGCTCAG TGAAAGTAATATATGCTCAGT GAAAGTAATATATGCTCAGTG AAAGTAATATATGCTCAGTGT AAGTAATATATGCTCAGTGTA AGTAATATATGCTCAGTGTAA GTAATATATGCTCAGTGTAAA TAATATATGCTCAGTGTAAAC AATATATGCTCAGTGTAAACA ATATATGCTCAGTGTAAACAA TATATGCTCAGTGTAAACAAT ATATGCTCAGTGTAAACAATT TATGCTCAGTGTAAACAATTA ATGCTCAGTGTAAACAATTAG TGCTCAGTGTAAACAATTAGG GCTCAGTGTAAACAATTAGGT CTCAGTGTAAACAATTAGGTC TCAGTGTAAACAATTAGGTCA CAGTGTAAACAATTAGGTCAT AGTGTAAACAATTAGGTCATT GTGTAAACAATTAGGTCATTA TGTAAACAATTAGGTCATTAT GTAAACAATTAGGTCATTATA TAAACAATTAGGTCATTATAA AAACAATTAGGTCATTATAAA AACAATTAGGTCATTATAAAT ACAATTAGGTCATTATAAATA CAATTAGGTCATTATAAATAT AATTAGGTCATTATAAATATA ATTAGGTCATTATAAATATAT TTAGGTCATTATAAATATATT TAGGTCATTATAAATATATTT AGGTCATTATAAATATATTTA GGTCATTATAAATATATTTAA GTCATTATAAATATATTTAAC TCATTATAAATATATTTAACA CATTATAAATATATTTAACAG ATTATAAATATATTTAACAGG TTATAAATATATTTAACAGGA TATAAATATATTTAACAGGAA ATAAATATATTTAACAGGAAT CTCAGGTGATCCACCCGCCTC TCAGGTGATCCACCCGCCTCG CAGGTGATCCACCCGCCTCGG AGGTGATCCACCCGCCTCGGC GGTGATCCACCCGCCTCGGCC GTGATCCACCCGCCTCGGCCT TGATCCACCCGCCTCGGCCTC GATCCACCCGCCTCGGCCTCC ATCCACCCGCCTCGGCCTCCC TCCACCCGCCTCGGCCTCCCA CCACCCGCCTCGGCCTCCCAA CACCCGCCTCGGCCTCCCAAA ACCCGCCTCGGCCTCCCAAAG CCCGCCTCGGCCTCCCAAAGT CCGCCTCGGCCTCCCAAAGTG CGCCTCGGCCTCCCAAAGTGC GCCTCGGCCTCCCAAAGTGCT CCTCGGCCTCCCAAAGTGCTG CTCGGCCTCCCAAAGTGCTGG TCGGCCTCCCAAAGTGCTGGG CGGCCTCCCAAAGTGCTGGGA GGCCTCCCAAAGTGCTGGGAC GCCTCCCAAAGTGCTGGGACT CCTCCCAAAGTGCTGGGACTA CTCCCAAAGTGCTGGGACTAC TCCCAAAGTGCTGGGACTACA CCCAAAGTGCTGGGACTACAG CCAAAGTGCTGGGACTACAGG CAAAGTGCTGGGACTACAGGC AAAGTGCTGGGACTACAGGCA AAGTGCTGGGACTACAGGCAT 
AGTGCTGGGACTACAGGCATG GTGCTGGGACTACAGGCATGA TGCTGGGACTACAGGCATGAG GCTGGGACTACAGGCATGAGG CTGGGACTACAGGCATGAGGC TGGGACTACAGGCATGAGGCA GGGACTACAGGCATGAGGCAC GGACTACAGGCATGAGGCACC GACTACAGGCATGAGGCACCG ACTACAGGCATGAGGCACCGC CTACAGGCATGAGGCACCGCG TACAGGCATGAGGCACCGCGC ACAGGCATGAGGCACCGCGCC CAGGCATGAGGCACCGCGCCT AGGCATGAGGCACCGCGCCTG GGCATGAGGCACCGCGCCTGG GCATGAGGCACCGCGCCTGGC CATGAGGCACCGCGCCTGGCC ATGAGGCACCGCGCCTGGCCA TGAGGCACCGCGCCTGGCCAG GAGGCACCGCGCCTGGCCAGG AGGCACCGCGCCTGGCCAGGA GGCACCGCGCCTGGCCAGGAC GCACCGCGCCTGGCCAGGACT GACAAGATAGTACTTGAGCTA ACAAGATAGTACTTGAGCTAA CAAGATAGTACTTGAGCTAAG AAGATAGTACTTGAGCTAAGC AGATAGTACTTGAGCTAAGCC GATAGTACTTGAGCTAAGCCT ATAGTACTTGAGCTAAGCCTT TAGTACTTGAGCTAAGCCTTG AGTACTTGAGCTAAGCCTTGC GTACTTGAGCTAAGCCTTGCA TACTTGAGCTAAGCCTTGCAG ACTTGAGCTAAGCCTTGCAGG CTTGAGCTAAGCCTTGCAGGT TTGAGCTAAGCCTTGCAGGTT TGAGCTAAGCCTTGCAGGTTG GAGCTAAGCCTTGCAGGTTGA AGCTAAGCCTTGCAGGTTGAG GCTAAGCCTTGCAGGTTGAGT CTAAGCCTTGCAGGTTGAGTA TAAGCCTTGCAGGTTGAGTAG AAGCCTTGCAGGTTGAGTAGG AGCCTTGCAGGTTGAGTAGGA GCCTTGCAGGTTGAGTAGGAT CCTTGCAGGTTGAGTAGGATT CTTGCAGGTTGAGTAGGATTA TTGCAGGTTGAGTAGGATTAT TGCAGGTTGAGTAGGATTATT GCAGGTTGAGTAGGATTATTC CAGGTTGAGTAGGATTATTCT AGGTTGAGTAGGATTATTCTA GGTTGAGTAGGATTATTCTAG GTTGAGTAGGATTATTCTAGT TTGAGTAGGATTATTCTAGTG TGAGTAGGATTATTCTAGTGG GAGTAGGATTATTCTAGTGGA AGTAGGATTATTCTAGTGGAA GTAGGATTATTCTAGTGGAAT TAGGATTATTCTAGTGGAATT AGGATTATTCTAGTGGAATTT GGATTATTCTAGTGGAATTTA GATTATTCTAGTGGAATTTAG ATTATTCTAGTGGAATTTAGG TTATTCTAGTGGAATTTAGGG TATTCTAGTGGAATTTAGGGA ATTCTAGTGGAATTTAGGGAA TTCTAGTGGAATTTAGGGAAA TCTAGTGGAATTTAGGGAAAC CTAGTGGAATTTAGGGAAACG TAGTGGAATTTAGGGAAACGA AGTGGAATTTAGGGAAACGAT GTGGAATTTAGGGAAACGATG TGGAATTTAGGGAAACGATGT GGAATTTAGGGAAACGATGTG GAATTTAGGGAAACGATGTGC AATTTAGGGAAACGATGTGCA CTACTCTCATTGACTGTTCAA TACTCTCATTGACTGTTCAAT ACTCTCATTGACTGTTCAATG CTCTCATTGACTGTTCAATGC TCTCATTGACTGTTCAATGCC CTCATTGACTGTTCAATGCCT TCATTGACTGTTCAATGCCTA CATTGACTGTTCAATGCCTAT ATTGACTGTTCAATGCCTATA TTGACTGTTCAATGCCTATAC TGACTGTTCAATGCCTATACA 
GACTGTTCAATGCCTATACAA ACTGTTCAATGCCTATACAAG CTGTTCAATGCCTATACAAGT TGTTCAATGCCTATACAAGTA GTTCAATGCCTATACAAGTAA TTCAATGCCTATACAAGTAAA TCAATGCCTATACAAGTAAAA CAATGCCTATACAAGTAAAAC AATGCCTATACAAGTAAAACT ATGCCTATACAAGTAAAACTT TGCCTATACAAGTAAAACTTT GCCTATACAAGTAAAACTTTA CCTATACAAGTAAAACTTTAC CTATACAAGTAAAACTTTACC TATACAAGTAAAACTTTACCA ATACAAGTAAAACTTTACCAG TACAAGTAAAACTTTACCAGC ACAAGTAAAACTTTACCAGCA CAAGTAAAACTTTACCAGCAC AAGTAAAACTTTACCAGCACC AGTAAAACTTTACCAGCACCC GTAAAACTTTACCAGCACCCA TAAAACTTTACCAGCACCCAA AAAACTTTACCAGCACCCAAG AAACTTTACCAGCACCCAAGT AACTTTACCAGCACCCAAGTC ACTTTACCAGCACCCAAGTCA CTTTACCAGCACCCAAGTCAA TTTACCAGCACCCAAGTCAAA TTACCAGCACCCAAGTCAAAA TACCAGCACCCAAGTCAAAAA ACCAGCACCCAAGTCAAAAAG CCAGCACCCAAGTCAAAAAGA CAGCACCCAAGTCAAAAAGAA AGCACCCAAGTCAAAAAGAAA GCACCCAAGTCAAAAAGAAAA CACCCAAGTCAAAAAGAAAAA ACCCAAGTCAAAAAGAAAAAA CCCAAGTCAAAAAGAAAAAAA CCAAGTCAAAAAGAAAAAAAA CAAGTCAAAAAGAAAAAAAAG AAGTCAAAAAGAAAAAAAAGG AGTCAAAAAGAAAAAAAAGGG GTCAAAAAGAAAAAAAAGGGG CTCATTCTCTCTCCTGCTGCA TCATTCTCTCTCCTGCTGCAC CATTCTCTCTCCTGCTGCACT ATTCTCTCTCCTGCTGCACTG TTCTCTCTCCTGCTGCACTGT TCTCTCTCCTGCTGCACTGTG CTCTCTCCTGCTGCACTGTGA TCTCTCCTGCTGCACTGTGAA CTCTCCTGCTGCACTGTGAAG TCTCCTGCTGCACTGTGAAGA CTCCTGCTGCACTGTGAAGAG TCCTGCTGCACTGTGAAGAGG CCTGCTGCACTGTGAAGAGGT CTGCTGCACTGTGAAGAGGTG TGCTGCACTGTGAAGAGGTGC GCTGCACTGTGAAGAGGTGCC CTGCACTGTGAAGAGGTGCCT TGCACTGTGAAGAGGTGCCTG GCACTGTGAAGAGGTGCCTGT CACTGTGAAGAGGTGCCTGTT ACTGTGAAGAGGTGCCTGTTG CTGTGAAGAGGTGCCTGTTGC TGTGAAGAGGTGCCTGTTGCC GTGAAGAGGTGCCTGTTGCCA TGAAGAGGTGCCTGTTGCCAA GAAGAGGTGCCTGTTGCCAAG AAGAGGTGCCTGTTGCCAAGA AGAGGTGCCTGTTGCCAAGAG GAGGTGCCTGTTGCCAAGAGT AGGTGCCTGTTGCCAAGAGTA GGTGCCTGTTGCCAAGAGTAT GTGCCTGTTGCCAAGAGTATA TGCCTGTTGCCAAGAGTATAA GCCTGTTGCCAAGAGTATAAG CCTGTTGCCAAGAGTATAAGT CTGTTGCCAAGAGTATAAGTT TGTTGCCAAGAGTATAAGTTT GTTGCCAAGAGTATAAGTTTC TTGCCAAGAGTATAAGTTTCC TGCCAAGAGTATAAGTTTCCT GCCAAGAGTATAAGTTTCCTG CCAAGAGTATAAGTTTCCTGA CAAGAGTATAAGTTTCCTGAG AAGAGTATAAGTTTCCTGAGG AGAGTATAAGTTTCCTGAGGC GAGTATAAGTTTCCTGAGGCC 
AGTATAAGTTTCCTGAGGCCT GTATAAGTTTCCTGAGGCCTC TATAAGTTTCCTGAGGCCTCC ATAAGTTTCCTGAGGCCTCCC TAAGTTTCCTGAGGCCTCCCA AAGTTTCCTGAGGCCTCCCAG AGTTTCCTGAGGCCTCCCAGG GTTTCCTGAGGCCTCCCAGGC TTTCCTGAGGCCTCCCAGGCC AAATTAAACAGCTCGTTTAAC AATTAAACAGCTCGTTTAACT ATTAAACAGCTCGTTTAACTG TTAAACAGCTCGTTTAACTGA TAAACAGCTCGTTTAACTGAT AAACAGCTCGTTTAACTGATA AACAGCTCGTTTAACTGATAA ACAGCTCGTTTAACTGATAAT CAGCTCGTTTAACTGATAATC AGCTCGTTTAACTGATAATCC GCTCGTTTAACTGATAATCCA CTCGTTTAACTGATAATCCAT TCGTTTAACTGATAATCCATA CGTTTAACTGATAATCCATAC GTTTAACTGATAATCCATACT TTTAACTGATAATCCATACTA TTAACTGATAATCCATACTAT TAACTGATAATCCATACTATA AACTGATAATCCATACTATAT ACTGATAATCCATACTATATT CTGATAATCCATACTATATTT TGATAATCCATACTATATTTG GATAATCCATACTATATTTGA ATAATCCATACTATATTTGAG TAATCCATACTATATTTGAGT AATCCATACTATATTTGAGTA ATCCATACTATATTTGAGTAG TCCATACTATATTTGAGTAGG CCATACTATATTTGAGTAGGG CATACTATATTTGAGTAGGGC ATACTATATTTGAGTAGGGCT TACTATATTTGAGTAGGGCTG ACTATATTTGAGTAGGGCTGT CTATATTTGAGTAGGGCTGTC TATATTTGAGTAGGGCTGTCA ATATTTGAGTAGGGCTGTCAC TATTTGAGTAGGGCTGTCACA ATTTGAGTAGGGCTGTCACAT TTTGAGTAGGGCTGTCACATG TTGAGTAGGGCTGTCACATGG TGAGTAGGGCTGTCACATGGT GAGTAGGGCTGTCACATGGTT AGTAGGGCTGTCACATGGTTG GTAGGGCTGTCACATGGTTGG TAGGGCTGTCACATGGTTGGA AGGGCTGTCACATGGTTGGAA GGGCTGTCACATGGTTGGAAC GGCTGTCACATGGTTGGAACC GCTGTCACATGGTTGGAACCT CTGTCACATGGTTGGAACCTC TGTCACATGGTTGGAACCTCC GTCACATGGTTGGAACCTCCG TCACATGGTTGGAACCTCCGG CACATGGTTGGAACCTCCGGT ACATGGTTGGAACCTCCGGTT AGACTGGGTCTCACTATGTTG GACTGGGTCTCACTATGTTGC ACTGGGTCTCACTATGTTGCC CTGGGTCTCACTATGTTGCCT TGGGTCTCACTATGTTGCCTA GGGTCTCACTATGTTGCCTAG GGTCTCACTATGTTGCCTAGG GTCTCACTATGTTGCCTAGGC TCTCACTATGTTGCCTAGGCT CTCACTATGTTGCCTAGGCTG TCACTATGTTGCCTAGGCTGG CACTATGTTGCCTAGGCTGGT ACTATGTTGCCTAGGCTGGTC CTATGTTGCCTAGGCTGGTCT TATGTTGCCTAGGCTGGTCTC ATGTTGCCTAGGCTGGTCTCA TGTTGCCTAGGCTGGTCTCAA GTTGCCTAGGCTGGTCTCAAA TTGCCTAGGCTGGTCTCAAAC TGCCTAGGCTGGTCTCAAACT GCCTAGGCTGGTCTCAAACTC CCTAGGCTGGTCTCAAACTCC CTAGGCTGGTCTCAAACTCCT TAGGCTGGTCTCAAACTCCTG AGGCTGGTCTCAAACTCCTGG GGCTGGTCTCAAACTCCTGGG 
GCTGGTCTCAAACTCCTGGGC CTGGTCTCAAACTCCTGGGCT TGGTCTCAAACTCCTGGGCTC GGTCTCAAACTCCTGGGCTCA GTCTCAAACTCCTGGGCTCAA TCTCAAACTCCTGGGCTCAAG CTCAAACTCCTGGGCTCAAGT TCAAACTCCTGGGCTCAAGTG CAAACTCCTGGGCTCAAGTGA AAACTCCTGGGCTCAAGTGAT AACTCCTGGGCTCAAGTGATC ACTCCTGGGCTCAAGTGATCC CTCCTGGGCTCAAGTGATCCA TCCTGGGCTCAAGTGATCCAT CCTGGGCTCAAGTGATCCATC CTGGGCTCAAGTGATCCATCT TGGGCTCAAGTGATCCATCTC GGGCTCAAGTGATCCATCTCT GGCTCAAGTGATCCATCTCTG GCTCAAGTGATCCATCTCTGC CTCAAGTGATCCATCTCTGCC TCAAGTGATCCATCTCTGCCT CAAGTGATCCATCTCTGCCTT AAGTGATCCATCTCTGCCTTC AGTGATCCATCTCTGCCTTCC GTGATCCATCTCTGCCTTCCA TGATCCATCTCTGCCTTCCAA GATCCATCTCTGCCTTCCAAA ATCCATCTCTGCCTTCCAAAG
sliding函数:
Sequence:75(length) =>1 GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA K-mer:21(length)=>55 GTATAAGAGCAGCCTTATTCC TATAAGAGCAGCCTTATTCCT ATAAGAGCAGCCTTATTCCTA TAAGAGCAGCCTTATTCCTAT AAGAGCAGCCTTATTCCTATT AGAGCAGCCTTATTCCTATTT GAGCAGCCTTATTCCTATTTA AGCAGCCTTATTCCTATTTAT GCAGCCTTATTCCTATTTATA CAGCCTTATTCCTATTTATAA AGCCTTATTCCTATTTATAAT GCCTTATTCCTATTTATAATC CCTTATTCCTATTTATAATCA CTTATTCCTATTTATAATCAG TTATTCCTATTTATAATCAGG TATTCCTATTTATAATCAGGG ATTCCTATTTATAATCAGGGT TTCCTATTTATAATCAGGGTG TCCTATTTATAATCAGGGTGA CCTATTTATAATCAGGGTGAA CTATTTATAATCAGGGTGAAA TATTTATAATCAGGGTGAAAC ATTTATAATCAGGGTGAAACA TTTATAATCAGGGTGAAACAC TTATAATCAGGGTGAAACACC TATAATCAGGGTGAAACACCT ATAATCAGGGTGAAACACCTG TAATCAGGGTGAAACACCTGT AATCAGGGTGAAACACCTGTG ATCAGGGTGAAACACCTGTGC TCAGGGTGAAACACCTGTGCC CAGGGTGAAACACCTGTGCCA AGGGTGAAACACCTGTGCCAA GGGTGAAACACCTGTGCCAAT GGTGAAACACCTGTGCCAATG GTGAAACACCTGTGCCAATGC TGAAACACCTGTGCCAATGCC GAAACACCTGTGCCAATGCCA AAACACCTGTGCCAATGCCAA AACACCTGTGCCAATGCCAAG ACACCTGTGCCAATGCCAAGA CACCTGTGCCAATGCCAAGAC ACCTGTGCCAATGCCAAGACA CCTGTGCCAATGCCAAGACAG CTGTGCCAATGCCAAGACAGG TGTGCCAATGCCAAGACAGGG GTGCCAATGCCAAGACAGGGG TGCCAATGCCAAGACAGGGGT GCCAATGCCAAGACAGGGGTG CCAATGCCAAGACAGGGGTGC CAATGCCAAGACAGGGGTGCC AATGCCAAGACAGGGGTGCCA ATGCCAAGACAGGGGTGCCAA TGCCAAGACAGGGGTGCCAAG GCCAAGACAGGGGTGCCAAGA k-mer.count=Sequence.length-k+1=75-21+1=55
4.其他部分:很好理解,就是对3中sliding得到的每个k-mer串进行map,每个计数记为1(即(k, 1L)),然后通过reduceByKey把相同k-mer的计数相加,最后按计数降序排序:具体是先用map(_.swap)将k-v交换为(计数, k-mer),然后再sortByKey(ascending = false)
val kmers =reads.flatMap(_.getSequence.sliding(21).map(k => (k, 1L))).reduceByKey(_ + _).map(_.swap).sortByKey(ascending = false)
kmers.take(10).foreach(println)
打印和保存:
<pre name="code" class="plain" style="font-size: 13.3333px;">
<pre name="code" class="plain" style="font-size: 13.3333px;">scala> kmers.foreach(println)
<span style="font-family: Arial, Helvetica, sans-serif; font-size: 12px;">scala> kmers.saveAsTextFile("hdfs://219.219.220.149:9000/xubo/adam/output/smallkmers.adam")</span>
(4,TCTTTCTTTCTTTCTTTCTTT) (4,TTTCTTTCTTTCTTTCTTTCT) (3,CTTTCTTTCTTTCTTTCTTTC) (3,TTCTTTCTTTCTTTCTTTCTT) (2,TCTTTTTCTTTCTTTCTTTCT) (2,TTCTTTTTCTTTCTTTCTTTC) (2,TTTCTTTTTCTTTCTTTCTTT) (1,ATTGGATATCCTCCCAAATTT) (1,AGGCATGAGGCACCGCGCCTG) (1,CTACTGCCCAACAAGTCCCTA) (1,TGGAATTTAGGGAAACGATGT) (1,GGGCTGCAGTCAATTCCCAGG) (1,ATGTAGCATGCATATGGCTAA) (1,TGTAGCATGCATATGGCTAAC) (1,CTCTCCTGCTGCACTGTGAAG) (1,GTTCTTCAGCTTTAGGACTTG) (1,TTCTTTCTCTTTCTTTCTTTC) (1,AACTGATAATCCATACTATAT) (1,GAAGCTCCCACTAGGGCTGCA) (1,GTAAACAATTAGGTCATTATA) (1,ATTTCATCTTATTTTTGGAAA) (1,TTTAAAAGCAGTCCCTCACGC) (1,GTTTTGAGATGGAGTCTCGCT) (1,TTTGTATTTTTGTAGAAACGG) (1,GTGCTGGGACTACAGGCATGA) (1,GCTCACTGCAGCCTCAACCTC) (1,TGTATCTTCCTCCCCTGCTGT) (1,AACTAACATAATTGGCACTGT) (1,CATTCTCTCTCCTGCTGCACT) (1,ATTATAGTAATAATCACAGTG) (1,ACCCAAGTCAAAAAGAAAAAA) (1,TTTTGTATTTTTGTAGAAACG) (1,CAGGCCTTGCAAGAATCTCTA) (1,ACTTGAGCTAAGCCTTGCAGG) (1,TGCCCAGCTAATTTTTGTATT) (1,TGATTTCATCTTATTTTTGGA) (1,TTAAAAGCAGTCCCTCACGCA) (1,CCACGTTCTTCAGCTTTAGGA) (1,AGGGCTGCAGTCAATTCCCAG) (1,TTTCTTTCTTTCTCTTTCTTT) (1,GGGTGAAACACCTGTGCCAAT) (1,CCATCATGCCCAGCTAATTTT) (1,CCCTGCTGTATGTTTCCTGCC) (1,CTGCAGCCTCAACCTCCTGGG) (1,GCTGCCAGCCCTCACAATGCC) (1,GTTTGTTTGTTTTGAGATGGA) (1,ATTAGGTCATTATAAATATAT) (1,TTCAATGCCTATACAAGTAAA) (1,CAAACATCACACTCCACGTTC) (1,TATTTTTAAGGTTTTTTTTGT) (1,TAGTACTTGAGCTAAGCCTTG) (1,CAATGCCAACAGCTAAATGTA) (1,TCCTCCCAAATTTTATTTAAG) (1,CTTTCTTTCTTTCTTTTTCTT) (1,AATGTTGATTGTCCTATTTAA) (1,AAGAGTATAAGTTTCCTGAGG) (1,GCAAGAATCTCTACTGCCCAA) (1,TTCAAATTGGATATCCTCCCA) (1,CTCCCAAATTTTATTTAAGCA) (1,CCAGCACCCAAGTCAAAAAGA) (1,TAGGTGCTGAGCAGTGGGAGG) (1,TGAGCAGTGGGAGGTGGTGGC) (1,TGTTGATTGTCCTATTTAATT) (1,CCTCAAACATCACACTCCACG) (1,TTTATTTTTATTTTTAAGGTT) (1,GGAGGAATAATTATAGTAATA) (1,GTAGCATGCATATGGCTAACG) (1,TGATTGTCCTATTTAATTATT) (1,TTTTTTTTGTTTGTTTGTTTT) (1,AAATAAAGTTTGGCTTTCAGT) (1,CAATTAGGTCATTATAAATAT) (1,CTCCCAAAGTGCTGGGACTAC) (1,GAGATGGAGTCTCGCTCCACC) (1,TCGGCCTCCCAAAGTGCTGGG) (1,GGCATTTAAAAGCAGTCCCTC) (1,ATGTACCCAAGTGTTACTGAA) 
(1,GTATGTTTCCTGCCCTCAAAC) (1,TTATTCTCAACTTTCCGATTT) (1,GCAGGTTGAGTAGGATTATTC) (1,TGGGTCTCACTATGTTGCCTA) (1,TTTATAATCAGGGTGAAACAC) (1,CCCACCATCATGCCCAGCTAA) (1,TCCCAGGTCTTAGGTGCTGAG) (1,GCACCCAAGTCAAAAAGAAAA) (1,GTGCCTGTTGCCAAGAGTATA) (1,CCTTGCAGGTTGAGTAGGATT) (1,GTATAAGTTTCCTGAGGCCTC) (1,TGTACCCAAGTGTTACTGAAC) (1,CTCCCACTAGGGCTGCAGTCA) (1,CACCCAAGTCAAAAAGAAAAA) (1,ATTCCTATTTATAATCAGGGT) (1,GCAGCCTTATTCCTATTTATA) (1,TCCTCCCCTGCTGTATGTTTC) (1,ATTTCCCATGTAACAGTGTTG) (1,TCTCGCTCCACCGCCCAGACT) (1,AGGAATAATTATAGTAATAAT) (1,TCCCCTGCTGTATGTTTCCTG) (1,GCCTTATTCCTATTTATAATC) (1,ATATATGCTCAGTGTAAACAA) (1,GAGGAATAATTATAGTAATAA) (1,TTTACCAGCACCCAAGTCAAA) (1,GCCCAAGTGATTTCATCTTAT) (1,CACTCCACGTTCTTCAGCTTT) (1,TCCCATGTAACAGTGTTGTTT) (1,AATTGGCACTGTCCCTGTAAA) (1,TGGCATTTAAAAGCAGTCCCT) (1,TACAGGCATGAGGCACCGCGC) (1,GACTGGGTCTCACTATGTTGC) (1,ACAGTTATTTAAAGCCTTTAA) (1,AAAACTTTACCAGCACCCAAG) (1,GTTATTTAAAGCCTTTAAAAA) (1,ATCTTCCTCCCCTGCTGTATG) (1,TCTCAAACTCCTGGGCTCAAG) (1,TTAATTATTCTCAACTTTCCG) (1,CCAAATTTTATTTAAGCAATT) (1,CCCACTAGGGCTGCAGTCAAT) (1,AACTTTCCGATTTTATTTCCC) (1,GATTATTCTAGTGGAATTTAG) (1,TGAGGGAGGAATAATTATAGT) (1,GAGAAGCTCCCACTAGGGCTG) (1,CCATCTTCCCAGAGCTGCCAG) (1,CTCAAGTGATCCATCTCTGCC) (1,ACACCTGTGCCAATGCCAAGA) (1,TCCTGCCCTCAAACATCACAC) (1,TGCAGTCAATTCCCAGGTCTT) (1,TATATGCTCAGTGTAAACAAT) (1,ATATCCTCCCAAATTTTATTT) (1,GTCAAAAAGAAAAAAAAGGGG) (1,CTGTATGTTTCCTGCCCTCAA) (1,TGCCAGCCCTCACAATGCCAA) (1,CCTGTAAATTCAAATTGGATA) (1,ATTTGAGTAGGGCTGTCACAT) (1,GGCCTTGCAAGAATCTCTACT) (1,TCACTGCAGCCTCAACCTCCT) (1,AAATGCCCCCATCTTCCCAGA) (1,AAGCTCCCACTAGGGCTGCAG) (1,TATAATCAGGGTGAAACACCT) (1,CTTATTCCTATTTATAATCAG) (1,AACAAGTCCCTACAAGATGGC) (1,AAACAGCTCGTTTAACTGATA) (1,CCCGCCTCGGCCTCCCAAAGT) (1,CTCACAATGCCAACAGCTAAA) (1,GTAGAAACGGGGTTTCACCAT) (1,TCTCATTGACTGTTCAATGCC) (1,GACAAGATAGTACTTGAGCTA) (1,TTGTCCTATTTAATTATTCTC) (1,CAAGTCAAAAAGAAAAAAAAG) (1,AAACTCCTGGGCTCAAGTGAT) (1,TCCACCCGCCTCGGCCTCCCA) (1,CTCATTGACTGTTCAATGCCT) (1,TTTTATTTTTAAGGTTTTTTT) (1,AAACTTTACCAGCACCCAAGT) 
(1,TGAGATGGAGTCTCGCTCCAC) (1,TCAATTCCCAGGTCTTAGGTG) (1,ATAAAGTTTGGCTTTCAGTTG) (1,TACAGGCACCCACCATCATGC) (1,TCCTATTTATAATCAGGGTGA) (1,GAGCAGCCTTATTCCTATTTA) (1,GGGAGGAATAATTATAGTAAT) (1,AGATGGAGTCTCGCTCCACCG) (1,AATAATTATAGTAATAATCAC) (1,CTGGGCCCAAGTGATTTCATC) (1,ATGCCCCCATCTTCCCAGAGC) (1,TGTTTGTTTTGAGATGGAGTC) (1,ACTACAGGCATGAGGCACCGC) (1,TGTTTTGAGATGGAGTCTCGC) (1,TTAAGGTTTTTTTTGTTTGTT) (1,ATCTCTACTGCCCAACAAGTC) (1,TCTTTCTTTCTTTCTTTTTCT) (1,TCCACGTTCTTCAGCTTTAGG) (1,GCCTCGGCCTCCCAAAGTGCT) (1,CAGGCATGAGGCACCGCGCCT) (1,GCTAATTTTTGTATTTTTGTA) (1,GTAACTAACATAATTGGCACT) (1,CCGATTTTATTTCCCATGTAA) (1,TGAGCATGAAAGTAATATATG) (1,GTGGAATTTAGGGAAACGATG) (1,AAGTCCCTACAAGATGGCATT) (1,ATAGTAATAATCACAGTGATG) (1,CTCATTCTCTCTCCTGCTGCA) (1,AAGATGGCATTTAAAAGCAGT) (1,ATCTTTATCACAGTTATTTAA) (1,TCCCAGAGCTGCCAGCCCTCA) (1,GGATTATTCTAGTGGAATTTA) (1,AACTTTGAATATCTTTATCAC) (1,AACGGCAAAGTGAGGGAGGAA) (1,TTGACTGTTCAATGCCTATAC) (1,CAAAGTGCTGGGACTACAGGC) (1,AGACTGGGTCTCACTATGTTG) (1,AGAGCTGCCAGCCCTCACAAT) (1,TTTCCTGCCCTCAAACATCAC) (1,CCAGGTCTTAGGTGCTGAGCA) (1,GGCCTCCCAAAGTGCTGGGAC) (1,TAATTATTCTCAACTTTCCGA) (1,CTGTAAATTCAAATTGGATAT) (1,AGAAGCTCCCACTAGGGCTGC) (1,GTCCCTACAAGATGGCATTTA) (1,TAAAGCCTTTAAAAAGCTTTA) (1,GTTGATTGTCCTATTTAATTA) (1,ATTGTCCTATTTAATTATTCT) (1,GCAAAGTGAGGGAGGAATAAT) (1,GGTCTTAGGTGCTGAGCAGTG) (1,CTTGCAGGTTGAGTAGGATTA) (1,CCCTACAAGATGGCATTTAAA) (1,ATAATCACAGTGATGACGTGG) (1,ACTGTGAAGAGGTGCCTGTTG) (1,ATGTTGATTGTCCTATTTAAT) (1,TTTATTTTTTGAGCATGAAAG) (1,ATGCCTATACAAGTAAAACTT) (1,CCCCTGCTGTATGTTTCCTGC) (1,TCTTTCTTTCTTTCTCTTTCT) (1,AAAGTGCTGGGACTACAGGCA) (1,TATAGTAATAATCACAGTGAT) (1,CTAATTTTTGTATTTTTGTAG) (1,TCTTATTTTTGGAAAAAAAAA) (1,TAACATAATTGGCACTGTCCC) (1,GAGTATAAGTTTCCTGAGGCC) (1,GCCAACAGCTAAATGTACCCA) (1,ACGGCAAAGTGAGGGAGGAAT) (1,GAATAATTATAGTAATAATCA) (1,TCTCTCCTGCTGCACTGTGAA) (1,AAATGTACCCAAGTGTTACTG) (1,TTTCCTGAGGCCTCCCAGGCC) (1,TTCATCTTATTTTTGGAAAAA) (1,AGTACTTGAGCTAAGCCTTGC) (1,AATGCCTATACAAGTAAAACT) (1,TGGCACTGTCCCTGTAAATTC) (1,TGTAGAAACGGGGTTTCACCA) 
(1,CCTAGAGAAGCTCCCACTAGG) (1,CTCAGTGTAAACAATTAGGTC) (1,ACCTCCTGGGCCCAAGTGATT) (1,TTCCCAGAGCTGCCAGCCCTC) (1,ATGGCTAACGGCAAAGTGAGG) (1,ACCATCATGCCCAGCTAATTT) (1,GTAATAATCACAGTGATGACG) (1,TCCCACTAGGGCTGCAGTCAA) (1,CACGTTCTTCAGCTTTAGGAC) (1,ATGCCAAGACAGGGGTGCCAA) (1,AGTGTAAACAATTAGGTCATT) (1,TTTTTGGAAAAAAAAACAAAC) (1,AAGTGAGGGAGGAATAATTAT) (1,CAGTGTAAACAATTAGGTCAT) (1,GCAGCCTCAACCTCCTGGGCC) (1,CTTTTTCTTTCTTTCTTTCTC) (1,AGGCACCGCGCCTGGCCAGGA) (1,TTTTGTAGAAACGGGGTTTCA) (1,ACATCACACTCCACGTTCTTC) (1,TGTGTAACTAACATAATTGGC) (1,GCCTGTTGCCAAGAGTATAAG) (1,TGGCTAACGGCAAAGTGAGGG) (1,TTTAAGGTTTTTTTTGTTTGT) (1,CTGAGCAGTGGGAGGTGGTGG) (1,CCCTCACAATGCCAACAGCTA) (1,TAATCCATACTATATTTGAGT) (1,GTGAGGGAGGAATAATTATAG) (1,GTTTAACTGATAATCCATACT) (1,GGCTCAAGTGATCCATCTCTG) (1,TATCCTCCCAAATTTTATTTA) (1,TTTCTTTCTTTCTTTTTCTTT) (1,TCGCTCCACCGCCCAGACTGG) (1,TGTTTCCTGCCCTCAAACATC) (1,GCCTCCCAAAGTGCTGGGACT) (1,ACTGTTCAATGCCTATACAAG) (1,TTGATTGTCCTATTTAATTAT) (1,TTGGCTTTCAGTTGTAACTTT) (1,CCAGAGCTGCCAGCCCTCACA) (1,TTATAAATATATTTAACAGGA) (1,TAAACAATTAGGTCATTATAA) (1,GGAGTCTCGCTCCACCGCCCA) (1,CTGCCCTCAAACATCACACTC) (1,TATGTTTCCTGCCCTCAAACA) (1,TATTTTTATTTTTAAGGTTTT) (1,GCCCAGCTAATTTTTGTATTT) (1,GATGGAGTCTCGCTCCACCGC) (1,TGTGCCAATGCCAAGACAGGG) (1,TCTCTACTGCCCAACAAGTCC) (1,GTTTCCTGCCCTCAAACATCA) (1,CTGTTGCCAAGAGTATAAGTT) (1,TGTGAAGAGGTGCCTGTTGCC) (1,CAGTCAATTCCCAGGTCTTAG) (1,AAGATAGTACTTGAGCTAAGC) (1,AGGGAGGAATAATTATAGTAA) (1,ATTATAAATATATTTAACAGG) (1,CTAGAGAAGCTCCCACTAGGG) (1,CAGCTCGTTTAACTGATAATC) (1,AAGGTTTTTTTTGTTTGTTTG) (1,TCAGGGTGAAACACCTGTGCC) (1,AGATGGCATTTAAAAGCAGTC) (1,CTACAAGATGGCATTTAAAAG) (1,CCTGCTGCACTGTGAAGAGGT) (1,CTGGGCTCAAGTGATCCATCT) (1,CATGAAAGTAATATATGCTCA) (1,CCTAGGCTGGTCTCAAACTCC) (1,GCATGCATATGGCTAACGGCA) (1,GGATATCCTCCCAAATTTTAT) (1,AGTCTCGCTCCACCGCCCAGA) (1,CAGTTATTTAAAGCCTTTAAA) (1,TGAAGAGGTGCCTGTTGCCAA) (1,CTTAGGTGCTGAGCAGTGGGA) (1,ATCCATACTATATTTGAGTAG) (1,TATTTTTGGAAAAAAAAACAA) (1,TTTCTTTCTCTTTCTTTCTTT) (1,CCTGGGCCCAAGTGATTTCAT) (1,ATTCTAGTGGAATTTAGGGAA) 
(1,TCCTGGGCCCAAGTGATTTCA) (1,CAAGTCCCTACAAGATGGCAT) (1,CTGTCCCTGTAAATTCAAATT) (1,TGATCCATCTCTGCCTTCCAA) (1,TGTTGCCTAGGCTGGTCTCAA) (1,TGCTGTATGTTTCCTGCCCTC) (1,CATGTAGCATGCATATGGCTA) (1,TAGGTCATTATAAATATATTT) (1,GTTGAGTAGGATTATTCTAGT) (1,GGGCCCAAGTGATTTCATCTT) (1,ACTCCTGGGCTCAAGTGATCC) (1,CCTGTGCCAATGCCAAGACAG) (1,TCTTTCTTTTTCTTTCTTTCT) (1,TTTGAGTAGGGCTGTCACATG) (1,CTGGTCTCAAACTCCTGGGCT) (1,TAAAACTTTACCAGCACCCAA) (1,GTAATATATGCTCAGTGTAAA) (1,TCAGGTGATCCACCCGCCTCG) (1,ACATAATTGGCACTGTCCCTG) (1,TAGTGGAATTTAGGGAAACGA) (1,CTGCTGTATGTTTCCTGCCCT) (1,TCAACTTTCCGATTTTATTTC) (1,AATTATTCTCAACTTTCCGAT) (1,GCTGGTCTCAAACTCCTGGGC) (1,GTAACTTTGAATATCTTTATC) (1,TGAAAGTAATATATGCTCAGT) (1,GGTCTCACTATGTTGCCTAGG) (1,AACAGCTAAATGTACCCAAGT) (1,CACCTGTGCCAATGCCAAGAC) (1,CCAGCCCTCACAATGCCAACA) (1,AGGTGCTGAGCAGTGGGAGGT) (1,TATTCCTATTTATAATCAGGG) (1,TTTAATTATTCTCAACTTTCC) (1,TAACTAACATAATTGGCACTG) (1,TTGAGATGGAGTCTCGCTCCA) (1,TTTTCTTTCTTTCTTTCTTTC) (1,CATGCCCAGCTAATTTTTGTA) (1,GAGCAGTGGGAGGTGGTGGCC) (1,CCTCGGCCTCCCAAAGTGCTG) (1,TCTTTCTTTCTCTTTCTTTCT) (1,TCAACCTCCTGGGCCCAAGTG) (1,TAGGCTGGTCTCAAACTCCTG) (1,TGCTCAGTGTAAACAATTAGG) (1,ATTGACTGTTCAATGCCTATA) (1,AGATAGTACTTGAGCTAAGCC) (1,GCCAATGCCAAGACAGGGGTG) (1,TCTTCAGCTTTAGGACTTGGA) (1,GGTTGAGTAGGATTATTCTAG) (1,TGCCAATGCCAAGACAGGGGT) (1,CTAAGCCTTGCAGGTTGAGTA) (1,TGGAGTCTCGCTCCACCGCCC) (1,GGAATTTAGGGAAACGATGTG) (1,GGTCTCAAACTCCTGGGCTCA) (1,GTTGCCAAGAGTATAAGTTTC) (1,GAGTCTCGCTCCACCGCCCAG) (1,AAACATCACACTCCACGTTCT) (1,CTAGTGGAATTTAGGGAAACG) (1,AGGTGATCCACCCGCCTCGGC) (1,TAAACAGCTCGTTTAACTGAT) (1,TTATAATCAGGGTGAAACACC) (1,TGCCAACAGCTAAATGTACCC) (1,GTTTTTTTTGTTTGTTTGTTT) (1,CCTATTTATAATCAGGGTGAA) (1,GGCTTTCAGTTGTAACTTTGA) (1,TTTCTTTCTTTTTCTTTCTTT) (1,ATGTTTCCTGCCCTCAAACAT) (1,TTATTTTTGGAAAAAAAAACA) (1,CTTGCAAGAATCTCTACTGCC) (1,CTCCTGGGCTCAAGTGATCCA) (1,CTATGTTGCCTAGGCTGGTCT) (1,GCATATGGCTAACGGCAAAGT) (1,AGAGCAGCCTTATTCCTATTT) (1,CTTTCTTTTTCTTTCTTTCTT) (1,CCTTATTCCTATTTATAATCA) (1,CCTCAACCTCCTGGGCCCAAG) (1,TAATCAGGGTGAAACACCTGT) 
(1,CCATACTATATTTGAGTAGGG) (1,TTTATTTTTAAGGTTTTTTTT) (1,ACTATGTTGCCTAGGCTGGTC) (1,CAGGTTGAGTAGGATTATTCT) (1,TAATCACAGTGATGACGTGGA) (1,TACTGCCCAACAAGTCCCTAC) (1,CTGGGTCTCACTATGTTGCCT) (1,ATCACACTCCACGTTCTTCAG) (1,GACTGTTCAATGCCTATACAA) (1,CTTCCCAGAGCTGCCAGCCCT) (1,CTCGCTCCACCGCCCAGACTG) (1,TTTGTTTGTTTGTTTTGAGAT) (1,CTCTCTCCTGCTGCACTGTGA) (1,TATCTTTATCACAGTTATTTA) (1,AGGTCTTAGGTGCTGAGCAGT) (1,TTCTTTCTTTTTCTTTCTTTC) (1,CCAAGTCAAAAAGAAAAAAAA) (1,TAAGAGCAGCCTTATTCCTAT) (1,CAATGCCAAGACAGGGGTGCC) (1,GCACTGTCCCTGTAAATTCAA) (1,TACAAGATGGCATTTAAAAGC) (1,TAACGGCAAAGTGAGGGAGGA) (1,GGGTCTCACTATGTTGCCTAG) (1,AGGTGCCTGTTGCCAAGAGTA) (1,TAAATGTACCCAAGTGTTACT) (1,GGCTGGTCTCAAACTCCTGGG) (1,TGCCCTCAAACATCACACTCC) (1,TCATTGACTGTTCAATGCCTA) (1,TCATGTAGCATGCATATGGCT) (1,ACCTGTGCCAATGCCAAGACA) (1,ATACAAGTAAAACTTTACCAG) (1,TGTCACATGGTTGGAACCTCC) (1,CTAGGCTGGTCTCAAACTCCT) (1,AGTATAAGTTTCCTGAGGCCT) (1,GTCTCAAACTCCTGGGCTCAA) (1,ATTTTTGTATTTTTGTAGAAA) (1,ATTGGCACTGTCCCTGTAAAT) (1,TTTTCTTTCTTTCTTTCTCTT) (1,GCCAAGACAGGGGTGCCAAGA) (1,TGCCAAGACAGGGGTGCCAAG) (1,CCAACAAGTCCCTACAAGATG) (1,TTCTTTCTTTCTTTCTCTTTC) (1,TATGTTGCCTAGGCTGGTCTC) (1,TCATTATAAATATATTTAACA) (1,AGTTTCCTGAGGCCTCCCAGG) (1,CTCAACTTTCCGATTTTATTT) (1,GCTGCACTGTGAAGAGGTGCC) (1,GTCTCACTATGTTGCCTAGGC) (1,TATAAATATATTTAACAGGAA) (1,AATTAAACAGCTCGTTTAACT) (1,ACTTTGAATATCTTTATCACA) (1,CCAAAGTGCTGGGACTACAGG) (1,TAGAGAAGCTCCCACTAGGGC) (1,CGCCTCGGCCTCCCAAAGTGC) (1,CAGCACCCAAGTCAAAAAGAA) (1,AATCTCTACTGCCCAACAAGT) (1,AATAAAGTTTGGCTTTCAGTT) (1,TTTTTGTAGAAACGGGGTTTC) (1,TATTTGAGTAGGGCTGTCACA) (1,GCTAAGCCTTGCAGGTTGAGT) (1,AGAATCTCTACTGCCCAACAA) (1,TAGAAACGGGGTTTCACCATG) (1,TTTTTTTGTTTGTTTGTTTTG) (1,CACACTCCACGTTCTTCAGCT) (1,GCTGGGACTACAGGCATGAGG) (1,TCTTTCTCTTTCTTTCTTTCT) (1,CAGCTAATTTTTGTATTTTTG) (1,GTGAAGAGGTGCCTGTTGCCA) (1,TTCCCATGTAACAGTGTTGTT) (1,AATATCTTTATCACAGTTATT) (1,AAACAATTAGGTCATTATAAA) (1,ATGAAAGTAATATATGCTCAG) (1,GAGCTAAGCCTTGCAGGTTGA) (1,TTAAAGCCTTTAAAAAGCTTT) (1,AAAAAAAACAAACTAAACCAA) (1,CACTAGGGCTGCAGTCAATTC) 
(1,TTTTTAAGGTTTTTTTTGTTT) (1,AATTTAGGGAAACGATGTGCA) (1,ATAATCCATACTATATTTGAG) (1,TGAGTAGGATTATTCTAGTGG) (1,GGACTACAGGCATGAGGCACC) (1,GAGGCACCGCGCCTGGCCAGG) (1,AATGCCAACAGCTAAATGTAC) (1,AATAAATGTTGATTGTCCTAT) (1,CGGGGTTTCACCATGTTGGCC) (1,TTATTTTTTGAGCATGAAAGT) (1,TTTAAAGCCTTTAAAAAGCTT) (1,TTTTTGAGCATGAAAGTAATA) (1,TATTCTAGTGGAATTTAGGGA) (1,AAGTGATCCATCTCTGCCTTC) (1,CTCTCATTGACTGTTCAATGC) (1,AAACACCTGTGCCAATGCCAA) (1,GCCCTCAAACATCACACTCCA) (1,CTCACTATGTTGCCTAGGCTG) (1,AGGTCATTATAAATATATTTA) (1,AGTAGGATTATTCTAGTGGAA) (1,TTGTAACTTTGAATATCTTTA) (1,AAGTTTGGCTTTCAGTTGTAA) (1,CCCATGTAACAGTGTTGTTTT) (1,TTTATTTCCCATGTAACAGTG) (1,TGATAATCCATACTATATTTG) (1,TGTAACTAACATAATTGGCAC) (1,TGCCCAACAAGTCCCTACAAG) (1,AGTGAGGGAGGAATAATTATA) (1,ATTATTCTCAACTTTCCGATT) (1,TTTTTATTTTTAAGGTTTTTT) (1,CTATTTATAATCAGGGTGAAA) (1,TGCCAAGAGTATAAGTTTCCT) (1,TATTCTCAACTTTCCGATTTT) (1,TATTTTTTGAGCATGAAAGTA) (1,AGCTAAGCCTTGCAGGTTGAG) (1,CAACAAGTCCCTACAAGATGG) (1,CCTCCCAAAGTGCTGGGACTA) (1,CCTGCTGTATGTTTCCTGCCC) (1,TTTCACCATGTTGGCCCAGCT) (1,AAGCCTTTAAAAAGCTTTAAT) (1,AATTAGGTCATTATAAATATA) (1,GGGTTTCACCATGTTGGCCCA) (1,GCTCAGTGTAAACAATTAGGT) (1,CATCTTCCCAGAGCTGCCAGC) (1,GGTGAAACACCTGTGCCAATG) (1,CTGCAGTCAATTCCCAGGTCT) (1,ATCCTCCCAAATTTTATTTAA) (1,ATTTTTGGAAAAAAAAACAAA) (1,CCTATACAAGTAAAACTTTAC) (1,TAACTGATAATCCATACTATA) (1,TCGTTTAACTGATAATCCATA) (1,TTTTTCTTTCTTTCTTTCTCT) (1,CTGTGCCAATGCCAAGACAGG) (1,TAAATTCAAATTGGATATCCT) (1,ATGAGGCACCGCGCCTGGCCA) (1,TGTTGCCAAGAGTATAAGTTT) (1,CCCATCTTCCCAGAGCTGCCA) (1,TTATCACAGTTATTTAAAGCC) (1,GGGGTTTCACCATGTTGGCCC) (1,AGGCTGGTCTCAAACTCCTGG) (1,TGAGCTAAGCCTTGCAGGTTG) (1,TTCTTCAGCTTTAGGACTTGG) (1,AGTAATATATGCTCAGTGTAA) (1,TTGAGCTAAGCCTTGCAGGTT) (1,TACTCTCATTGACTGTTCAAT) (1,GGAATAATTATAGTAATAATC) (1,GGTTTTTTTTGTTTGTTTGTT) (1,AGGTTGAGTAGGATTATTCTA) (1,AGCTGCCAGCCCTCACAATGC) (1,ATGCCCAGCTAATTTTTGTAT) (1,TTTTGGAAAAAAAAACAAACT) (1,CTTTCTCTTTCTTTCTTTCTT) (1,TGCCCCCATCTTCCCAGAGCT) (1,CAAGAATCTCTACTGCCCAAC) (1,TGGGCCCAAGTGATTTCATCT) (1,GGTGCTGAGCAGTGGGAGGTG) 
(1,TTATTTAAAGCCTTTAAAAAG) (1,CTCCTGCTGCACTGTGAAGAG) (1,CCTCCCCTGCTGTATGTTTCC) (1,AGTGCTGGGACTACAGGCATG) (1,TGATCCACCCGCCTCGGCCTC) (1,TTCCGATTTTATTTCCCATGT) (1,AGTAGGGCTGTCACATGGTTG) (1,TCTTTCTTTCTTTTTCTTTCT) (1,ATTAAACAGCTCGTTTAACTG) (1,AGCTCGTTTAACTGATAATCC) (1,CTCTACTGCCCAACAAGTCCC) (1,GAGGTGCCTGTTGCCAAGAGT) (1,TCTCTCTCCTGCTGCACTGTG) (1,GGTTTCACCATGTTGGCCCAG) (1,CACTATGTTGCCTAGGCTGGT) (1,CAACTTTCCGATTTTATTTCC) (1,GCACCGCGCCTGGCCAGGACT) (1,CTCCTGGGCCCAAGTGATTTC) (1,TTCTCTCTCCTGCTGCACTGT) (1,ATAAGTTTCCTGAGGCCTCCC) (1,GCTAAATGTACCCAAGTGTTA) (1,CCAATGCCAAGACAGGGGTGC) (1,GGCAAAGTGAGGGAGGAATAA) (1,AGTTTGGCTTTCAGTTGTAAC) (1,CTATACAAGTAAAACTTTACC) (1,TTTTAAGGTTTTTTTTGTTTG) (1,TATACAAGTAAAACTTTACCA) (1,TTGAGTAGGATTATTCTAGTG) (1,CACATGGTTGGAACCTCCGGT) (1,AACCTCCTGGGCCCAAGTGAT) (1,TAAATGTTGATTGTCCTATTT) (1,GCTTTCAGTTGTAACTTTGAA) (1,GGCTAACGGCAAAGTGAGGGA) (1,ATTTAAAGCCTTTAAAAAGCT) (1,CTTTCTTTCTTTCTTTCTTTT) (1,TTATTTTTATTTTTAAGGTTT) (1,GTACTTGAGCTAAGCCTTGCA) (1,ACCAGCACCCAAGTCAAAAAG) (1,GCCAGCCCTCACAATGCCAAC) (1,ACAGGCATGAGGCACCGCGCC) (1,AATAATCACAGTGATGACGTG) (1,AACTCCTGGGCTCAAGTGATC) (1,TCTCTTTCTTTCTTTCTTTCT) (1,CTTTCTTTCTTTTTCTTTCTT) (1,AAGTTTCCTGAGGCCTCCCAG) (1,GAATTTAGGGAAACGATGTGC) (1,CAAGTGATTTCATCTTATTTT) (1,AGCTCCCACTAGGGCTGCAGT) (1,GGAAAAAAAAACAAACTAAAC) (1,TATAAGAGCAGCCTTATTCCT) (1,AAGTGATTTCATCTTATTTTT) (1,CGTTCTTCAGCTTTAGGACTT) (1,TCTACTGCCCAACAAGTCCCT) (1,TTTTTTGAGCATGAAAGTAAT) (1,ATACTATATTTGAGTAGGGCT) (1,AAGTGCTGGGACTACAGGCAT) (1,ATAATCAGGGTGAAACACCTG) (1,TGCAAGAATCTCTACTGCCCA) (1,TTCTTTCTTTCTCTTTCTTTC) (1,GTAGGATTATTCTAGTGGAAT) (1,ATTTTTATTTTTAAGGTTTTT) (1,TGTTCAATGCCTATACAAGTA) (1,ACTGCCCAACAAGTCCCTACA) (1,TTATTTCCCATGTAACAGTGT) (1,GCAGTGGGAGGTGGTGGCCAT) (1,GTTGTAACTTTGAATATCTTT) (1,TAGGGCTGTCACATGGTTGGA) (1,ATATGCTCAGTGTAAACAATT) (1,TTTAACTGATAATCCATACTA) (1,TTCTTTCTTTCTTTCTTTTTC) (1,CAAGAGTATAAGTTTCCTGAG) (1,CTTATTTTTGGAAAAAAAAAC) (1,TTCCTCCCCTGCTGTATGTTT) (1,TTTAATAAATGTTGATTGTCC) (1,TGAATATCTTTATCACAGTTA) (1,CCCAGGTCTTAGGTGCTGAGC) 
(1,CATATGGCTAACGGCAAAGTG) (1,GCTAACGGCAAAGTGAGGGAG) (1,ACTAGGGCTGCAGTCAATTCC) (1,CTTTTTCTTTCTTTCTTTCTT) (1,TTATTTTTAAGGTTTTTTTTG) (1,GTGCCAATGCCAAGACAGGGG) (1,GCCTAGGCTGGTCTCAAACTC) (1,GCCCAACAAGTCCCTACAAGA) (1,AGCCTTATTCCTATTTATAAT) (1,CTACAGGCATGAGGCACCGCG) (1,TTGCAGGTTGAGTAGGATTAT) (1,TCAAATTGGATATCCTCCCAA) (1,CCTCCCAAATTTTATTTAAGC) (1,CCCCATCTTCCCAGAGCTGCC) (1,CGGCCTCCCAAAGTGCTGGGA) (1,ACAATGCCAACAGCTAAATGT) (1,TTGTAGAAACGGGGTTTCACC) (1,CAGCTAAATGTACCCAAGTGT) (1,GCACTGTGAAGAGGTGCCTGT) (1,ATAGTACTTGAGCTAAGCCTT) (1,TCTAGTGGAATTTAGGGAAAC) (1,AAAGTGAGGGAGGAATAATTA) (1,TAATTATAGTAATAATCACAG) (1,CCAAGTGATTTCATCTTATTT) (1,TAACTTTGAATATCTTTATCA) (1,AAATTGGATATCCTCCCAAAT) (1,TTCCCAGGTCTTAGGTGCTGA) (1,CTTTATCACAGTTATTTAAAG) (1,TCCCAAAGTGCTGGGACTACA) (1,ATCTTATTTTTGGAAAAAAAA) (1,TCTCCTGCTGCACTGTGAAGA) (1,GTCTCGCTCCACCGCCCAGAC) (1,CTACTCTCATTGACTGTTCAA) (1,TTTTTCTTTCTTTCTTTCTTT) (1,TTTGTTTGTTTTGAGATGGAG) (1,CCACTAGGGCTGCAGTCAATT) (1,TCAGTGTAAACAATTAGGTCA) (1,GTCTTAGGTGCTGAGCAGTGG) (1,TGTATGTTTCCTGCCCTCAAA) (1,TATTTCCCATGTAACAGTGTT) (1,CCTGTTGCCAAGAGTATAAGT) (1,ATGTTGCCTAGGCTGGTCTCA) (1,AGCAGCCTTATTCCTATTTAT) (1,AGAGGTGCCTGTTGCCAAGAG) (1,CATCTTATTTTTGGAAAAAAA) (1,TCTTCCTCCCCTGCTGTATGT) (1,CCTTGCAAGAATCTCTACTGC) (1,CTTTCTTTCTTTCTCTTTCTT) (1,CACAATGCCAACAGCTAAATG) (1,ATTTATAATCAGGGTGAAACA) (1,CTTTGAATATCTTTATCACAG) (1,TCTCAACTTTCCGATTTTATT) (1,TGTTTGTTTGTTTTGAGATGG) (1,ACTCCACGTTCTTCAGCTTTA) (1,GACTACAGGCATGAGGCACCG) (1,GTTCAATGCCTATACAAGTAA) (1,AGCTAATTTTTGTATTTTTGT) (1,CTGCACTGTGAAGAGGTGCCT) (1,CTCACTGCAGCCTCAACCTCC) (1,CTCAGGTGATCCACCCGCCTC) (1,TCCTGCTGCACTGTGAAGAGG) (1,GCTCCACCGCCCAGACTGGAG) (1,AAAAAAAAACAAACTAAACCA) (1,TGCCTATACAAGTAAAACTTT) (1,TTTTTGTTTGTTTGTTTTGAG) (1,TTTTATTTCCCATGTAACAGT) (1,AAGAATCTCTACTGCCCAACA) (1,GAATATCTTTATCACAGTTAT) (1,TGGCTTTCAGTTGTAACTTTG) (1,AGTGATCCATCTCTGCCTTCC) (1,GGCTGCAGTCAATTCCCAGGT) (1,AAATTAAACAGCTCGTTTAAC) (1,CCCAACAAGTCCCTACAAGAT) (1,ATATTTGAGTAGGGCTGTCAC) (1,TTATAGTAATAATCACAGTGA) (1,CTGTGAAGAGGTGCCTGTTGC) 
(1,TTCCTATTTATAATCAGGGTG) (1,GGCACTGTCCCTGTAAATTCA) (1,CTCAACCTCCTGGGCCCAAGT) (1,TGCTGAGCAGTGGGAGGTGGT) (1,CCCCCATCTTCCCAGAGCTGC) (1,TAAGCCTTGCAGGTTGAGTAG) (1,TATGGCTAACGGCAAAGTGAG) (1,ACACTCCACGTTCTTCAGCTT) (1,ACTTTACCAGCACCCAAGTCA) (1,TGGTCTCAAACTCCTGGGCTC) (1,CAAATTTTATTTAAGCAATTG) (1,AAGCCTTGCAGGTTGAGTAGG) (1,GTGCTGAGCAGTGGGAGGTGG) (1,AAAGCCTTTAAAAAGCTTTAA) (1,CGATTTTATTTCCCATGTAAC) (1,TTTTGAGATGGAGTCTCGCTC) (1,TCACAGTTATTTAAAGCCTTT) (1,CTAACGGCAAAGTGAGGGAGG) (1,ATTCAAATTGGATATCCTCCC) (1,GGCACCGCGCCTGGCCAGGAC) (1,ATTTTTAAGGTTTTTTTTGTT) (1,GTGAAACACCTGTGCCAATGC) (1,TTTCTTTCTTTCTTTCTTTTT) (1,GATATCCTCCCAAATTTTATT) (1,ACTGTCCCTGTAAATTCAAAT) (1,GAAACGGGGTTTCACCATGTT) (1,TTTCCGATTTTATTTCCCATG) (1,TCCATACTATATTTGAGTAGG) (1,TCTTTATCACAGTTATTTAAA) (1,TTCCTGCCCTCAAACATCACA) (1,TGCATATGGCTAACGGCAAAG) (1,TAGTAATAATCACAGTGATGA) (1,CGCTCCACCGCCCAGACTGGA) (1,CCCAAGTCAAAAAGAAAAAAA) (1,CTGCCCAACAAGTCCCTACAA) (1,ATTTTATTTCCCATGTAACAG) (1,GAGTAGGGCTGTCACATGGTT) (1,CAAGATAGTACTTGAGCTAAG) (1,CCTCCTGGGCCCAAGTGATTT) (1,CAGCCTCAACCTCCTGGGCCC) (1,CCACCATCATGCCCAGCTAAT) (1,CTGCCAGCCCTCACAATGCCA) (1,TCCCTGTAAATTCAAATTGGA) (1,CAATTCCCAGGTCTTAGGTGC) (1,CAAATTGGATATCCTCCCAAA) (1,TTGGCACTGTCCCTGTAAATT) (1,CACTGTCCCTGTAAATTCAAA) (1,CTAACATAATTGGCACTGTCC) (1,GAGGGAGGAATAATTATAGTA) (1,TCCCAAATTTTATTTAAGCAA) (1,TTGAATATCTTTATCACAGTT) (1,CAAGATGGCATTTAAAAGCAG) (1,CCCTGTAAATTCAAATTGGAT) (1,GTATCTTCCTCCCCTGCTGTA) (1,CTCGGCCTCCCAAAGTGCTGG) (1,AATCCATACTATATTTGAGTA) (1,CCTGGGCTCAAGTGATCCATC) (1,CAGGCACCCACCATCATGCCC) (1,ACATGGTTGGAACCTCCGGTT) (1,TACAAGTAAAACTTTACCAGC) (1,CACAGTTATTTAAAGCCTTTA) (1,AGTCCCTACAAGATGGCATTT) (1,CATGCATATGGCTAACGGCAA) (1,CCAAGAGTATAAGTTTCCTGA) (1,GCATTTAAAAGCAGTCCCTCA) (1,CAGGGTGAAACACCTGTGCCA) (1,CTTGAGCTAAGCCTTGCAGGT) (1,ATAATTATAGTAATAATCACA) (1,GCTCAGGCCTTGCAAGAATCT) (1,ATCCATCTCTGCCTTCCAAAG) (1,AGTCAATTCCCAGGTCTTAGG) (1,CTTCCTCCCCTGCTGTATGTT) (1,ATTATTCTAGTGGAATTTAGG) (1,GAAAGTAATATATGCTCAGTG) (1,AGCCCTCACAATGCCAACAGC) (1,CTCAAACTCCTGGGCTCAAGT) 
(1,TTCTAGTGGAATTTAGGGAAA) (1,CTCTTTCTTTCTTTCTTTCTT) (1,AACACCTGTGCCAATGCCAAG) (1,CTGTTCAATGCCTATACAAGT) (1,AGTCAAAAAGAAAAAAAAGGG) (1,GATAGTACTTGAGCTAAGCCT) (1,CACTGTGAAGAGGTGCCTGTT) (1,TTTGTTTTGAGATGGAGTCTC) (1,ACCCGCCTCGGCCTCCCAAAG) (1,TAATAAATGTTGATTGTCCTA) (1,AGGTTTTTTTTGTTTGTTTGT) (1,TGAAACACCTGTGCCAATGCC) (1,AGCATGAAAGTAATATATGCT) (1,TTATTCTAGTGGAATTTAGGG) (1,GGGACTACAGGCATGAGGCAC) (1,TTGTTTTGAGATGGAGTCTCG) (1,AGCACCCAAGTCAAAAAGAAA) (1,GCACCCACCATCATGCCCAGC) (1,GATCCATCTCTGCCTTCCAAA) (1,TTGAGTAGGGCTGTCACATGG) (1,AATTTTTGTATTTTTGTAGAA) (1,TACTTGAGCTAAGCCTTGCAG) (1,TCATTCTCTCTCCTGCTGCAC) (1,CAAGTGATCCATCTCTGCCTT) (1,AGGCACCCACCATCATGCCCA) (1,CTAGGGCTGCAGTCAATTCCC) (1,AGGGCTGTCACATGGTTGGAA) (1,AGTAAAACTTTACCAGCACCC) (1,ACTATATTTGAGTAGGGCTGT) (1,AGGCCTTGCAAGAATCTCTAC) (1,TGTCCTATTTAATTATTCTCA) (1,GTGTAAACAATTAGGTCATTA) (1,AACAGCTCGTTTAACTGATAA) (1,GAGTAGGATTATTCTAGTGGA) (1,GTTTCACCATGTTGGCCCAGC) (1,TTGCCTAGGCTGGTCTCAAAC) (1,GTCACATGGTTGGAACCTCCG) (1,GAGCTGCCAGCCCTCACAATG) (1,GGTGCCTGTTGCCAAGAGTAT) (1,TTTCAGTTGTAACTTTGAATA) (1,GTTTGTTTTGAGATGGAGTCT) (1,TAATATATGCTCAGTGTAAAC) (1,AACTTTACCAGCACCCAAGTC) (1,AGGATTATTCTAGTGGAATTT) (1,AAAGTAATATATGCTCAGTGT) (1,GTGATCCACCCGCCTCGGCCT) (1,ATATGGCTAACGGCAAAGTGA) (1,CCCAAAGTGCTGGGACTACAG) (1,TGGATATCCTCCCAAATTTTA) (1,TCAATGCCTATACAAGTAAAA) (1,TTTGAGATGGAGTCTCGCTCC) (1,GTCCCTGTAAATTCAAATTGG) (1,CTAAATGTACCCAAGTGTTAC) (1,TTCTCTTTCTTTCTTTCTTTC) (1,TGTATTTTTGTAGAAACGGGG) (1,AAGTAAAACTTTACCAGCACC) (1,ACTTTCCGATTTTATTTCCCA) (1,TTTCTCTTTCTTTCTTTCTTT) (1,ATGCCAACAGCTAAATGTACC) (1,CAGGTGATCCACCCGCCTCGG) (1,TAATAATCACAGTGATGACGT) (1,TCAGGCCTTGCAAGAATCTCT) (1,CTTTCTTTCTCTTTCTTTCTT) (1,AATTATAGTAATAATCACAGT) (1,TCACTATGTTGCCTAGGCTGG) (1,TGAGTAGGGCTGTCACATGGT) (1,CCAGCTAATTTTTGTATTTTT) (1,TATTTATAATCAGGGTGAAAC) (1,CTATTTAATTATTCTCAACTT) (1,TATTTTTGTAGAAACGGGGTT) (1,TAGGATTATTCTAGTGGAATT) (1,AAATGTTGATTGTCCTATTTA) (1,TATATTTGAGTAGGGCTGTCA) (1,AACATAATTGGCACTGTCCCT) (1,CTCAAACATCACACTCCACGT) (1,TACTATATTTGAGTAGGGCTG) 
(1,GCATGAAAGTAATATATGCTC) (1,GCTGTATGTTTCCTGCCCTCA) (1,AACAATTAGGTCATTATAAAT) (1,TTCTTTCTTTCTTTTTCTTTC) (1,TTAGGTGCTGAGCAGTGGGAG) (1,CAGCCCTCACAATGCCAACAG) (1,TTTGGCTTTCAGTTGTAACTT) (1,ATTTTTTGAGCATGAAAGTAA) (1,TGTAACTTTGAATATCTTTAT) (1,TCCCTACAAGATGGCATTTAA) (1,ATATCTTTATCACAGTTATTT) (1,ATCATGCCCAGCTAATTTTTG) (1,GCTGAGCAGTGGGAGGTGGTG) (1,CTGATAATCCATACTATATTT) (1,ATGGCATTTAAAAGCAGTCCC) (1,TTTGAGCATGAAAGTAATATA) (1,TCACATGGTTGGAACCTCCGG) (1,GTGTAACTAACATAATTGGCA) (1,AACATCACACTCCACGTTCTT) (1,TTTGGAAAAAAAAACAAACTA) (1,TCCGATTTTATTTCCCATGTA) (1,TTGTTTGTTTGTTTTGAGATG) (1,CATCATGCCCAGCTAATTTTT) (1,TGAGGCACCGCGCCTGGCCAG) (1,GAAAAAAAAACAAACTAAACC) (1,AAAAAAACAAACTAAACCAAA) (1,AATCAGGGTGAAACACCTGTG) (1,GCCCCCATCTTCCCAGAGCTG) (1,GTAAAACTTTACCAGCACCCA) (1,ATCAGGGTGAAACACCTGTGC) (1,TGTAAACAATTAGGTCATTAT) (1,CATCACACTCCACGTTCTTCA) (1,ACAAGTAAAACTTTACCAGCA) (1,CCCAGAGCTGCCAGCCCTCAC) (1,GTAGGGCTGTCACATGGTTGG) (1,GAAACACCTGTGCCAATGCCA) (1,AATATATGCTCAGTGTAAACA) (1,GCTCAAGTGATCCATCTCTGC) (1,ATTTAAAAGCAGTCCCTCACG) (1,TGGAAAAAAAAACAAACTAAA) (1,AATGCCAAGACAGGGGTGCCA) (1,TCCTATTTAATTATTCTCAAC) (1,GCATGAGGCACCGCGCCTGGC) (1,AACGGGGTTTCACCATGTTGG) (1,TGGGACTACAGGCATGAGGCA) (1,CCTGCCCTCAAACATCACACT) (1,TAAAGTTTGGCTTTCAGTTGT) (1,ACAGCTAAATGTACCCAAGTG) (1,TCTTCCCAGAGCTGCCAGCCC) (1,GCTGTCACATGGTTGGAACCT) (1,GATGGCATTTAAAAGCAGTCC) (1,ATGGAGTCTCGCTCCACCGCC) (1,ATCACAGTTATTTAAAGCCTT) (1,GATTTTATTTCCCATGTAACA) (1,GTATAAGAGCAGCCTTATTCC) (1,AATTCCCAGGTCTTAGGTGCT) (1,TGACTGTTCAATGCCTATACA) (1,ATTCTCAACTTTCCGATTTTA) (1,CACCCGCCTCGGCCTCCCAAA) (1,TTAGGTCATTATAAATATATT) (1,GGTGATCCACCCGCCTCGGCC) (1,AGCCTCAACCTCCTGGGCCCA) (1,AAGAGGTGCCTGTTGCCAAGA) (1,GGCCCAAGTGATTTCATCTTA) (1,GCCCTCACAATGCCAACAGCT) (1,CATTTAAAAGCAGTCCCTCAC) (1,AATGTACCCAAGTGTTACTGA) (1,GTCATTATAAATATATTTAAC) (1,AGTGGAATTTAGGGAAACGAT) (1,CAGGTCTTAGGTGCTGAGCAG) (1,AGAGTATAAGTTTCCTGAGGC) (1,GTTTGGCTTTCAGTTGTAACT) (1,AAGAGCAGCCTTATTCCTATT) (1,ACAGCTCGTTTAACTGATAAT) (1,GGCTGTCACATGGTTGGAACC) (1,TGCAGCCTCAACCTCCTGGGC) 
(1,ATAAATATATTTAACAGGAAT) (1,CTTTCAGTTGTAACTTTGAAT) (1,ATGCTCAGTGTAAACAATTAG) (1,GTCAATTCCCAGGTCTTAGGT) (1,ACGTTCTTCAGCTTTAGGACT) (1,ACAAGTCCCTACAAGATGGCA) (1,GTTGCCTAGGCTGGTCTCAAA) (1,TCATCTTATTTTTGGAAAAAA) (1,ACTAACATAATTGGCACTGTC) (1,AGCTAAATGTACCCAAGTGTT) (1,CAGAGCTGCCAGCCCTCACAA) (1,ACAGGCACCCACCATCATGCC) (1,AGCCTTGCAGGTTGAGTAGGA) (1,ATTTAATTATTCTCAACTTTC) (1,TTGCAAGAATCTCTACTGCCC) (1,TTCTCAACTTTCCGATTTTAT) (1,CTCGTTTAACTGATAATCCAT) (1,AAAATGCCCCCATCTTCCCAG) (1,TATTTAATTATTCTCAACTTT) (1,GAATCTCTACTGCCCAACAAG) (1,TCCTGGGCTCAAGTGATCCAT) (1,TGCACTGTGAAGAGGTGCCTG) (1,CTATATTTGAGTAGGGCTGTC) (1,GCAGTCAATTCCCAGGTCTTA) (1,TGCCTAGGCTGGTCTCAAACT) (1,CAAGTAAAACTTTACCAGCAC) (1,GGCATGAGGCACCGCGCCTGG) (1,CAGTTGTAACTTTGAATATCT) (1,TTTATCACAGTTATTTAAAGC) (1,TCACACTCCACGTTCTTCAGC) (1,CATACTATATTTGAGTAGGGC) (1,TCATGCCCAGCTAATTTTTGT) (1,AAGTAATATATGCTCAGTGTA) (1,CTGCTGCACTGTGAAGAGGTG) (1,ACCCACCATCATGCCCAGCTA) (1,TTGAGCATGAAAGTAATATAT) (1,AAATTCAAATTGGATATCCTC) (1,TTAACTGATAATCCATACTAT) (1,ATAATTGGCACTGTCCCTGTA) (1,GTCCTATTTAATTATTCTCAA) (1,TCAGTTGTAACTTTGAATATC) (1,TTGTTTGTTTTGAGATGGAGT) (1,CTCCCCTGCTGTATGTTTCCT) (1,CGTTTAACTGATAATCCATAC) (1,ATCCACCCGCCTCGGCCTCCC) (1,GCTCCCACTAGGGCTGCAGTC) (1,CCTACAAGATGGCATTTAAAA) (1,GGGCTGTCACATGGTTGGAAC) (1,ATTTTTGTAGAAACGGGGTTT) (1,TTTGAATATCTTTATCACAGT) (1,TTGCCAAGAGTATAAGTTTCC) (1,AGTAATAATCACAGTGATGAC) (1,GCCAAGAGTATAAGTTTCCTG) (1,TAAAAGCAGTCCCTCACGCAC) (1,GTGATTTCATCTTATTTTTGG) (1,CACTGCAGCCTCAACCTCCTG) (1,CTCAGGCCTTGCAAGAATCTC) (1,GCTCGTTTAACTGATAATCCA) (1,CTTTCCGATTTTATTTCCCAT) (1,ACAAGATGGCATTTAAAAGCA) (1,TTGGAAAAAAAAACAAACTAA) (1,CCACCCGCCTCGGCCTCCCAA) (1,TCAAGTGATCCATCTCTGCCT) (1,CTGTCACATGGTTGGAACCTC) (1,CTTTACCAGCACCCAAGTCAA) (1,GATAATCCATACTATATTTGA) (1,CATAATTGGCACTGTCCCTGT) (1,AATTGGATATCCTCCCAAATT) (1,AGTGATTTCATCTTATTTTTG) (1,CCAACAGCTAAATGTACCCAA) (1,CATTGACTGTTCAATGCCTAT) (1,AGCAGTGGGAGGTGGTGGCCA) (1,CCCAGCTAATTTTTGTATTTT) (1,TGCCTGTTGCCAAGAGTATAA) (1,CCTCACAATGCCAACAGCTAA) (1,AGAAACGGGGTTTCACCATGT) 
(1,TGCTGGGACTACAGGCATGAG) (1,TAAGTTTCCTGAGGCCTCCCA) (1,GATTTCATCTTATTTTTGGAA) (1,GGCACCCACCATCATGCCCAG) (1,TATAAGTTTCCTGAGGCCTCC) (1,TAATTTTTGTATTTTTGTAGA) (1,TAAAATGCCCCCATCTTCCCA) (1,AATTCAAATTGGATATCCTCC) (1,AAACGGGGTTTCACCATGTTG) (1,TGTCCCTGTAAATTCAAATTG) (1,GAGCATGAAAGTAATATATGC) (1,CGGCAAAGTGAGGGAGGAATA) (1,TTAAACAGCTCGTTTAACTGA) (1,TATGCTCAGTGTAAACAATTA) (1,TTTCCCATGTAACAGTGTTGT) (1,GAAGAGGTGCCTGTTGCCAAG) (1,GGTCATTATAAATATATTTAA) (1,TCTTAGGTGCTGAGCAGTGGG) (1,TAGGGCTGCAGTCAATTCCCA) (1,ACTGCAGCCTCAACCTCCTGG) (1,TGCTGCACTGTGAAGAGGTGC) (1,CTGGGACTACAGGCATGAGGC) (1,GTATTTTTGTAGAAACGGGGT) (1,CCCAAGTGATTTCATCTTATT) (1,GCTGCAGTCAATTCCCAGGTC) (1,TTGTATTTTTGTAGAAACGGG) (1,AGTTATTTAAAGCCTTTAAAA) (1,AGAGAAGCTCCCACTAGGGCT) (1,TTATTCCTATTTATAATCAGG) (1,CAACCTCCTGGGCCCAAGTGA) (1,CACCATCATGCCCAGCTAATT) (1,AGCATGCATATGGCTAACGGC) (1,GCCTCAACCTCCTGGGCCCAA) (1,ATTCTCTCTCCTGCTGCACTG) (1,GATCCACCCGCCTCGGCCTCC) (1,ATCTTCCCAGAGCTGCCAGCC) (1,TAATTGGCACTGTCCCTGTAA) (1,TTTTGTTTGTTTGTTTTGAGA) (1,CCCTCAAACATCACACTCCAC) (1,AGTTGTAACTTTGAATATCTT) (1,ACTGATAATCCATACTATATT) (1,TATCTTCCTCCCCTGCTGTAT) (1,AGGGTGAAACACCTGTGCCAA) (1,CCCAAATTTTATTTAAGCAAT) (1,TACCAGCACCCAAGTCAAAAA) (1,TTGGATATCCTCCCAAATTTT) (1,CACCCACCATCATGCCCAGCT) (1,CAAAGTGAGGGAGGAATAATT) (1,TTTTTGTATTTTTGTAGAAAC) (1,GTTTCCTGAGGCCTCCCAGGC) (1,GATTGTCCTATTTAATTATTC) (1,CAATGCCTATACAAGTAAAAC) (1,GGGCTCAAGTGATCCATCTCT) (1,TTAATAAATGTTGATTGTCCT) (1,TCAAACTCCTGGGCTCAAGTG) (1,CTTTATTTTTATTTTTAAGGT) (1,TTTTTTGTTTGTTTGTTTTGA) (1,TATCACAGTTATTTAAAGCCT) (1,TTACCAGCACCCAAGTCAAAA) (1,GCCTATACAAGTAAAACTTTA) (1,TAGCATGCATATGGCTAACGG) (1,TTTGTAGAAACGGGGTTTCAC) (1,ATGCATATGGCTAACGGCAAA) (1,TGCAGGTTGAGTAGGATTATT) (1,ACTGGGTCTCACTATGTTGCC) (1,TATTTAAAGCCTTTAAAAAGC) (1,TTTCTTTCTTTCTTTCTCTTT) (1,CCTATTTAATTATTCTCAACT) (1,GTGATCCATCTCTGCCTTCCA) (1,CTCCACGTTCTTCAGCTTTAG) (1,GTAAATTCAAATTGGATATCC) (1,AATGCCCCCATCTTCCCAGAG) (1,TTCAGTTGTAACTTTGAATAT) (1,TGGGCTCAAGTGATCCATCTC) (1,ACAATTAGGTCATTATAAATA) (1,TTTCATCTTATTTTTGGAAAA) 
(1,GCCTTGCAGGTTGAGTAGGAT) (1,CATTATAAATATATTTAACAG) (1,TTTTGAGCATGAAAGTAATAT) (1,AAGTCAAAAAGAAAAAAAAGG) (1,ACGGGGTTTCACCATGTTGGC) (1,TCACAATGCCAACAGCTAAAT) (1,CAACAGCTAAATGTACCCAAG) (1,GCCTTGCAAGAATCTCTACTG) (1,TCTCACTATGTTGCCTAGGCT) (1,ATAAATGTTGATTGTCCTATT) (1,ATTCCCAGGTCTTAGGTGCTG) (1,CAGCCTTATTCCTATTTATAA) (1,ACAAGATAGTACTTGAGCTAA) (1,ACTCTCATTGACTGTTCAATG) (1,TGTAAATTCAAATTGGATATC) (1,AAAGTTTGGCTTTCAGTTGTA) (1,ATAAGAGCAGCCTTATTCCTA) (1,CAAACTCCTGGGCTCAAGTGA) (1,CAGTGGGAGGTGGTGGCCATG) (1,TAAGGTTTTTTTTGTTTGTTT) (1,CATGAGGCACCGCGCCTGGCC) (1,TCAAACATCACACTCCACGTT) (1,CCGCCTCGGCCTCCCAAAGTG)
scala> kmers.count res44: Long = 1087
scala> val sum0=for((a,b)<-kmers) yield a warning: there were 1 deprecation warning(s); re-run with -deprecation for details sum0: org.apache.spark.rdd.RDD[Long] = MapPartitionsRDD[18] at map at <console>:29 scala> sum0.sum res45: Double = 1100.0
分析阶段的代码没在源码中
源码来源:
【1】 https://github.com/bigdatagenomics/adam
Adam中的源码:
【2】 org.bdgenomics.adam.rdd.read中的adamCountKmers,以及org.bdgenomics.adam.cli中的CountReadKmers(其中包含CountReadKmersArgs)
org.bdgenomics.adam.rdd.read中的adamCountKmers:
/**
 * Cuts the sequence of every record in the enclosing `rdd` (defined outside
 * this snippet) into overlapping k-mers and counts how often each one occurs.
 *
 * Records without a sequence contribute nothing, and reads shorter than
 * `kmerLength` are skipped, so every key in the result has exactly
 * `kmerLength` characters.
 *
 * @param kmerLength The length k of the k-mers to count; must be positive.
 * @return An RDD of (k-mer, number of occurrences) pairs.
 * @throws IllegalArgumentException if `kmerLength` is not positive.
 */
def adamCountKmers(kmerLength: Int): RDD[(String, Long)] = {
  // fail fast on the driver; sliding() would otherwise reject the size
  // with the same exception type, but only later inside a task
  require(kmerLength > 0, s"kmerLength must be positive, but was $kmerLength")

  rdd.flatMap(r => {
    // a record may carry no sequence at all (null); it then yields no k-mers
    Option(r.getSequence).iterator.flatMap(bases => {
      // cut each read into k-mers, and attach a count of 1L
      bases.toString
        .sliding(kmerLength)
        // for a read shorter than k, sliding emits one truncated window:
        // drop it, since it is not a k-mer
        .filter(_.length == kmerLength)
        .map(k => (k, 1L))
    })
  }).reduceByKey((k1: Long, k2: Long) => k1 + k2)
}
org.bdgenomics.adam.cli.CountReadKmers(包含CountReadKmersArgs):
/**
 * Licensed to Big Data Genomics (BDG) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The BDG licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.bdgenomics.adam.cli

import java.util.logging.Level
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{ SparkContext, Logging }
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.projections.{ AlignmentRecordField, Projection }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.util.ParquetLogger
import org.bdgenomics.formats.avro.AlignmentRecord
import org.bdgenomics.utils.cli._
import org.kohsuke.args4j.{ Argument, Option => Args4jOption }

/**
 * Companion of the `count_kmers` command: carries its name and description
 * and builds the command from raw command-line arguments.
 */
object CountReadKmers extends BDGCommandCompanion {
  val commandName = "count_kmers"
  val commandDescription = "Counts the k-mers/q-mers from a read dataset."

  /** Parses `cmdLine` into [[CountReadKmersArgs]] and wraps them in a command. */
  def apply(cmdLine: Array[String]) =
    new CountReadKmers(Args4j[CountReadKmersArgs](cmdLine))
}

/**
 * Command-line arguments of `count_kmers`: three positional arguments
 * (input, output, k-mer length) and two optional switches.
 */
class CountReadKmersArgs extends Args4jBase with ParquetArgs {
  @Argument(required = true, metaVar = "INPUT", usage = "The ADAM, BAM or SAM file to count kmers from", index = 0)
  var inputPath: String = null

  @Argument(required = true, metaVar = "OUTPUT", usage = "Location for storing k-mer counts", index = 1)
  var outputPath: String = null

  @Argument(required = true, metaVar = "KMER_LENGTH", usage = "Length of k-mers", index = 2)
  var kmerLength: Int = 0

  @Args4jOption(required = false, name = "-print_histogram", usage = "Prints a histogram of counts.")
  var printHistogram: Boolean = false

  // -1 is the "not set" value: run() repartitions only when this differs from -1
  @Args4jOption(required = false, name = "-repartition", usage = "Set the number of partitions to map data to")
  var repartition: Int = -1
}

/**
 * The `count_kmers` command: loads reads, counts their k-mers, optionally
 * prints a histogram of the counts, and saves the counts as text.
 *
 * @param args The parsed command-line arguments.
 */
class CountReadKmers(protected val args: CountReadKmersArgs) extends BDGSparkCommand[CountReadKmersArgs] with Logging {
  val companion = CountReadKmers

  /**
   * Runs the k-mer count on the given Spark context.
   *
   * @param sc The Spark context used to load the reads.
   */
  def run(sc: SparkContext): Unit = {
    // Quiet Parquet...
    ParquetLogger.hadoopLoggerLevel(Level.SEVERE)

    // read from disk, projecting only the sequence field
    val loadedReads: RDD[AlignmentRecord] = sc.loadAlignments(
      args.inputPath,
      projection = Some(Projection(AlignmentRecordField.sequence)))

    // repartition only when the user supplied a partition count
    val reads: RDD[AlignmentRecord] =
      if (args.repartition == -1) {
        loadedReads
      } else {
        log.info("Repartitioning reads to '%d' partitions".format(args.repartition))
        loadedReads.repartition(args.repartition)
      }

    // count kmers and cache them: up to two actions below reuse the result
    val kmerCounts = reads.adamCountKmers(args.kmerLength)
    kmerCounts.cache()

    // print histogram, if requested: one (count, occurrences of that count)
    // pair per line, ordered by count
    if (args.printHistogram) {
      val histogram = kmerCounts
        .map { case (_, count) => count.toLong }
        .countByValue()
        .toSeq
        .sortBy { case (count, _) => count }
      histogram.foreach(println)
    }

    // save as text file
    kmerCounts.saveAsTextFile(args.outputPath)
  }
}