[LeetCode] Repeated DNA Sequences hash map

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

For example,

Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",



Return:

["AAAAACCCCC", "CCCCCAAAAA"].

 

Hide Tags
  Hash Table Bit Manipulation
 

  C++ 标准模板库不常用就容易忘,这个就是用hash map 做一个大表统计的,但是直接unordered_map<string, int > 这样会爆内存。
// Straightforward hash-map solution: every 10-character window of the input
// is stored as a std::string key together with its occurrence count.
class Solution {
public:
    // Returns every 10-letter substring of `s` that occurs more than once.
    // Each repeated substring appears exactly once in the result, in the
    // order in which its second occurrence is encountered.
    // Inputs shorter than 10 characters yield an empty vector.
    vector<string> findRepeatedDnaSequences(string s) {
        const size_t kWindow = 10;
        unordered_map<string, int> occurrences;
        vector<string> ret;
        // `i + kWindow <= length` keeps the comparison in unsigned
        // arithmetic without underflow on short inputs.
        for (size_t i = 0; i + kWindow <= s.length(); ++i) {
            const string window = s.substr(i, kWindow);
            // Report a window only when its count reaches 2; third and later
            // occurrences must not add duplicate entries to the result.
            if (++occurrences[window] == 2) {
                ret.push_back(window);
            }
        }
        return ret;
    }
};

 

  处理方法是将其改为 unordered_map<int, int>,通过 4 进制转换把每个 10 字符子串编码成整数。另外还可以通过 bitset 进一步降低内存。最后需要考虑重复问题:如果用 unordered_map,可以直接标记是否已经添加到返回的 vector 中;用 bitset 则可以通过临时变量 set<string> 存储,最后生成返回的 vector。

 

#include <iostream>

#include <string>

#include <vector>

#include <unordered_map>

#include <bitset>

#include <set>

using namespace std;



//class Solution {

//public:

//    vector<string> findRepeatedDnaSequences(string s) {

//        unordered_map<string,int > mp;

//        int len = s.length(),curIdx = 0;

//        string curStr;

//        vector<string >ret;

//        while(curIdx + 10<=len){

//            curStr = s.substr(curIdx,10);

//            if(mp.find(curStr)!=mp.end()){

//                ret.push_back(curStr);

//            }

//            else

//                mp[curStr] = 1;

//            curIdx ++;

//        }

//        return ret;

//    }

//};



// Memory-lean solution: each 10-letter window is encoded as a base-4 number
// (2 bits per nucleotide, 20 bits total) and tracked in a bitset indexed by
// that code instead of storing the substrings themselves.
class Solution {
public:
    // Returns every 10-letter substring of `s` that occurs more than once.
    // Results are unique and sorted lexicographically (they are collected in
    // a std::set). Inputs shorter than 10 characters yield an empty vector.
    // Characters other than A/C/G/T are never part of a reported window.
    vector<string> findRepeatedDnaSequences(string s) {
        const size_t kWindow = 10;
        const int kMask = (1 << 20) - 1;   // keeps the low 20 bits = 10 base-4 digits

        bitset<1048576> seen;              // 4^10 possible window codes; starts all-zero
        set<string> ret;

        int code = 0;     // rolling base-4 encoding of the most recent characters
        size_t run = 0;   // number of consecutive valid nucleotides ending at i

        for (size_t i = 0; i < s.length(); ++i) {
            const int digit = helpFun(s[i]);
            if (digit < 0) {
                // An unknown character breaks the window; start over after it.
                run = 0;
                code = 0;
                continue;
            }
            // Shift in the new digit and drop the one that left the window.
            code = ((code << 2) | digit) & kMask;
            if (++run < kWindow)
                continue;   // not enough valid characters for a full window yet

            if (seen[code])
                ret.insert(s.substr(i + 1 - kWindow, kWindow));
            else
                seen.set(code);
        }
        return vector<string>(ret.begin(), ret.end());
    }

    // Maps a nucleotide letter to its base-4 digit (A=0, C=1, G=2, T=3).
    // Returns -1 for any other character.
    int helpFun(char c)
    {
        switch (c) {
            case 'A':   return 0;
            case 'C':   return 1;
            case 'G':   return 2;
            case 'T':   return 3;
            default:    return -1;
        }
    }
};



int main()

{

    string s= "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT";

    Solution sol;

    vector<string > ret = sol.findRepeatedDnaSequences(s);

    for(int i=0;i<ret.size();i++)

        cout<<ret[i]<<endl;

    return 0;

}

 

 

你可能感兴趣的:(LeetCode)