LeetCode Repeated DNA Sequences

 1 class Solution {

 2 private:

 3     char tbl[256];

 4 public:

 5     vector<string> findRepeatedDnaSequences(string s) {

 6         vector<string> res;

 7         

 8         int len = s.size();

 9         if (len < 10) {

10             return res;

11         }

12         vector<bool> exist(1<<20, false);

13         vector<bool> add(1<<20, false);

14         

15         tbl['A'] = 0x00;

16         tbl['C'] = 0X01;

17         tbl['G'] = 0x02;

18         tbl['T'] = 0x03;

19         

20         int mask= (1<<20) - 1;

21         int pattern = 0;

22         

23         for (int i=0; i<10; i++) {

24             pattern = mask & ((pattern << 2) | tbl[s[i]]);

25         }

26         exist[pattern] = true;

27         

28         for (int i=10; i<len; i++) {

29             int start = i - 10 + 1;

30             pattern = mask & ((pattern << 2) | tbl[s[i]]);

31             if (exist[pattern] && !add[pattern]) {

32                 res.push_back(s.substr(start, 10));

33                 add[pattern] = true;

34             } else {

35                 exist[pattern] = true;

36             }

37         }

38         return res;

39     }

40 };

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

For example,

Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",



Return:

["AAAAACCCCC", "CCCCCAAAAA"].

将连续的10个DNA碱基组成的序列看作是一个10位的4进制数,这样的数共有4^10=2^20个。用两个vector<bool>分别表示该序列是否已经出现过、是否已经添加到结果中即可。

第二轮:

简化一下用一个hashmap去存,不过发现速度下降很多。因为vector<bool>是一个特化模板,每个元素实际只占一个bit的空间,相比用整数来存状态,占用的空间少很多;而且2^20个bit也就是128KB,两个vector合计约256KB的空间占用。综合起来这个版本反而倒退了,卧槽:

// Hash-map variant of the repeated-DNA-sequence search.
//
// Every 10-letter window is encoded as a 20-bit integer (2 bits per letter)
// and used as the key of a map whose value tracks the window's state:
//   0  -> never seen, 1 -> seen once, -1 -> already added to the result.
class Solution {
public:
    // Returns each 10-letter substring of `s` that appears at least twice,
    // exactly once, in the order in which its second occurrence is found.
    // Windows that contain a character other than A/C/G/T are never reported.
    // Strings shorter than 10 characters yield an empty result.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        std::vector<std::string> res;

        const int window = 10;
        const int len = static_cast<int>(s.size());
        // Without this guard the priming of the first window would read past
        // the end of a short string.
        if (len < window) {
            return res;
        }

        // Letter -> 2-bit code; -1 marks bytes that are not nucleotides.
        int mappings[256];
        for (int i = 0; i < 256; ++i) {
            mappings[i] = -1;
        }
        mappings['A'] = 0x0;
        mappings['C'] = 0x1;
        mappings['G'] = 0x2;
        mappings['T'] = 0x3;

        // 2bit * 10 = 20bit
        const int mask = 0x000fffff;
        std::unordered_map<int, int> cache;

        int hash = 0;  // rolling encoding of the current window
        int run = 0;   // consecutive valid nucleotides ending at `pos`

        for (int pos = 0; pos < len; ++pos) {
            // Cast so that bytes >= 0x80 cannot produce a negative index.
            const int code = mappings[static_cast<unsigned char>(s[pos])];
            if (code < 0) {
                // A non-nucleotide byte invalidates every window covering it.
                hash = 0;
                run = 0;
                continue;
            }

            hash = mask & ((hash << 2) | code);
            if (++run < window) {
                continue;  // window not full yet
            }

            // One lookup per window; operator[] value-initializes to 0.
            int& state = cache[hash];
            if (state == 0) {
                state = 1;
            } else if (state > 0) {
                res.push_back(s.substr(pos - window + 1, window));
                state = -1;
            }
        }
        return res;
    }
};

 

你可能感兴趣的:(LeetCode)