2015.2.10 05:44
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", Return: ["AAAAACCCCC", "CCCCCAAAAA"].
Solution:
This problem is a good example to teach you how to improve things step by step. My first attempt was simply a brute-force solution, using substr().
The result was, of course, MLE (Memory Limit Exceeded).
The next step is to realize that A, C, G and T can be encoded as 0, 1, 2 and 3 (two bits each), so a 10-letter string has only 4^10 = 1048576 possible values.
With this, the time bound is lowered to an acceptable level.
And the result was still MLE.
Well then, if a[1048576] is too much to bargain for, let unordered_map<int, int> take the job.
That was the right path, with a good AC to conclude.
Time and space complexity are both linear in the length of the string.
Accepted code:
//#define ZZ
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

/**
 * Finds every 10-letter substring that occurs more than once in a DNA string.
 *
 * Each nucleotide is packed into two bits (A=0, C=1, G=2, T=3), so a 10-letter
 * window becomes a 20-bit integer that is updated in O(1) as the window slides.
 * A hash map counts how many times each packed window has been seen.
 */
class Solution {
public:
    /**
     * @param s DNA string; expected to consist of the letters A, C, G and T.
     * @return Each 10-letter sequence that appears at least twice in s, listed
     *         once, in the order in which its second occurrence is found.
     *         Empty when s is shorter than 10 characters.
     *
     * Windows that contain any character other than A/C/G/T are ignored, so
     * the result only ever contains substrings that really occur in s.
     * Time and extra space are linear in the length of s.
     */
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> res;
        const size_t len = s.length();
        if (len < kWindow) {
            return res;
        }

        // Local to the call: no state survives between invocations, even if
        // an exception is thrown part-way through.
        unordered_map<int, int> seen;

        int code = 0;          // packed value of the most recent characters
        size_t valid_run = 0;  // consecutive valid nucleotides ending at i

        for (size_t i = 0; i < len; ++i) {
            const int digit = dg(s[i]);
            if (digit < 0) {
                // An invalid character poisons every window that covers it.
                valid_run = 0;
                continue;
            }

            // Shift in the new nucleotide; the mask drops the one that just
            // left the 10-letter window (and any stale bits after a reset).
            code = ((code << 2) & kMask) | digit;

            if (++valid_run < kWindow) {
                continue;  // fewer than 10 valid letters in a row so far
            }

            // Report a sequence exactly once: at its second sighting.
            if (++seen[code] == 2) {
                res.push_back(s.substr(i + 1 - kWindow, kWindow));
            }
        }
        return res;
    }

private:
    static constexpr size_t kWindow = 10;         // sequence length in letters
    static constexpr int kMask = (1 << 20) - 1;   // low 20 bits = 10 letters * 2 bits

    /** Maps a nucleotide letter to its 2-bit code, or -1 if it is not A/C/G/T. */
    static int dg(char ch) {
        switch (ch) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default:  return -1;
        }
    }
};

#ifdef ZZ
// Manual test driver: reads DNA strings from stdin, one per token, and prints
// each repeated 10-letter sequence on its own line.
int main()
{
    Solution sol;
    string s;

    while (cin >> s) {
        const vector<string> res = sol.findRepeatedDnaSequences(s);
        for (const string &seq : res) {
            cout << seq << '\n';
        }
    }

    return 0;
}
#endif