题目:
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", Return: ["AAAAACCCCC", "CCCCCAAAAA"].
题解:
用Python Dict直接将每一个长度为10的子串作为key保存出现次数。但Java若用字符串作为key则无法通过memory limit,需要将"A", "C", "G", "T"分别映射为2 bit的编码0x0, 0x1, 0x2, 0x3;10个字母共占20 bit,因此可以用int的低20 bit(每次左移2位后与0xFFFFF按位与)作为key。
C++版:
/// Finds the 10-letter substrings that appear more than once in a DNA string.
class Solution {
public:
    /// @brief Collects every 10-letter window of `s` that occurs at least twice.
    /// @param s Input sequence; 'A', 'C', 'G', 'T' are encoded as 0..3. Any
    ///          other character is encoded as 0, i.e. the same as 'A'.
    /// @return Each repeated window exactly once, appended at the moment its
    ///         second occurrence is seen. Empty when `s` has 10 or fewer
    ///         characters.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        constexpr std::size_t kWindow = 10;  // letters per sequence
        constexpr int kMask = 0xFFFFF;       // 10 letters * 2 bits = low 20 bits

        std::vector<std::string> repeated;
        if (s.size() <= kWindow) {
            return repeated;
        }

        std::unordered_map<int, int> occurrences;
        int packed = 0;
        for (std::size_t pos = 0; pos < s.size(); ++pos) {
            int code = 0;
            const char letter = s[pos];
            if (letter == 'C') {
                code = 0x1;
            } else if (letter == 'G') {
                code = 0x2;
            } else if (letter == 'T') {
                code = 0x3;
            }

            // Shift the newest letter in and drop the one that left the window.
            packed = ((packed << 2) | code) & kMask;

            // The first complete window ends at index kWindow - 1.
            if (pos + 1 < kWindow) {
                continue;
            }

            // emplace leaves an existing entry untouched and tells us whether
            // the key was new, so one lookup covers both cases.
            const auto slot = occurrences.emplace(packed, 1);
            if (!slot.second && slot.first->second == 1) {
                repeated.push_back(s.substr(pos + 1 - kWindow, kWindow));
                slot.first->second = 2;  // mark as reported
            }
        }
        return repeated;
    }
};
/** Java version: finds the 10-letter substrings that occur more than once. */
public class Solution {
    /**
     * Returns every 10-letter substring of {@code s} that occurs more than once.
     *
     * @param s the DNA string to scan
     * @return each repeated 10-letter substring exactly once, added when its
     *         second occurrence is seen; empty when {@code s} has 10 or fewer
     *         characters
     */
    public List<String> findRepeatedDnaSequences(String s) {
        final int window = 10;
        List<String> result = new ArrayList<>();
        if (s.length() <= window) {
            return result;
        }

        Map<String, Integer> count = new HashMap<>();
        for (int i = 0; i + window <= s.length(); i++) {
            // substring's end index is exclusive, so [i, i + window) is the
            // full 10-letter window (i + 9 would yield only 9 letters).
            String current = s.substring(i, i + window);
            Integer seen = count.get(current);
            if (seen == null) {
                count.put(current, 1);
            } else if (seen == 1) {
                result.add(current);
                count.put(current, 2); // mark as reported
            }
        }
        return result;
    }
}
class Solution:
    def findRepeatedDnaSequences(self, s):
        """Return every 10-letter substring of ``s`` that occurs more than once.

        @param {string} s: the DNA string to scan
        @return {string[]}: each repeated 10-letter substring once, in the
            iteration order of the counting dict (first-occurrence order on
            Python 3.7+); empty when ``s`` has 10 or fewer characters
        """
        window = 10
        if len(s) <= window:
            return []

        # Count every 10-letter window, keyed by the substring itself.
        counts = {}
        for start in range(len(s) - window + 1):
            piece = s[start:start + window]
            counts[piece] = counts.get(piece, 0) + 1

        return [piece for piece, seen in counts.items() if seen > 1]