某同学的同学摩根电面的一道题目:
有一个大的text file, 里面很多英文单词,查找重复出现的单词;
最简单的解法:全部遍历。考官说太慢,为什么不使用STL的map实现呢?
如果不是单词呢?如果是url呢?为什么不使用hash_map呢?Good!
如果还是单词,能不能再快点呢?能,使用强大的Trie吧!
使用stl的map实现:
// Prints every word that occurs more than once in a fixed word list,
// counting occurrences with an ordered std::map.

#ifdef _MSC_VER
// MSVC warning C4786 ("identifier was truncated in the debug information")
// is triggered by long STL template names on old Visual C++ versions.
#pragma warning(disable : 4786)
#endif

#include <cstddef>
#include <iostream>
#include <map>
#include <string>

using namespace std;

int main()
{
    // word -> number of times it has been seen
    map<string, int> countMap;

    const string words[] = {"a","b","a","a","e","a","d","a","e","a","c"};
    const size_t wordCount = sizeof(words) / sizeof(words[0]);

    // operator[] inserts a missing key with a value-initialized (0) count,
    // so a single increment covers both first sightings and repeats.
    for (size_t i = 0; i < wordCount; ++i)
    {
        ++countMap[words[i]];
    }

    // std::map iterates in key order, so each duplicated word is printed
    // exactly once, in sorted order.
    for (map<string, int>::const_iterator it = countMap.begin();
         it != countMap.end(); ++it)
    {
        if (it->second > 1)
        {
            cout << it->first << '\n';
        }
    }

    return 0;
}
使用Trie实现,这里只是提供一个简单的版本,只处理小写的单词:
// Duplicate-word detection with a trie. This is a deliberately small version
// that only accepts words made of the lowercase letters 'a'..'z'.

#include <cstddef>
#include <cstdlib>
#include <iostream>

using namespace std;

// Number of child slots per node: one per lowercase letter.
static const int kAlphabetSize = 26;

// One trie node. The path from the root down to a node spells a prefix.
typedef struct _TrieNode
{
    char c;                                 // letter this node was created for
    unsigned int cnt;                       // number of inserted words ending at this node
    struct _TrieNode* next[kAlphabetSize];  // children indexed by (letter - 'a'); NULL when absent
}TrieNode;

// Allocates a node labelled `c` with a zero count and no children.
// Returns NULL when the allocation fails.
TrieNode* CreateTrieNode(char c)
{
    TrieNode* node = static_cast<TrieNode*>(malloc(sizeof(TrieNode)));
    if (!node)
    {
        return NULL;
    }

    node->c = c;
    node->cnt = 0;
    for (int i = 0; i < kAlphabetSize; ++i)
    {
        node->next[i] = NULL;
    }

    return node;
}

// Frees `node` and every node reachable from it. Accepts NULL.
void DestroyTrie(TrieNode* node)
{
    if (!node)
    {
        return;
    }
    for (int i = 0; i < kAlphabetSize; ++i)
    {
        DestroyTrie(node->next[i]);
    }
    free(node);
}

// Inserts `str` into the trie rooted at `root`.
//
// Returns true when `str` had already been inserted before this call, i.e.
// on the second and every later insertion of the same word.
// Returns false for a first insertion, and also when `root` or `str` is NULL,
// `str` is empty, `str` contains a character outside 'a'..'z', or a node
// cannot be allocated. In the last two cases, nodes already created for the
// leading part of the word stay in the trie; their counts are untouched, so
// they do not affect later results.
bool InsertTrie(TrieNode* root, const char* str)
{
    if (!root || !str || *str == '\0')
    {
        return false;
    }

    TrieNode* p = root;
    for (const char* s = str; *s != '\0'; ++s)
    {
        const int j = *s - 'a';
        if (j < 0 || j >= kAlphabetSize)
        {
            return false;
        }
        if (p->next[j] == NULL)
        {
            p->next[j] = CreateTrieNode(*s);
            if (p->next[j] == NULL)
            {
                return false;  // out of memory: stop instead of dereferencing NULL
            }
        }
        p = p->next[j];
    }
    p->cnt++;

    return p->cnt > 1;
}

int main()
{
    // String literals are const; binding them to plain char* is ill-formed since C++11.
    const char* words[] = {"a","bb","aba","abc","ea","ac","dbb","a","ea","ac","cc"};
    const size_t wordCount = sizeof(words) / sizeof(words[0]);

    // The root represents the empty prefix; 'R' is only a placeholder label.
    TrieNode* root = CreateTrieNode('R');
    if (!root)
    {
        cerr << "out of memory" << '\n';
        return 1;
    }

    for (size_t i = 0; i < wordCount; ++i)
    {
        // Print a word each time it shows up again after its first occurrence.
        if (InsertTrie(root, words[i]))
        {
            cout << words[i] << '\n';
        }
    }

    DestroyTrie(root);
    return 0;
}
看了Trie相关文章得到的提示:http://blog.csdn.net/v_july_v/article/details/6897097
Trie比hash的效率高~
文中:
毕