问题出自blog:http://blog.csdn.net/v_july_v/article/details/6803368
问题:高效统计一篇英文文章里出现的所有单词,按照在文章中首次出现的顺序打印出该单词和它出现的次数。
解决方式:利用Trie完成单词的查找与匹配,并用链表按单词首次出现的顺序保存每个单词及其出现次数。
源代码:
/*
 * Word-frequency counter.
 *
 * A trie keyed on the lowercase letters 'a'..'z' is used to recognise words
 * that have already been seen; a singly linked list keeps one node per
 * distinct word, in order of first appearance, together with its count.
 * Each terminal trie node points at the list node of its word.
 *
 * Define WORD_COUNT_NO_MAIN when compiling to omit main(), e.g. to link these
 * routines into another program or a test harness.
 */
#include <stdio.h>
#include <stdlib.h> /* calloc(), free() */
#include <string.h> /* strlen(), memcpy() */

enum
{
    BranchSize = 26, /* one trie branch per letter 'a'..'z' */
    StringSize = 40, /* capacity of a stored word, including the '\0' */
    NodeMax    = 200 /* not used by the code below; kept so the name stays defined */
};

/* ---------------------------------------------------------------------- */
/* Linked list of per-word statistics                                     */
/* ---------------------------------------------------------------------- */

struct ListNode
{
    int              m_iCnt;              /* number of occurrences of the word */
    char             m_szStr[StringSize]; /* the word itself, '\0'-terminated  */
    struct ListNode *m_pNxt;              /* next node, or NULL at the tail    */
};

struct ListHead
{
    struct ListNode *m_pStart; /* first node, NULL when the list is empty */
    struct ListNode *m_pEnd;   /* last node, NULL when the list is empty  */
};

typedef struct ListNode ListNode;
typedef struct ListHead ListHead;

/*
 * Allocate an empty list head.
 * Returns the new head, or NULL (after printing a message) when out of memory.
 * The caller releases it with DestoryList().
 */
ListHead *AllocListHead( void )
{
    ListHead *pNew = calloc( 1, sizeof *pNew );

    if ( NULL == pNew )
    {
        printf( "Out of Memory.\n" );
        return NULL;
    }

    pNew->m_pEnd   = NULL;
    pNew->m_pStart = NULL;

    return pNew;
}

/*
 * Allocate a list node for szStr with an initial count of 1.
 * At most StringSize - 1 characters of szStr are stored; anything longer is
 * truncated so the fixed-size buffer can never be overrun.
 * Returns the new node, or NULL when szStr is NULL or memory is exhausted.
 */
ListNode *AllocListNode( const char *szStr )
{
    ListNode *pNew = NULL;
    size_t    len  = 0;

    if ( NULL == szStr )
        return NULL;

    pNew = calloc( 1, sizeof *pNew );
    if ( NULL == pNew )
    {
        printf( "Out of Memory.\n" );
        return NULL;
    }

    len = strlen( szStr );
    if ( len >= (size_t) StringSize )
        len = (size_t) StringSize - 1;

    pNew->m_iCnt = 1;
    pNew->m_pNxt = NULL;
    memcpy( pNew->m_szStr, szStr, len );
    pNew->m_szStr[len] = '\0';

    return pNew;
}

/*
 * Append a new node holding szStr (count 1) to the tail of the list.
 * Returns 1 on success and -1 when an argument is NULL or allocation fails.
 */
int InsertNodeIntoList( ListHead *pHead, const char *szStr )
{
    ListNode *pNew = NULL;

    if ( NULL == pHead )
        return -1;

    if ( NULL == szStr )
    {
        printf( "The string is null.\n" );
        return -1;
    }

    pNew = AllocListNode( szStr );
    if ( NULL == pNew )
        return -1;

    if ( pHead->m_pStart != NULL )
        pHead->m_pEnd->m_pNxt = pNew; /* non-empty list: link after the tail */
    else
        pHead->m_pStart = pNew;       /* empty list: new node is also the head */
    pHead->m_pEnd = pNew;

    return 1;
}

/*
 * Free every node of the list and the head itself, then set *pHead to NULL.
 * Passing NULL, or a pointer to a NULL head, is a no-op.
 */
void DestoryList( ListHead **pHead )
{
    ListNode *pCur = NULL;

    /* Validate before dereferencing either level of indirection. */
    if ( NULL == pHead || NULL == *pHead )
        return;

    pCur = ( *pHead )->m_pStart;
    while ( pCur )
    {
        ListNode *pFree = pCur;

        pCur = pCur->m_pNxt;
        free( pFree );
    }

    free( *pHead );
    *pHead = NULL;
}

/*
 * Print every word with its count, in list order, followed by the sum of all
 * counts. A NULL head is printed as an empty list.
 */
void OutputList( ListHead *pHead )
{
    ListNode *pCur = ( pHead != NULL ) ? pHead->m_pStart : NULL;
    int       sum  = 0;

    printf( "About statistic:\n" );
    while ( pCur )
    {
        sum += pCur->m_iCnt;
        printf( "%s:\t\t%d\n", pCur->m_szStr, pCur->m_iCnt );
        pCur = pCur->m_pNxt;
    }
    printf( "The total words is %d.\n", sum );
    printf( "\n" );
}

/* ---------------------------------------------------------------------- */
/* Trie                                                                   */
/* ---------------------------------------------------------------------- */

struct TrieNode
{
    int              m_iIsStr;              /* non-zero when a word ends at this node      */
    struct TrieNode *m_pBranch[BranchSize]; /* children; index 0..25 stands for 'a'..'z'   */
    struct ListNode *m_pCountInfo;          /* statistics node of the word that ends here  */
};

typedef struct TrieNode TrieNode;

/*
 * Map a character to its branch index.
 * Returns 0..25 for 'a'..'z' and -1 for any other character, so callers never
 * index m_pBranch out of bounds.
 */
static int BranchIndex( char ch )
{
    return ( ch >= 'a' && ch <= 'z' ) ? ch - 'a' : -1;
}

/*
 * Check that a word can be stored: only 'a'..'z' and fewer than StringSize
 * characters, so it fits a list node without truncation.
 * Returns 1 when storable, 0 otherwise.
 */
static int IsStorableWord( const char *szStr )
{
    size_t len = 0;

    for ( ; szStr[len] != '\0'; ++len )
    {
        if ( len + 1 >= (size_t) StringSize || BranchIndex( szStr[len] ) < 0 )
            return 0;
    }

    return 1;
}

/* Free pNode and everything below it. */
static void FreeTrieSubtree( TrieNode *pNode )
{
    int idx = 0;

    if ( NULL == pNode )
        return;

    for ( idx = 0; idx < BranchSize; ++idx )
        FreeTrieSubtree( pNode->m_pBranch[idx] );

    free( pNode );
}

/*
 * Allocate a trie node with no children and no word ending at it.
 * Returns the new node, or NULL (after printing a message) when out of memory.
 */
TrieNode *AllocTrieNode( void )
{
    TrieNode *pNew = calloc( 1, sizeof *pNew );
    int       idx  = 0;

    if ( NULL == pNew )
    {
        printf( "Out of memory.\n" );
        return NULL;
    }

    for ( idx = 0; idx < BranchSize; ++idx )
        pNew->m_pBranch[idx] = NULL;
    pNew->m_pCountInfo = NULL;
    pNew->m_iIsStr     = 0;

    return pNew;
}

/*
 * Look word up in the trie and, when it is present, increment its count.
 * Returns 1 when the word was found (and counted), 0 otherwise. A NULL
 * argument or a word containing a character outside 'a'..'z' is "not found".
 */
int SearchNodeInTrie( TrieNode *pRoot,
                      const char *word )
{
    TrieNode *pNode = pRoot;

    if ( NULL == word )
        return 0;

    while ( *word && pNode )
    {
        int idx = BranchIndex( *word );

        if ( idx < 0 )
            return 0;
        pNode = pNode->m_pBranch[idx];
        ++word;
    }

    if ( pNode != NULL && pNode->m_iIsStr && pNode->m_pCountInfo != NULL )
    {
        pNode->m_pCountInfo->m_iCnt++;
        return 1;
    }

    return 0;
}

/*
 * Record one occurrence of szStr.
 *
 * If the word is already in the trie its count is incremented; otherwise the
 * word is added to the trie and appended to the list with a count of 1.
 *
 * Returns  1 when a new word was inserted,
 *          0 when the word already existed (its count was incremented),
 *         -1 on error: NULL argument, a character outside 'a'..'z', a word of
 *            StringSize or more characters, or out of memory.
 */
int InsertNodeIntoTrie( TrieNode *pRoot,
                        ListHead *pStart,
                        const char *szStr )
{
    TrieNode   *location = pRoot;
    const char *cursor   = szStr;

    if ( NULL == pRoot || NULL == pStart || NULL == szStr )
        return -1;

    /* Reject unusable words before touching the trie. */
    if ( !IsStorableWord( szStr ) )
        return -1;

    if ( SearchNodeInTrie( pRoot, szStr ) == 1 )
        return 0;

    for ( ; *cursor; ++cursor )
    {
        int idx = BranchIndex( *cursor );

        if ( NULL == location->m_pBranch[idx] )
        {
            TrieNode *pNew = AllocTrieNode();

            if ( NULL == pNew )
                return -1;
            location->m_pBranch[idx] = pNew;
        }
        location = location->m_pBranch[idx];
    }

    /*
     * Mark the node as a word only once its statistics node exists, so a
     * terminal node always has a valid m_pCountInfo.
     */
    if ( InsertNodeIntoList( pStart, szStr ) != 1 )
        return -1;

    location->m_pCountInfo = pStart->m_pEnd;
    location->m_iIsStr     = 1;

    return 1;
}

/*
 * Free the whole trie and set *pRoot to NULL.
 * Passing NULL, or a pointer to a NULL root, is a no-op.
 *
 * The tree is released recursively: InsertNodeIntoTrie() only stores words
 * shorter than StringSize, so the recursion depth stays below StringSize,
 * whereas the number of pending nodes has no comparably small bound.
 */
void DestoryTrie( TrieNode **pRoot )
{
    if ( NULL == pRoot || NULL == *pRoot )
        return;

    FreeTrieSubtree( *pRoot );
    *pRoot = NULL;
}

/* Count a small fixed word sequence and print the statistics. */
void TestFunction( void )
{
    const char *pszStrs[] =
    {
        "hello", "word",  "hi",
        "hello", "hello", "hi",
        "word",  "word",  "word"
    };
    const size_t count = sizeof pszStrs / sizeof pszStrs[0];
    size_t       idx   = 0;
    TrieNode    *pTrie = NULL;
    ListHead    *pList = NULL;

    pTrie = AllocTrieNode();
    if ( NULL == pTrie )
        return;
    pList = AllocListHead();
    if ( NULL == pList )
    {
        DestoryTrie( &pTrie );
        return;
    }

    for ( idx = 0; idx < count; ++idx )
        InsertNodeIntoTrie( pTrie, pList, pszStrs[idx] );

    OutputList( pList );

    DestoryTrie( &pTrie );
    DestoryList( &pList );
}

#ifndef WORD_COUNT_NO_MAIN
int main( void )
{
    TestFunction();

    return 0;
}
#endif /* WORD_COUNT_NO_MAIN */
Trie树源码参考blog:http://www.cnblogs.com/cherish_yimi/archive/2009/10/12/1581666.html