[算法 笔记] 统计给定文章的单词个数

  问题出自blog:http://blog.csdn.net/v_july_v/article/details/6803368

  问题:高效统计一篇英文文章里出现的所有单词,按照在文章中首次出现的顺序打印出该单词和它出现的次数。

  解决方式:利用Trie完成单词匹配;用链表按首次出现的顺序保存每个单词及其出现次数,Trie中每个单词的结尾结点指向对应的链表结点,再次遇到该单词时直接累加计数。

  源代码:

  

  1 #include <stdio.h>

  2 #include <stdlib.h>     // for calloc(), free()

  3 #include <string.h>     // for strlen(), memset()

  4 

  5 enum { BranchSize = 26, StringSize = 40, NodeMax = 200 }; // 声明常量

  6 

  7 /* 链表信息以及相关操作函数 */

  8 struct ListNode

  9 {

 10     int     m_iCnt;             // 统计单词出现的次数

 11     char    m_szStr[StringSize];// 当前链表项中单词

 12     struct ListNode *m_pNxt;    // 指向下一个链表结点

 13 };

 14 

 15 struct ListHead

 16 {

 17     struct ListNode *m_pStart;  // 指向链表开始结点

 18     struct ListNode *m_pEnd;    // 指向链表末尾结点

 19 };

 20 

 21 typedef struct ListNode ListNode;

 22 typedef struct ListHead ListHead;

 23 

 24 // 分配头结点

 25 ListHead* AllocListHead()

 26 {

 27     ListHead *pNew = NULL;

 28 

 29     pNew = (ListHead *) calloc( 1, sizeof( ListHead ) );

 30     if ( NULL == pNew )

 31     {

 32         printf( "Out of Memory.\n" );

 33         return NULL;

 34     }

 35 

 36     pNew->m_pEnd        = NULL;

 37     pNew->m_pStart      = NULL;

 38 

 39     return pNew;

 40 }

 41 

 42 // 分配链表结点

 43 ListNode* AllocListNode( const char *szStr )

 44 {

 45     ListNode *pNew = NULL;

 46 

 47     pNew = (ListNode *)calloc( 1, sizeof(ListNode) );

 48     if ( NULL == pNew )

 49     {

 50         printf( "Out of Memory.\n" );

 51         return NULL;

 52     }

 53 

 54     // 初始化信息

 55     pNew->m_iCnt = 1;

 56     pNew->m_pNxt = NULL;

 57     strncpy( pNew->m_szStr, szStr, strlen( szStr ) );

 58     pNew->m_szStr[strlen(szStr)] = '\0';

 59 

 60     return pNew;

 61 }

 62 

 63 // 插入链表

 64 int InsertNodeIntoList( ListHead *pHead, const char *szStr )

 65 {

 66     ListNode *pStart = pHead->m_pStart,

 67               *pNew   = NULL;

 68 

 69     // 检查参数

 70     if ( NULL == szStr )

 71     {

 72         printf( "The string is null.\n" );

 73         return -1;

 74     }

 75 

 76     // 分配新节点

 77     pNew = AllocListNode( szStr );

 78     if ( NULL == pNew )

 79     {

 80         return -1;

 81     }

 82 

 83     // 将结点插入链表尾部

 84     if ( pStart != NULL )

 85     {

 86         pHead->m_pEnd->m_pNxt = pNew;

 87         pHead->m_pEnd = pNew;

 88     }

 89     else

 90     {

 91         pHead->m_pStart = pNew;

 92         pHead->m_pEnd = pNew;

 93     }

 94 

 95     return 1;

 96 }

 97 

 98 // 摧毁链表

 99 void DestoryList( ListHead **pHead )

100 {

101     ListNode *pStart = (*pHead)->m_pStart,

102               *pFree  = NULL;

103 

104     if ( NULL == pHead )

105         return;

106 

107     while ( pStart )

108     {

109         pFree = pStart;

110         pStart= pStart->m_pNxt;

111         free( pFree );

112         pFree = NULL;

113     }

114 

115     free( *pHead );

116     *pHead = NULL;

117 }

118 

119 // 输出链表中信息

120 void OutputList( ListHead *pHead )

121 {

122     ListNode *pStart = pHead->m_pStart;

123     int sum = 0;

124 

125     printf( "About statistic:\n");

126     while ( pStart )

127     {

128         sum += pStart->m_iCnt;

129         printf( "%s:\t\t%d\n", pStart->m_szStr, pStart->m_iCnt );

130         pStart = pStart->m_pNxt;

131     }

132     printf( "The total words is %d.\n", sum );

133     printf( "\n" );

134 }

135 

136 /* Trie树结构体以及相关操作函数 */

137 struct TrieNode

138 {

139     int     m_iIsStr;       // 记录此处是否构成一个字符串。

140     struct TrieNode *m_pBranch[BranchSize]; // 指向各个子树的指针,小标0-25代表26个字符

141     struct ListNode *m_pCountInfo;  // 指向该单词的统计信息结点

142 };

143 

144 typedef struct TrieNode TrieNode;

145 

146 // 分配Trie树的新节点

147 TrieNode* AllocTrieNode()

148 {

149     TrieNode *pNew = NULL;

150     int idx = 0;

151 

152     pNew = (TrieNode *) calloc( 1, sizeof( TrieNode ) );

153     if ( NULL == pNew )

154     {

155         printf( "Out of memory.\n" );

156         return NULL;

157     }

158 

159     // initialize information.

160     for ( ; idx < BranchSize; ++idx )

161         pNew->m_pBranch[idx] = NULL;

162     pNew->m_pCountInfo  = NULL;

163     pNew->m_iIsStr      = 0;

164 

165     return pNew;

166 }

167 

168 // 在Trie树中查找单词

169 int SearchNodeInTrie( TrieNode *pRoot,

170                       const char *word )

171 {

172     TrieNode *pStart = pRoot;

173 

174     while ( *word && pStart )

175     {

176         pStart = pStart->m_pBranch[*word - 'a'];

177         ++word;

178     }

179 

180     // 在Trie树中找到szStr,则更新结点信息。

181     if ( pStart != NULL && pStart->m_iIsStr )

182     {

183         pStart->m_pCountInfo->m_iCnt++;

184         return 1;

185     }

186 

187     return 0;

188 }

189 

190 // 插入单词到Trie树中

191 int InsertNodeIntoTrie( TrieNode *pRoot,

192                         ListHead *pStart,

193                         const char *szStr )

194 {

195     TrieNode *location  = pRoot;

196     const char *word    = szStr;

197 

198     if ( SearchNodeInTrie( pRoot, szStr) == 1 )

199         return 0;

200 

201     while ( *szStr )

202     {

203         if ( location->m_pBranch[*szStr - 'a'] == NULL ) // 不存在

204         {

205             TrieNode *pNew = AllocTrieNode();

206             if ( NULL == pNew )

207                 return -1;

208             location->m_pBranch[*szStr - 'a'] = pNew;

209         }

210         // 每插入一步,相当于一个新串经过,指针要向下移动

211         location = location->m_pBranch[*szStr - 'a'];

212         ++szStr;

213     }

214     location->m_iIsStr = 1;

215     if ( InsertNodeIntoList( pStart, word ) == 1 )

216     {

217         location->m_pCountInfo = pStart->m_pEnd;

218         return 1;

219     }

220 

221     return 0;

222 }

223 

224 // 摧毁Trie树

225 void DestoryTrie( TrieNode **pRoot )

226 {

227     TrieNode *TrieStack[NodeMax],

228              *pNxt  = NULL,

229              *root  = *pRoot;

230     int top     = 0,

231         idx     = 0;

232 

233     // Initialize stack

234     for ( ; idx < NodeMax; ++idx )

235         TrieStack[idx] = NULL;

236 

237     for ( idx = 0; idx < BranchSize; ++idx )

238     {

239         if ( root->m_pBranch[idx] != NULL )

240             TrieStack[top++] = root->m_pBranch[idx];

241     }

242 

243     // 遍历Trie树,并删除

244     while ( top )

245     {

246         pNxt = TrieStack[--top];

247 

248         for ( idx = 0; idx < BranchSize; ++idx )

249         {

250             if ( pNxt->m_pBranch[idx] != NULL )

251                 TrieStack[top++] = pNxt->m_pBranch[idx];

252         }

253 

254         free( pNxt );

255         pNxt = NULL;

256     }

257 

258     free( *pRoot );

259     *pRoot = NULL;

260 }

261 

262 void TestFunction()

263 {

264     const char *pszStrs[9] =

265         {

266             "hello", "word", "hi",

267             "hello", "hello","hi",

268             "word", "word", "word"

269         };

270     int idx = 0;

271     TrieNode *pTrie = NULL;

272     ListHead *pList = NULL;

273 

274     pTrie = AllocTrieNode();

275     if ( NULL == pTrie )

276         return;

277     pList = AllocListHead();

278     if ( NULL == pList )

279     {

280         DestoryTrie( &pTrie );

281         return;

282     }

283 

284     for ( idx = 0; idx < 9; ++idx )

285     {

286         InsertNodeIntoTrie( pTrie, pList, pszStrs[idx] );

287     }

288 

289     OutputList( pList );

290 

291     DestoryTrie( &pTrie );

292     DestoryList( &pList );

293 }

294 

/* Program entry point: run the built-in demonstration and report success. */
int main()
{
    TestFunction();
    return 0;
}
View Code

 

  Trie树源码参考blog:http://www.cnblogs.com/cherish_yimi/archive/2009/10/12/1581666.html

你可能感兴趣的:(算法)