作业要求:
参见博客:http://www.cnblogs.com/jiel/p/3311400.html
设计思路:
定义一个词典类,实现查找,插入,排序,输出等操作。
对于扩展要求 -e,在统计完成后对词典执行一次合并(merge)操作即可。
已经实现功能:
- 统计词频,按照出现次数由高到低输出到文件;
- 普通操作中,大小写不同算作同一个单词,输出时取ASCII序最小的写法;词频相同的单词按ASCII序排列;
- -e 操作中,尾缀(仅数字)不一样算作同一单词,按照ASCII序列统计输出。
代码:
GitHub: https://github.com/chynphh/Word-frequency-program
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Initial capacity reserved for the dictionary. Storage grows on demand, so
// this is only a sizing hint and no longer a hard limit.
const int LIST_MAX = 10000;

// Number of live dictionary entries. Dictionary::put increments it when a new
// word is appended; main() lowers it after a merge so that the zero-frequency
// leftovers (sorted to the tail) are not written out.
static int num = 0;

namespace {

// Words shorter than this are ignored by check().
const std::size_t kMinWordLength = 4;

// ASCII-only classification: bytes outside these ranges act as separators.
bool is_ascii_digit(char c) { return c >= '0' && c <= '9'; }

bool is_ascii_letter(char c) {
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

bool is_ascii_alnum(char c) { return is_ascii_digit(c) || is_ascii_letter(c); }

// Lower-cases one character; the unsigned char cast keeps std::tolower defined
// for every byte value.
char lower_char(char c) {
    return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
}

// Returns a lower-cased copy of `text`.
std::string to_lower_copy(std::string text) {
    std::transform(text.begin(), text.end(), text.begin(), lower_char);
    return text;
}

// Case-insensitive equality without allocating temporary strings.
bool equal_ignore_case(const std::string& a, const std::string& b) {
    if (a.size() != b.size()) return false;
    for (std::size_t i = 0; i < a.size(); ++i) {
        if (lower_char(a[i]) != lower_char(b[i])) return false;
    }
    return true;
}

}  // namespace

// Used by the -e mode: returns `w` with its trailing run of digits removed
// (e.g. "win32" -> "win"). A word without a numeric suffix is returned as is.
std::string extract(const std::string& w) {
    std::size_t end = w.size();
    while (end > 0 && is_ascii_digit(w[end - 1])) --end;
    return w.substr(0, end);
}

// Word-frequency dictionary: lookup, insertion, sorting, output and the
// suffix merge used by the -e mode. The entry count lives in the file-level
// `num`.
class Dictionary {
public:
    Dictionary() { entries_.reserve(static_cast<std::size_t>(LIST_MAX)); }

    // Returns the index of the entry equal to `w` ignoring case, or -1.
    int find(const std::string& w) const;
    // Counts one occurrence of `w` at index `loc`; `loc == num` appends.
    void put(const std::string& w, int loc);
    // Orders entries by descending frequency, then ascending ASCII order.
    void sort();
    // Writes the first `num` entries to "output.txt".
    void out();
    // Merges entries that differ only by a numeric suffix (-e mode) and
    // returns how many entries were folded into another one.
    int merge();

private:
    struct Entry {
        std::string word;
        int freq = 0;
    };

    // Number of entries that may be touched safely: `num`, clamped to the
    // storage actually allocated.
    std::size_t live() const {
        const std::size_t wanted = num > 0 ? static_cast<std::size_t>(num) : 0;
        return std::min(wanted, entries_.size());
    }

    std::vector<Entry> entries_;
};

int Dictionary::find(const std::string& w) const {
    const std::size_t count = live();
    for (std::size_t i = 0; i < count; ++i) {
        if (equal_ignore_case(w, entries_[i].word)) return static_cast<int>(i);
    }
    return -1;
}

void Dictionary::put(const std::string& w, int loc) {
    if (loc < 0 || loc > num) return;  // outside the dictionary: ignore
    const std::size_t index = static_cast<std::size_t>(loc);
    if (index >= entries_.size()) entries_.resize(index + 1);

    Entry& entry = entries_[index];
    ++entry.freq;
    if (loc == num) {
        // New word: store it and grow the live range.
        entry.word = w;
        ++num;
    } else if (w < entry.word) {
        // Among case variants keep the ASCII-smallest spelling for display.
        entry.word = w;
    }
}

void Dictionary::sort() {
    const std::vector<Entry>::iterator first = entries_.begin();
    const std::vector<Entry>::iterator last =
        first + static_cast<std::ptrdiff_t>(live());
    std::sort(first, last, [](const Entry& a, const Entry& b) {
        if (a.freq != b.freq) return a.freq > b.freq;
        // Entries emptied by merge() go after every real word in a tie.
        if (a.word.empty() != b.word.empty()) return b.word.empty();
        return a.word < b.word;
    });
}

void Dictionary::out() {
    std::ofstream fout("output.txt");
    if (!fout.is_open()) {
        std::cout << "file can not open" << std::endl;
        return;
    }
    const std::size_t count = live();
    for (std::size_t i = 0; i < count; ++i) {
        fout << entries_[i].word << " : " << entries_[i].freq << '\n';
    }
}

int Dictionary::merge() {
    int merged = 0;
    // Lower-cased stem -> index of the most recent entry carrying that stem.
    std::unordered_map<std::string, std::size_t> last_with_stem;
    const std::size_t count = live();
    for (std::size_t j = 0; j < count; ++j) {
        const std::string stem = to_lower_copy(extract(entries_[j].word));
        const std::unordered_map<std::string, std::size_t>::iterator hit =
            last_with_stem.find(stem);
        if (hit == last_with_stem.end()) {
            last_with_stem.emplace(stem, j);
            continue;
        }
        // Fold the earlier entry into this one, so the last entry of each
        // stem group ends up with the total count and the smallest spelling.
        Entry& older = entries_[hit->second];
        Entry& newer = entries_[j];
        newer.freq += older.freq;
        older.freq = 0;
        if (older.word < newer.word) {
            newer.word = older.word;
            older.word.clear();
        }
        ++merged;
        hit->second = j;
    }
    return merged;
}

// Extracts the first word from `*s` and removes it (plus everything before
// it) from `*s`. A word starts at an ASCII letter and continues over ASCII
// letters and digits; candidates shorter than four characters are skipped.
// Returns "" when no word is left. In that case `*s` holds the unscanned
// tail of fewer than four characters, or is empty if no letter was found.
std::string check(std::string* s) {
    if (s == nullptr) return std::string();

    const std::string& text = *s;
    const std::size_t len = text.size();
    std::size_t pos = 0;
    // Iterative scan: no recursion and no copying per skipped token.
    while (len - pos >= kMinWordLength) {
        std::size_t start = pos;
        while (start < len && !is_ascii_letter(text[start])) ++start;
        if (start == len) {
            s->clear();
            return std::string();
        }
        std::size_t end = start + 1;
        while (end < len && is_ascii_alnum(text[end])) ++end;
        if (end - start >= kMinWordLength) {
            std::string word = text.substr(start, end - start);
            s->erase(0, end);
            return word;
        }
        pos = end;  // too short: keep scanning after this token
    }
    s->erase(0, pos);
    return std::string();
}

// Usage: <program> [-e] filename
// Counts the words of `filename` and writes "word : count" lines to
// output.txt, most frequent first. With -e, words that differ only by a
// numeric suffix are counted as one word. Returns non-zero on a usage error
// or when the input file cannot be opened.
int main(int argc, char const *argv[]) {
    const bool merge_suffixes = (argc == 3 && std::string(argv[1]) == "-e");
    if (argc < 2 || argc > 3 || (argc == 3 && !merge_suffixes)) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "wordfreq")
                  << " [-e] filename" << '\n';
        return EXIT_FAILURE;
    }

    std::ifstream fin(argv[argc - 1]);
    if (!fin.is_open()) {
        std::cout << "file can not open" << std::endl;
        return EXIT_FAILURE;
    }

    Dictionary dictionary;
    std::string line;
    // Loop on getline's result so that a failed read ends the loop.
    while (std::getline(fin, line)) {
        for (std::string w = check(&line); !w.empty(); w = check(&line)) {
            const int loc = dictionary.find(w);
            dictionary.put(w, loc == -1 ? num : loc);
        }
    }

    const int cut = merge_suffixes ? dictionary.merge() : 0;
    dictionary.sort();
    // Merged-away entries have frequency 0 and sort to the tail; hide them.
    num -= cut;
    dictionary.out();
    return 0;
}
数据:
GitHub上有10组测试数据,input1-4是china daily中的新闻,input5和input6是普通英语作文,input7-10是为了测试功能自编数据(包括分隔符、大小写、尾缀等测试)。output-e*.txt是-e功能下的输出,output*.txt是普通输出,均与输入数据对应。
使用说明:
命令行输入:词频统计.exe -e filename.txt 或者 词频统计.exe filename.txt
结果将输出至:output.txt
输出截图:
用时:
原计划全部用时3小时,实际用时5小时。
分工:
程昊:代码编写与博客编写
陈金满:代码测试