trie树找出一堆字符串的前缀机器重复次数。
重点在insert, travel_node, 析构函数
class EmailNameChar { //a-z A-Z 0-9 . - _ 为用户名字符 //0-9, 用0-9表示 //a-z, A-Z用10-35表示, 不区分大小写 //.36 -37 _38 char _c; public: EmailNameChar() { _c = 39; } static bool index(char c, char &idx) { if(c >= 'A' && c <= 'Z') idx = c - 'A' + 10; else if(c >= 'a' && c <= 'z') idx = c - 'a' + 10; else if(c >= 0 && c <= 9) idx = c; else if(c == '.') idx = 36; else if(c == '-') idx = 37; else if(c == '_') idx = 38; else return false; return true; } static bool idxToc(char idx, char &c) { if(idx <= 9) c = idx - 0 + '0'; else if(idx <= 35) c = idx - 10 + 'a'; else if(idx == 36) c = '.'; else if(idx == 37) c = '-'; else if(idx == 38) c = '_'; else return false; return true; } bool set(char c) { if(index(c, _c)) { return true; } else { return false; } } bool get(char &c) const { if(idxToc(_c, c)) return true; return false; } }; const static int num_chars = 39; struct PrefixInfo { int prefixlen; int matchtimes; string result; //prefix PrefixInfo(): prefixlen(0), matchtimes(0), result(), spamscore(-1) {} PrefixInfo(int dep, int c, const string &r): prefixlen(dep), matchtimes(c), result(r),spamscore(-1) {} double spamicity(int tocount, unsigned int maxtolen) { if(spamscore < 0) { if(tocount < PrefixInfo::MINTOCOUNT) spamscore = 0; else if(matchtimes >= PrefixInfo::MINREPEAT && prefixlen >= PrefixInfo::MINPREFIXLEN) spamscore = 1; else spamscore = 1 - (1 - matchtimes / (tocount + 0.0)) * (1 - prefixlen / (maxtolen + 0.0)); } return spamscore; } const static int MINTOCOUNT = 4; const static int MINPREFIXLEN = 5; const static int MINREPEAT = 5; private: double spamscore; }; struct PrefixInfoFilter { virtual bool operator()(const PrefixInfo & l) const {return true;} virtual ~PrefixInfoFilter(){} }; struct PrefixInfoCountFilter: public PrefixInfoFilter { bool operator()(const PrefixInfo & l) const { if(l.matchtimes <= 2) return false; if(l.prefixlen <= 3) return false; return true; } }; class Trie { protected: struct Trie_node { int count; Trie_node *branch[num_chars]; Trie_node() { count = 1; for(int i = 0; i < num_chars; i++) branch[i] = 0; } }; void delete_node(Trie_node * node) { if(node == 0) return; for(int i = 0; i < num_chars; i++) { delete_node(node->branch[i]); } delete node; } public: Trie():root(0), tocount(0), maxtolength(0) { } ~Trie() { delete_node(root); } //-1 failure //1 uniq word int insert(const string &to) { int np; if((np = to.find('@')) == string::npos) { cerr<<"invalid to address: " << to << endl; return -1; //invalid to address } string name = to.substr(0, np); const char *word = name.c_str(); int result = 1, position = 0; if(root == 0) root = new Trie_node; char char_code; Trie_node *location = root; while(location != 0 && *word != 0) { if(EmailNameChar::index(*word, char_code) == false) { cerr<<"invalid char: " << int(*word) <<". in to address: " << to<< endl; return -1; } if(location->branch[char_code] == 0) location->branch[char_code] = new Trie_node; else location->branch[char_code]->count++; location = location->branch[char_code]; position++; word++; } if(to.length() > maxtolength) maxtolength = to.length(); ++tocount; return result; } int travel(vector<PrefixInfo> &vpi, const PrefixInfoFilter& f) const { if(root == 0) return -1; char result[maxtolen]; for(int i = 0; i < num_chars; i++) travel_node(vpi, i, root->branch[i], 0, f, result); return tocount; } int gettocount() const { return tocount; } int getmaxtolength() const { return maxtolength; } private: void travel_node(vector<PrefixInfo> &vpi, int idx, const Trie_node * const node, int depth, const PrefixInfoFilter& f, char *result = 0) const { if(node == 0) return; if(depth >= maxtolen - 1) return; if(result) { char c; if(EmailNameChar::idxToc(idx, c) == false) { assert(0); } result[depth] = c, result[depth+1] = 0; } ++depth; bool isPrefix = true; bool isTail = true; for(int i = 0; i < num_chars; i++) { if(node->branch[i]) { isTail = false; if(node->count <= node->branch[i]->count) { isPrefix = false; break; } } } PrefixInfo curnode(depth, node->count, result); if(!isTail && isPrefix && f(curnode)) vpi.push_back(curnode); for(int i = 0; i < num_chars; i++) { travel_node(vpi, i, node->branch[i], depth, f, result); } } protected: Trie_node *root; //根 int tocount; //树中已有用户个数 unsigned int maxtolength; //树中已知用户名的最大长度 const static int maxtolen = 200; //一个用户名的最大长度 };