使用C++实现mmseg,对中文句子进行分词

想要了解更多,可以看我在OSCHINA分享的代码,

http://www.oschina.net/action/code/download?code=7969&id=12697


为了简化实现,这段代码假设传进来的参数都是纯中文字符串,也就是不含字母、数字之类的字符。 


例如 string test="研究生命起源"; 
分词效果为 : 研究 生命 起源,还算正确。 
初始化过程中,有很多文件读取的操作,其中有一个文件存储了我们需要的词库,另外还有一个词频的文件。 
主要的算法采用递归来实现,主要是考虑到实现的方便性;大家如果觉得实在受不了递归,可以考虑改写出一个非递归的版本。由于采用递归的方法实现,当一个串很长很长的时候,需要很长的时间才能完成分词,已经慢得快到不能忍的地步了。时间复杂度接近 $O(2^n)$,其中 $n$ 为字符串中汉字的个数。 

PS:代码在 Ubuntu 下的 Eclipse 中编译通过,这里一个中文字占三个字节;如果在 Windows 下的 Visual Studio 中,则是两个字节。每个汉字的字节数是我在程序开头通过测量一个测试字符串的长度来获取的,用这种稍微变通的方法让代码可以在不同平台上直接使用。 

代码见附件。里面有词库加上代码。

/*
 * buildTerms.cpp
 *
 *  Created on: Dec 21, 2011
 *      Author: lai_conglin
 */

# include <algorithm>
# include <cstddef>
# include <fstream>
# include <iostream>
# include <map>
# include <set>
# include <string>
# include <utility>
#include <vector>
using namespace std;

// Lexicon index: maps the leading ChineseLength bytes of a word (its first
// character) to the set of all lexicon words that start with those bytes.
map<string, set<string> > content;

// Word -> frequency table, filled by build_freq() from "freq.index".
map<string, int> freq;

// Number of bytes occupied by one Chinese character in the encoding the
// program was compiled with. Zero until it is assigned.
// NOTE(review): presumably assigned in initial() by measuring a test string
// (the accompanying article says so) -- confirm against the intact source.
size_t ChineseLength;
// NOTE(review): this span is damaged. initial() is opened below but never
// closed, and the lines that follow are member functions of a class whose
// header is not present. Text between "cout<" and "::iterator" on the
// commented line appears to have been lost (presumably by HTML tag
// stripping), taking with it the rest of initial(), the declaration of the
// Word class (members `value` and `frequence`) and the signature and outer
// condition of the function that assigns `frequence` (callers use
// Word::setValue). Restore from the original attachment before building.
void initial(){
	string testch="测试";
//	cout<::iterator it = freq.find(value);
			// Visible tail of the frequency lookup: a value missing from the
			// global `freq` table gets frequency 1, otherwise its stored count.
			if (it == freq.end()) {
				frequence = 1;
			} else {
				frequence = freq[value];
			}
		} else {
			// The condition guarding this branch is not visible; when it is
			// false the frequency also falls back to 1.
			frequence = 1;
		}
	}
	// Returns a copy of the word text.
	string getValue() {
		return value;
	}
	// Returns the length of the word text as reported by string::length().
	int getLength() {
		return value.length();
	}
	// Returns the frequency assigned when the value was set.
	int getFrequency() {
		return frequence;
	}
};
struct Chunk {
public:
	vector list;

	Chunk() {

	}
	/**
	 * add a word to the list
	 */
	void addWord(Word w) {
		list.push_back(w);
	}
	int getWordNumber() {
		return list.size();
	}
	double getVariance() {
		double avgLen = 0.0;
		int listSize = list.size();
		for (int i = 0; i < listSize; i++) {
			avgLen += list.at(i).getLength();
		}
		avgLen = 1.0 * avgLen / listSize;
		double variance = 1.0;
		for (int i = 0; i < listSize; i++) {
			double temp = (avgLen - list.at(i).getLength());
			variance += temp * temp;
		}
		return variance;
	}
	long getFreq() {
		long freqValue = 0;
		int listSize = list.size();
		for (int i = 0; i < listSize; i++) {
			freqValue += list.at(i).getFrequency();
		}
		return freqValue;
	}
	vector getVectorString() {
		vector res;
		int size = list.size();
		for (int i = 0; i < size; i++) {
			res.push_back(list.at(i).getValue());
		}
		return res;
	}
};
/**
 * Populates the global lexicon index `content` from
 * "Lexicon_full_words.txt" (whitespace-separated words).
 *
 * Each word is filed under its leading ChineseLength bytes, so
 * ChineseLength must already hold its final value. Progress messages go to
 * stdout; a warning goes to stderr if the file cannot be opened, in which
 * case `content` is left unchanged.
 */
void read_terms_from_Lexicon() {
	std::ifstream fin("Lexicon_full_words.txt");
	if (!fin) {
		std::cerr << "read_terms_from_Lexicon: cannot open Lexicon_full_words.txt"
				<< std::endl;
	}

	std::string word;
	// The first token is read and discarded, as the original code did.
	// NOTE(review): presumably a header or byte-order-mark token -- confirm
	// against the lexicon file.
	fin >> word;
	std::cout << "start read from Lexicon_full_words.txt" << std::endl;
	std::cout << "reading terms: -> " << std::endl;

	while (fin >> word) {
		// operator[] creates the bucket on first use, so no separate
		// find / copy / reassign round trip is needed.
		content[word.substr(0, ChineseLength)].insert(word);
	}
	std::cout << "finish read the lexicon." << std::endl;
	std::cout << "finish read the lexicon .lexicon size:" << content.size()
			<< std::endl;
}
void show_Lexicon() {
	map >::iterator lexiconIterator;
	int count = 0;
	for (lexiconIterator = content.begin(); lexiconIterator != content.end();
			lexiconIterator++) {
		string first = lexiconIterator->first;
		set second = lexiconIterator->second;
		cout << first << ":";
		set::iterator setIt;
		for (setIt = second.begin(); setIt != second.end(); setIt++) {
			cout << *setIt << " ";
		}
		cout << endl;
		count++;
		if (count == 10) {
			break;
		}
	}
	cout << "lexicon size:" << content.size() << endl;
}
/**
 * Writes the lexicon index `content` to "lexicon.index".
 *
 * Format, per key: a line "#:<key>", then a line holding every word of
 * that key, each followed by a single space. build_index() reads this
 * format back. A warning goes to stderr if the file cannot be opened.
 */
void write_index() {
	typedef std::map<std::string, std::set<std::string> > Lexicon;
	const Lexicon& lexicon = content;

	std::ofstream outputFile("lexicon.index");
	if (!outputFile) {
		std::cerr << "write_index: cannot open lexicon.index for writing"
				<< std::endl;
	}
	for (Lexicon::const_iterator entry = lexicon.begin();
			entry != lexicon.end(); ++entry) {
		outputFile << "#:" << entry->first << '\n';
		const std::set<std::string>& words = entry->second;
		for (std::set<std::string>::const_iterator w = words.begin();
				w != words.end(); ++w) {
			outputFile << *w << " ";
		}
		outputFile << '\n';
	}
	std::cout << "write index: lexicon size:" << lexicon.size() << std::endl;
}
/**
 * Writes the frequency table `freq` to "freq.index", one "<word> <count>"
 * record per line, which is the format build_freq() reads. A warning goes
 * to stderr if the file cannot be opened.
 */
void write_freq() {
	typedef std::map<std::string, int> FreqTable;
	const FreqTable& table = freq;

	std::ofstream outputFile("freq.index");
	if (!outputFile) {
		std::cerr << "write_freq: cannot open freq.index for writing"
				<< std::endl;
	}
	for (FreqTable::const_iterator rec = table.begin(); rec != table.end();
			++rec) {
		outputFile << rec->first << " " << rec->second << '\n';
	}
	std::cout << "write index: freq size:" << table.size() << std::endl;
}
/**
 * Loads the frequency table `freq` from "freq.index", which holds
 * whitespace-separated "<word> <count>" records.
 *
 * A record is kept when its count is greater than 1 or when the word is
 * exactly ChineseLength bytes long (a single character); other records are
 * dropped. Reading stops at the first record whose count cannot be parsed,
 * so a malformed line can no longer store a stale or indeterminate count.
 * A warning goes to stderr if the file cannot be opened.
 */
void build_freq() {
	std::ifstream fin("freq.index");
	if (!fin) {
		std::cerr << "build_freq: cannot open freq.index" << std::endl;
	}

	std::string word;
	int count = 0;
	// Both extractions must succeed before the record is used.
	while (fin >> word >> count) {
		if (count > 1 || word.length() == ChineseLength) {
			freq[word] = count;
		}
	}
	std::cout << "index: freq size:" << freq.size() << std::endl;
}

/**
 * Loads the lexicon index `content` from "lexicon.index" (the format
 * written by write_index()) and then loads the frequency table through
 * build_freq().
 *
 * A token starting with "#:" opens a new key; every following token up to
 * the next "#:" token is a word of that key. Words that appear before the
 * first key are discarded, and no entry is created when the file holds no
 * key at all. A warning goes to stderr if the file cannot be opened.
 */
void build_index() {
	std::ifstream fin("lexicon.index");
	if (!fin) {
		std::cerr << "build_index: cannot open lexicon.index" << std::endl;
	}

	std::string token;
	std::string key;
	std::set<std::string> words;
	while (fin >> token) {
		// Prefix test only; no need to search the whole token for "#:".
		if (token.compare(0, 2, "#:") == 0) {
			if (!key.empty()) {
				content[key] = words;
			}
			words.clear();
			key = token.substr(2);
		} else {
			words.insert(token);
		}
	}
	// Flush the last key, but never register an empty key (empty or
	// missing file).
	if (!key.empty()) {
		content[key] = words;
	}
	std::cout << "lexicon size:" << content.size() << std::endl;
	fin.close();
	build_freq();
}

vector chunklist;
int minChunkWordNumber;
void mmseg_recursion(string src, Chunk tempChunk) {
	set termslist;
	int i = 0;
	int len = src.length();
	//get the single character.
	string singleWordStr;
	Word tempWord;
	singleWordStr = src.substr(0, ChineseLength);
	tempWord.setValue(singleWordStr);

	//if the character is the end character , return
	if ((i + ChineseLength) >= len) {
		tempChunk.addWord(tempWord);
		chunklist.push_back(tempChunk);
		if (minChunkWordNumber > tempChunk.getWordNumber()) {
			minChunkWordNumber = tempChunk.getWordNumber();
		}
		return;
	} else {
		string tempStr;
		map >::iterator lexIt;
		lexIt = content.find(singleWordStr);
		//if the lexicon has no this word
		//them let it be a single term
		if (lexIt == content.end()) {
			tempChunk.addWord(tempWord);
			string remain = src.substr(ChineseLength);
			mmseg_recursion(remain, tempChunk);
			tempChunk.list.pop_back();
		} else {
			termslist = content[singleWordStr];
			set::iterator setIt;
			vector termsVector;
//			for (setIt = termslist.begin(); setIt != termslist.end(); setIt++) {
//				termsVector.push_back(*setIt);
//			}
//			sort(termsVector.begin(), termsVector.end());
//			int sizeVec = termsVector.size();
			for (setIt = termslist.begin(); setIt != termslist.end(); setIt++) {
				tempStr = *setIt;
				size_t foundit = src.find(tempStr, 0);
				if (foundit == 0 && tempStr != singleWordStr) {
					tempWord.setValue(tempStr);
					tempChunk.addWord(tempWord);
					if (tempChunk.getWordNumber() > (minChunkWordNumber)) {
						tempChunk.list.pop_back();
						return;
					}

					//if the term has all remain character of string
					//return
					if (tempStr.length() == src.length()) {
						chunklist.push_back(tempChunk);
						if (minChunkWordNumber > tempChunk.getWordNumber()) {
							minChunkWordNumber = tempChunk.getWordNumber();
						}
						return;
					}
					string remain = src.substr(tempStr.length());
					mmseg_recursion(remain, tempChunk);
					tempChunk.list.pop_back();
				}
			}
			//process the single character situation
			tempStr = singleWordStr;
			tempWord.setValue(tempStr);
			tempChunk.addWord(tempWord);
			if (tempChunk.getWordNumber() > (minChunkWordNumber)) {
				tempChunk.list.pop_back();
				return;
			}
			string remain = src.substr(tempStr.length());
			mmseg_recursion(remain, tempChunk);
			tempChunk.list.pop_back();

		}
	}
}

vector mmseg(string src) {
	vector res;
	chunklist.clear();
	minChunkWordNumber = 0x7ffffff0;
	Chunk tempChunk;
	vector indexInChunkList;
	int min = 0x7fffffff;
//	cout << min;
	mmseg_recursion(src, tempChunk);
	int chunkListSize = chunklist.size();
	if (chunkListSize == 1) {
		return chunklist.at(0).getVectorString();
	} else {
		for (int i = 0; i < chunkListSize; i++) {
			if (chunklist.at(i).getWordNumber() < min) {
				min = chunklist.at(i).getWordNumber();
				indexInChunkList.clear();
				indexInChunkList.push_back(i);
			} else if (chunklist.at(i).getWordNumber() == min) {
				indexInChunkList.push_back(i);
			}
		}
		//rule 1 to find the max average length chunk
		if (indexInChunkList.size() == 1) {
			return chunklist.at(indexInChunkList.at(0)).getVectorString();
		} else {
			//rule 2 find the least variance of chunk
			double minVariance = min * src.length() * src.length();
			vector tempIndex = indexInChunkList;
			indexInChunkList.clear();
			for (size_t index = 0; index < tempIndex.size(); index++) {
				int i = tempIndex.at(index);
				if (chunklist.at(i).getVariance() < minVariance) {
					minVariance = chunklist.at(i).getVariance();
					indexInChunkList.clear();
					indexInChunkList.push_back(i);
				} else if (chunklist.at(i).getVariance() == minVariance) {
					indexInChunkList.push_back(i);
				}
			}

			if (indexInChunkList.size() == 1) {
				return chunklist.at(indexInChunkList.at(0)).getVectorString();
			} else {
				//rule 3 have most frequency terms
				vector tempIndex = indexInChunkList;
				indexInChunkList.clear();
				long max = 0;
				int tempIndexSize = tempIndex.size();
				for (int index = 0; index < tempIndexSize; index++) {
					int i = tempIndex.at(index);
					if (chunklist.at(i).getFreq() > max) {
						max = chunklist.at(i).getFreq();
						indexInChunkList.clear();
						indexInChunkList.push_back(i);
					} else if (chunklist.at(i).getFreq() == max) {
						indexInChunkList.push_back(i);
					}
				}
				return chunklist.at(indexInChunkList.at(0)).getVectorString();
			}
		}
	}
}
/**
 * Prints a segmentation to stdout on one line: the prefix
 * "segment like this:" followed by every word, each followed by one space.
 *
 * @param src  the words to print, in order
 */
void showTermsSegment(const std::vector<std::string>& src) {
	std::cout << "segment like this:";
	for (std::vector<std::string>::const_iterator word = src.begin();
			word != src.end(); ++word) {
		std::cout << *word << " ";
	}
	std::cout << std::endl;
}
int main() {
	initial();
//	read_terms_from_Lexicon();
//	write_index();
	build_index();
//	show_Lexicon();
	string test = "中华人民共和国在1949年建立";
	test = "从此开始了新中国的伟大篇章";
	test = "研究生命起源";
	test = "北京天安门";
//	从此开始了新中国的伟大篇章中华人民共和国在一九四九年建立
	test = "主要是因为研究生命起源北京天安门";
	test = "从此开始了新中国的伟大篇章中华人民共和国在一九五五年建立主要是因为研究生命起源北京天安门";

//	test ="国际化企业中华人民共和国";
//	size_t found;
//	found = test.find("开始", 10);
//	bool flag = (found != string::npos);
//	cout << test.substr(test.length(), 3) << endl;
//	cout << test.substr(24, 4) << endl;
//	test = ",";
//	cout << test.length();

//	vector res = mmseg(test);
//	int min = 0x7fffffff;
//	cout << min;
	vector seg = mmseg(test);
//	seg.push_back("从");
//	sort(seg.begin(), seg.end());
	cout << endl;
	cout << "test string :" << test << endl;
	showTermsSegment(seg);

	test = "主要是因为研究生死";
	seg = mmseg(test);
	cout << endl;
	cout << "test string :" << test << endl;
	showTermsSegment(seg);

	return 0;
}




你可能感兴趣的:(C/C++,c++,iterator,string,recursion,character,include)