分词中用到的一些公式（参考 ICTCLAS）

// Formula for computing the smoothing coefficient

// Smoothing parameter
dSmoothingPara = 0.1
// Frequency of the current node; when the part of speech is known, the stored frequency is used directly
dCurFreqency
// A constant; its reciprocal is stored in dTemp just below
static int MAX_FREQUENCE = 2079997;
// Two linked Words frequency
// (dTemp itself is 1 / MAX_FREQUENCE, computed in floating point because of the cast)
dTemp = (double) 1 / MAX_FREQUENCE;
// Value looked up in DictBinary for the two-word string with handle 3.
// NOTE(review): the original note is unsure whether this is a word-pair frequency or a degree of association - confirm.
nTwoWordsFreq = DictBinary.GetFrequency(sTwoWords, 3);


// Frequency of the current word
			if (pCur.p.nPOS < 0) {
				// Unknown word (negative nPOS): look the word up in DictCore
				// with handle 2 and use that frequency.
				dCurFreqency = DictCore.GetFrequency(pCur.p.sWord, 2);
			} else {
				// Known word: use the value already stored on the node.
				dCurFreqency = pCur.p.value;
			}
			
			
			/**
		 * Returns the frequency recorded for a word under a given handle.
		 * <p>
		 * The word is first passed through {@code PreProcessing}, which yields
		 * the index-table position and the search key. The entry is then looked
		 * up with {@code FindInOriginalTable}; if that fails,
		 * {@code FindInModifyTable} is tried.
		 * 
		 * @param sWord
		 *            the word to look up
		 * @param nHandle
		 *            the handle to match (the part of speech, according to the
		 *            original note)
		 * @return the stored frequency, or 0 when pre-processing fails or
		 *         neither lookup finds an entry
		 */
		public int GetFrequency(char[] sWord, int nHandle) {
			char[] sWordFind = new char[WORD_MAXLENGTH - 2];
			Pint pnPos = new Pint();
			if (!PreProcessing(sWord, pnPos, sWordFind)) {
				return 0;
			}
			int nPos = pnPos.value;

			// First choice: the entry found by FindInOriginalTable.
			Pint pnIndex = new Pint();
			if (FindInOriginalTable(nPos, sWordFind, nHandle, pnIndex)) {
				return m_IndexTable[nPos].pWordItemHead[pnIndex.value].p.nFrequency;
			}

			// Fallback: the entry found by FindInModifyTable, returned through
			// the pre-allocated chain holder.
			PPWORD_CHAIN ppFound = new PPWORD_CHAIN(new PWORD_CHAIN(
					new WORD_CHAIN()));
			if (FindInModifyTable(nPos, sWordFind, nHandle, ppFound)) {
				return ppFound.p.p.data.nFrequency;
			}
			return 0;
		}

dValue = -Math
						.log(dSmoothingPara * (1 + dCurFreqency) / (MAX_FREQUENCE + 80000)+ (1 - dSmoothingPara)* ((1 - dTemp) * nTwoWordsFreq/ (1 + dCurFreqency) + dTemp));

你可能感兴趣的:(Tcl)