scws自定义分词库

CleverCode发现scws分词的效率挺高,研究了一下自定义分词库。

1  安装scws

安装详解请参考:http://blog.csdn.net/clevercode/article/details/52204124


2 没有添加自定义分词库

2.1 php代码

# vim parseWord.php

// NOTE(review): the beginning of this function appears to be missing from this
// listing (there is no `function` header and `set_dict` below has no receiver).
// `$cws`, `$str` and `$dictPath` are used but never defined in the visible lines;
// restore them from the original parseWord.php before running.
set_dict($dictPath);

    // Custom dictionary: mydict.xdb inside the directory named by the scws.default.fpath ini setting.
	$myDictPath = ini_get('scws.default.fpath').'/mydict.xdb';
	if(file_exists($myDictPath))
	{
	    // Disabled for the baseline run; uncomment the next line to load the custom dictionary.
	    //$cws->add_dict($myDictPath);
	}
	$cws->set_ignore(true);

	// Convert the input from GBK to UTF-8 (unconvertible bytes are dropped) before sending it.
	$utf8Str = iconv("GBK","UTF-8//IGNORE",$str);
	$cws->send_text($utf8Str);
	$resArr = array();
	// Collect every batch returned by get_result() until it yields a falsy value.
	while($tmp = $cws->get_result())
	{
		$resArr[] = $tmp;
	}
	$cws->close();
	
	return $resArr;
}

/**
 * Demo entry point: segment a fixed sample phrase and print the input and the result.
 *
 * The sample phrase is passed to parse(); the returned structure is rendered with
 * print_r() and converted from UTF-8 to GBK just before it is echoed.
 */
function start()
{
    $phrase = '里约奥运洪荒之力';

    // Render the nested result as text (second argument true = return instead of print).
    $dump = print_r(parse($phrase), true);

    echo '

输入:', $phrase, '

', "\r\n";

    // The conversion stays inside this echo so it runs after the first echo, as before.
    echo '

分词:' . iconv("UTF-8", "GBK//IGNORE", $dump);
}

start();
?>



2.2 对【里约奥运洪荒之力】分词的结果

输入:里约奥运洪荒之力

分词:Array ( [0] => Array ( [0] => Array ( [word] => 里约 [off] => 0 [len] => 6 [idf] => 15.119999885559 [attr] => ns ) [1] => Array ( [word] => 奥运 [off] => 6 [len] => 6 [idf] => 4.8800001144409 [attr] => n ) [2] => Array ( [word] => 洪荒 [off] => 12 [len] => 6 [idf] => 8.0500001907349 [attr] => n ) [3] => Array ( [word] => 之 [off] => 18 [len] => 3 [idf] => 0 [attr] => r ) [4] => Array ( [word] => 力 [off] => 21 [len] => 3 [idf] => 0 [attr] => n ) ) )


3 添加自定义词库

3.1 生成词库

# vim genMyDict.php

// NOTE(review): the enclosing class declaration and this method's header appear to be
// missing from this listing; the bare `init();` below was presumably `$this->init();`
// — confirm against the original genMyDict.php.
init();

        // Remove the temporary files left over from an earlier run.
        $this->deleteOldFile();

        // Fetch the custom words to add.
        $words = $this->getMyWordData();

        // Write the words to the temporary dictionary text file.
        $this->write2File($words, $this->myNewDictTxt);

        // Build the XDB dictionary from that text file and install it.
        $this->genMyDict();
    }/*}}}*/

    /**
     * Resolve the working file paths from the SCWS dictionary directory.
     *
     * Reads the `scws.default.fpath` ini setting and derives from it the temporary
     * word-list file, the temporary XDB file and the final custom dictionary path.
     *
     * @return void
     * @throws RuntimeException when `scws.default.fpath` is missing or empty
     */
    private function init()
    {/*{{{*/
        $path = ini_get('scws.default.fpath');

        // ini_get() yields false for an unknown option and '' for an empty one. Without
        // this guard every path below would silently resolve to the filesystem root
        // (e.g. "/mydict.xdb"), and later steps would create and delete files there.
        if($path === false || $path === '')
        {
            throw new RuntimeException('scws.default.fpath is not configured; is the scws extension loaded?');
        }

        $this->myNewDictTxt = $path.'/myNewDict.txt';
        $this->myNewDictXdb = $path.'/myNewDict.xdb';
        $this->myDictXdb = $path.'/mydict.xdb';
    }/*}}}*/

    /**
     * Supply the custom words that should be added to the dictionary.
     *
     * @return array list of words (hard-coded sample data)
     */
    function getMyWordData()
    {/*{{{*/
        return array(
            '里约奥运',
            '洪荒之力',
        );
    }/*}}}*/

    /**
     * Delete the temporary word-list and XDB files a previous run may have left behind.
     *
     * Missing files are ignored, matching the former `rm -f` behaviour.
     *
     * @return void
     */
    function deleteOldFile()
    {/*{{{*/
        $this->msgLog('INFO',"清除老文件");

        foreach(array($this->myNewDictTxt, $this->myNewDictXdb) as $file)
        {
            // unlink() removes the file without going through a shell, so a path that
            // contains spaces or shell metacharacters can neither break nor alter a command.
            // is_link() also covers dangling symlinks, which is_file() reports as absent.
            if(is_file($file) || is_link($file))
            {
                unlink($file);
            }
        }
    }/*}}}*/

    /**
     * Append the given words to a dictionary source file, one entry per line.
     *
     * Each word is converted from GBK to UTF-8 and trimmed; words that are empty after
     * trimming are skipped. An entry consists of the word, two numeric columns fixed at
     * 10.00 and the attribute "n", separated by tabs. Every entry is also logged at INFO
     * level (converted back to GBK for display) before it is appended to the file.
     *
     * @param array  $words words to write, expected in GBK encoding
     * @param string $path  file to append to
     * @return void
     */
    private function write2File(array $words, $path)
    {/*{{{*/
        foreach($words as $rawWord)
        {
            $entry = trim(mb_convert_encoding($rawWord, 'utf-8', 'gbk'));
            if($entry == '')
            {
                // Nothing left after trimming: do not emit an empty dictionary line.
                continue;
            }

            $row = sprintf("%s\t%.2f\t%.2f\t%.2s\n", $entry, 10.00, 10.00, "n");
            $this->msgLog("INFO",mb_convert_encoding($row, 'gbk', 'utf-8'));
            file_put_contents($path, $row, FILE_APPEND);
        }
    }/*}}}*/

    /**
     * Compile the temporary word list into an XDB dictionary and install it.
     *
     * Runs `scws-gen-dict` (looked up in ../bin relative to the `scws.default.fpath`
     * directory) on the temporary text file, moves the generated file over the active
     * custom dictionary, and finally removes the temporary files. If generation fails,
     * the existing dictionary is left untouched and an ERROR line is logged.
     *
     * @return void
     */
    private function genMyDict()
    {/*{{{*/
        $path = ini_get('scws.default.fpath');

        $this->msgLog('INFO',"生成myNewDict.xdb");
        // Quote the executable and every path so that spaces or shell metacharacters in
        // the configured directory cannot break or alter the command.
        $cmd = sprintf(
            '%s -c utf8 -i %s -o %s',
            escapeshellarg($path.'/../bin/scws-gen-dict'),
            escapeshellarg($this->myNewDictTxt),
            escapeshellarg($this->myNewDictXdb)
        );
        $output = array();
        $status = 0;
        exec($cmd, $output, $status);

        // Replace the live dictionary only when the generator reported success and
        // actually produced a file; previously a failure went unnoticed.
        if($status === 0 && is_file($this->myNewDictXdb))
        {
            $this->msgLog('INFO',"替换词典");
            if(!rename($this->myNewDictXdb, $this->myDictXdb))
            {
                $this->msgLog('ERROR',"failed to move the new dictionary into place");
            }
        }
        else
        {
            $this->msgLog('ERROR',"scws-gen-dict failed (exit status ".$status."); existing dictionary kept");
        }

        $this->msgLog('INFO',"清除临时文件");
        foreach(array($this->myNewDictTxt, $this->myNewDictXdb) as $tmpFile)
        {
            // Missing files are fine here: the XDB file is normally gone after the move.
            if(is_file($tmpFile))
            {
                unlink($tmpFile);
            }
        }
    }/*}}}*/


    /**
     * Print a log line to standard output when stdout logging is enabled.
     *
     * The line has the form "<unix timestamp>, [<level>]: <message>" terminated by CRLF.
     * Nothing is printed unless $this->isLogStdOut is truthy.
     *
     * @param string $level  severity label: INFO/WARNING/ERROR
     * @param string $logStr message text
     * @access public
     * @return void
     */
    public function msgLog($level,$logStr)
    {/*{{{*/
        if(!$this->isLogStdOut)
        {
            return;
        }

        $prefix = time().", [".$level."]: ";
        echo $prefix.$logStr."\r\n";
    }/*}}}*/

}/*}}}*/

/**
 * Script entry point: create a MyDict instance and run the dictionary generation.
 */
function start()
{
    $generator = new MyDict();
    $generator->run();
}

start();
?>


生成词典后的结果

scws自定义分词库_第1张图片


3.2 添加自定义词典

去掉 parseWord.php 第13行的注释,启用 $cws->add_dict($myDictPath); 这一行,然后再次执行 php parseWord.php。结果如下,“里约奥运”和“洪荒之力”都被当成了完整的词。

输入:里约奥运洪荒之力

分词:Array ( [0] => Array ( [0] => Array ( [word] => 里约奥运 [off] => 0 [len] => 12 [idf] => 10 [attr] => n ) [1] => Array ( [word] => 洪荒之力 [off] => 12 [len] => 12 [idf] => 10 [attr] => n ) ) )




你可能感兴趣的:(Linux常用软件安装与配置)