CleverCode发现scws分词的效率挺高,研究了一下自定义分词库。
安装详解:http://blog.csdn.net/clevercode/article/details/52204124
2.1 php代码
# vim parseWord.php
set_dict($dictPath);
//自定义分词库
$myDictPath = ini_get('scws.default.fpath').'/mydict.xdb';
if(file_exists($myDictPath))
{
//$cws->add_dict($myDictPath);
}
$cws->set_ignore(true);
$utf8Str = iconv("GBK","UTF-8//IGNORE",$str);
$cws->send_text($utf8Str);
$resArr = array();
while($tmp = $cws->get_result())
{
$resArr[] = $tmp;
}
$cws->close();
return $resArr;
}
/**
 * Demo entry point: segments one fixed sample phrase and prints the result.
 *
 * The phrase is handed to parse() (defined earlier in this script); the
 * print_r() dump of the returned array is converted from UTF-8 to GBK with
 * iconv() before it is echoed.
 */
function start()
{
    $sample = '里约奥运洪荒之力';

    // Human-readable dump of whatever parse() returned.
    $dump = print_r(parse($sample), true);

    // NOTE: the single-quoted literal below intentionally contains a raw line break.
    echo '输入:'.$sample.'
'."\r\n";
    echo '分词:'.iconv("UTF-8","GBK//IGNORE",$dump);
}
start();
?>
输入:里约奥运洪荒之力
分词:Array
(
[0] => Array
(
[0] => Array
(
[word] => 里约
[off] => 0
[len] => 6
[idf] => 15.119999885559
[attr] => ns
)
[1] => Array
(
[word] => 奥运
[off] => 6
[len] => 6
[idf] => 4.8800001144409
[attr] => n
)
[2] => Array
(
[word] => 洪荒
[off] => 12
[len] => 6
[idf] => 8.0500001907349
[attr] => n
)
[3] => Array
(
[word] => 之
[off] => 18
[len] => 3
[idf] => 0
[attr] => r
)
[4] => Array
(
[word] => 力
[off] => 21
[len] => 3
[idf] => 0
[attr] => n
)
)
)
3.1 生成词库
# vim genMyDict.php
init();
$this->deleteOldFile();
$words = $this->getMyWordData();
$this->write2File($words, $this->myNewDictTxt);
$this->genMyDict();
}/*}}}*/
/**
 * Resolves the working file paths from the scws.default.fpath ini setting.
 *
 * Sets:
 *  - myNewDictTxt: temporary text word list
 *  - myNewDictXdb: temporary compiled dictionary
 *  - myDictXdb:    final custom dictionary
 *
 * @throws \RuntimeException when scws.default.fpath is missing or empty
 * @return void
 */
private function init()
{/*{{{*/
    $path = ini_get('scws.default.fpath');

    // ini_get() yields false for an unknown option. Without this guard the
    // paths below would resolve to "/myNewDict.txt" etc. and the later
    // delete/move steps would operate on files in the filesystem root.
    if ($path === false || $path === '')
    {
        throw new \RuntimeException('scws.default.fpath is not configured; is the scws extension loaded?');
    }

    $this->myNewDictTxt = $path.'/myNewDict.txt';
    $this->myNewDictXdb = $path.'/myNewDict.xdb';
    $this->myDictXdb = $path.'/mydict.xdb';
}/*}}}*/
/**
 * Returns the custom words that should be added to the dictionary.
 *
 * The list is hard-coded here.
 *
 * @return array list of words
 */
function getMyWordData()
{/*{{{*/
    return array(
        '里约奥运',
        '洪荒之力',
    );
}/*}}}*/
/**
 * Removes the temporary word list and temporary compiled dictionary left
 * over from a previous run, if they exist.
 *
 * Uses unlink() instead of shelling out to "rm -f", so paths containing
 * spaces or shell metacharacters are handled correctly and no external
 * command is required.
 *
 * @return void
 */
function deleteOldFile()
{/*{{{*/
    $this->msgLog('INFO',"清除老文件");

    foreach (array($this->myNewDictTxt, $this->myNewDictXdb) as $staleFile)
    {
        // Like "rm -f": silently skip files that are not there.
        // is_link() also catches dangling symlinks, which is_file() misses.
        if (is_file($staleFile) || is_link($staleFile))
        {
            unlink($staleFile);
        }
    }
}/*}}}*/
/**
 * Writes the word list to a text file, one tab-separated line per word.
 *
 * Each word is converted from GBK to UTF-8 and trimmed; words that end up
 * empty are skipped. Every line has the form "word\t10.00\t10.00\tn\n"
 * (presumably TF, IDF and part-of-speech attribute in the SCWS text
 * dictionary format -- confirm against the SCWS docs). Each line is also
 * logged, converted back to GBK.
 *
 * All lines are collected and appended to the file in a single write; a
 * failed write is reported through msgLog() instead of being ignored.
 * Nothing is written when there are no non-empty words.
 *
 * @param array  $words words to write (GBK-encoded, as the conversion assumes)
 * @param string $path  destination file; content is appended
 * @return void
 */
private function write2File(array $words, $path)
{/*{{{*/
    $buffer = '';

    foreach($words as $word)
    {
        $utf8Word = trim(mb_convert_encoding($word, 'utf-8', 'gbk'));
        if($utf8Word === '')
        {
            continue;
        }

        $line = sprintf("%s\t%.2f\t%.2f\t%.2s\n", $utf8Word, 10.00, 10.00, "n");
        $this->msgLog("INFO",mb_convert_encoding($line, 'gbk', 'utf-8'));
        $buffer .= $line;
    }

    // Keep the original behaviour of not touching the file when there is nothing to add.
    if($buffer === '')
    {
        return;
    }

    if(file_put_contents($path, $buffer, FILE_APPEND) === false)
    {
        $this->msgLog('ERROR', "failed to write dictionary source file: ".$path);
    }
}/*}}}*/
/**
 * Compiles the text word list into an .xdb dictionary and installs it.
 *
 * Steps:
 *  1. Run scws-gen-dict (located at <scws.default.fpath>/../bin) to build
 *     the temporary .xdb file from the temporary text file.
 *  2. If the tool succeeded and produced a file, move it over the live
 *     custom dictionary; otherwise leave the live dictionary untouched.
 *  3. Remove the temporary files.
 *
 * All shell arguments are escaped, and the move/cleanup use rename() and
 * unlink() rather than external "mv"/"rm" commands.
 *
 * @return void
 */
private function genMyDict()
{/*{{{*/
    $path = ini_get('scws.default.fpath');

    $this->msgLog('INFO',"生成myNewDict.xdb");
    $command = sprintf(
        '%s -c utf8 -i %s -o %s',
        escapeshellarg($path.'/../bin/scws-gen-dict'),
        escapeshellarg($this->myNewDictTxt),
        escapeshellarg($this->myNewDictXdb)
    );
    $output = array();
    $exitCode = 0;
    exec($command, $output, $exitCode);

    // Only replace the live dictionary when the generator reported success
    // and actually produced a file (assumes scws-gen-dict exits with 0 on
    // success, the usual convention -- confirm against the tool).
    if ($exitCode === 0 && is_file($this->myNewDictXdb))
    {
        $this->msgLog('INFO',"替换词典");
        if (!rename($this->myNewDictXdb, $this->myDictXdb))
        {
            $this->msgLog('ERROR', "failed to move ".$this->myNewDictXdb." to ".$this->myDictXdb);
        }
    }
    else
    {
        $this->msgLog('ERROR', "scws-gen-dict failed (exit code ".$exitCode."); existing dictionary left unchanged");
    }

    $this->msgLog('INFO',"清除临时文件");
    foreach (array($this->myNewDictTxt, $this->myNewDictXdb) as $tmpFile)
    {
        // After a successful rename the .xdb is already gone; skip missing files.
        if (is_file($tmpFile) || is_link($tmpFile))
        {
            unlink($tmpFile);
        }
    }
}/*}}}*/
/**
 * Prints a log line to standard output.
 *
 * Nothing is printed unless $this->isLogStdOut is truthy. The emitted line
 * has the form "<unix timestamp>, [<level>]: <message>" followed by CRLF.
 *
 * @param string $level  severity label, e.g. INFO/WARNING/ERROR
 * @param string $logStr message text
 * @access public
 * @return void
 */
public function msgLog($level,$logStr)
{/*{{{*/
    // Logging to stdout is switched off: nothing to do.
    if(!$this->isLogStdOut)
    {
        return;
    }

    $now = time();
    echo "{$now}, [{$level}]: {$logStr}\r\n";
}/*}}}*/
}/*}}}*/
/**
 * Script entry point: creates a MyDict instance and runs it.
 */
function start()
{
    $dictBuilder = new MyDict();
    $dictBuilder->run();
}
start();
?>
生成词典后的结果
去掉 parseWord.php 第13行的注释(即启用 $cws->add_dict($myDictPath);),再次执行 php parseWord.php。结果如下,“里约奥运”和“洪荒之力”都被当成了完整的词。
输入:里约奥运洪荒之力
分词:Array
(
[0] => Array
(
[0] => Array
(
[word] => 里约奥运
[off] => 0
[len] => 12
[idf] => 10
[attr] => n
)
[1] => Array
(
[word] => 洪荒之力
[off] => 12
[len] => 12
[idf] => 10
[attr] => n
)
)
)