CleverCode发现scws分词的效率挺高,研究了一下自定义分词库。
安装详解:http://blog.csdn.net/clevercode/article/details/52204124。
2.1 php代码
# vim parseWord.php
<?php
/**
 * Segment a string into words with the SCWS extension.
 *
 * The input is converted from GBK to UTF-8 before it is handed to SCWS,
 * which is loaded with dict.utf8.xdb from the directory named by the
 * scws.default.fpath ini setting.
 *
 * @param string $str text to segment; it is converted from GBK
 * @return array one entry per non-empty value returned by get_result()
 */
function parse($str)
{
    $dictDir = ini_get('scws.default.fpath');

    $cws = scws_new();
    $cws->set_dict($dictDir.'/dict.utf8.xdb');

    // Custom dictionary: once mydict.xdb has been generated, uncomment the
    // add_dict() call below to load it in addition to the stock dictionary.
    $myDictPath = $dictDir.'/mydict.xdb';
    if (file_exists($myDictPath)) {
        //$cws->add_dict($myDictPath);
    }

    $cws->set_ignore(true);
    $cws->send_text(iconv("GBK", "UTF-8//IGNORE", $str));

    // Keep pulling result batches until get_result() hands back a falsy value.
    $resArr = array();
    for (;;) {
        $batch = $cws->get_result();
        if (!$batch) {
            break;
        }
        $resArr[] = $batch;
    }

    $cws->close();

    return $resArr;
}

/**
 * Demo entry point: segment a fixed sample phrase and print the input and
 * the print_r() dump of the segmentation (the dump is converted back to GBK).
 */
function start()
{
    $key = '里约奥运洪荒之力';
    $dump = print_r(parse($key), true);

    echo '<p>输入:'.$key.'</p>'."\r\n";
    echo '<p>分词:'.iconv("UTF-8", "GBK//IGNORE", $dump);
}

start();
?>
<p>输入:里约奥运洪荒之力</p> <p>分词:Array ( [0] => Array ( [0] => Array ( [word] => 里约 [off] => 0 [len] => 6 [idf] => 15.119999885559 [attr] => ns ) [1] => Array ( [word] => 奥运 [off] => 6 [len] => 6 [idf] => 4.8800001144409 [attr] => n ) [2] => Array ( [word] => 洪荒 [off] => 12 [len] => 6 [idf] => 8.0500001907349 [attr] => n ) [3] => Array ( [word] => 之 [off] => 18 [len] => 3 [idf] => 0 [attr] => r ) [4] => Array ( [word] => 力 [off] => 21 [len] => 3 [idf] => 0 [attr] => n ) ) )
3.1 生成词库
# vim genMyDict.php
<?php
/**
 * Builds the custom SCWS dictionary (mydict.xdb) from a list of words.
 *
 * The words are written to a temporary text file, compiled into an xdb file
 * with the scws-gen-dict tool, and the compiled file then replaces mydict.xdb.
 * All files live in the directory named by the scws.default.fpath ini setting.
 *
 * NOTE(review): word literals are converted from GBK to UTF-8 before being
 * written, so this script presumably expects to be saved as GBK - confirm.
 */
class MyDict
{/*{{{*/
    // Whether msgLog() echoes its messages
    private $isLogStdOut = true;

    // Directory read from scws.default.fpath
    private $fpath;

    // Temporary plain-text word list fed to scws-gen-dict
    private $myNewDictTxt;

    // Freshly compiled dictionary, before it replaces the live one
    private $myNewDictXdb;

    // Live custom dictionary
    private $myDictXdb;

    /**
     * Run the whole pipeline: clean up, write the word list, compile, swap in.
     *
     * Stops early (after logging) when the SCWS directory is not configured
     * or when there is nothing to compile, leaving any existing mydict.xdb
     * untouched.
     *
     * @return void
     */
    public function run()
    {/*{{{*/
        if (!$this->init()) {
            $this->msgLog('ERROR', "未配置scws.default.fpath,无法生成词典");
            return;
        }

        $this->deleteOldFile();

        $words = $this->getMyWordData();
        if (!$this->write2File($words, $this->myNewDictTxt)) {
            return;
        }

        $this->genMyDict();
    }/*}}}*/

    /**
     * Resolve all file paths from the scws.default.fpath ini setting.
     *
     * @return bool false when the setting is missing or empty (ini_get()
     *              returns false for unknown options, e.g. when the scws
     *              extension is not loaded)
     */
    private function init()
    {/*{{{*/
        $path = ini_get('scws.default.fpath');
        if ($path === false || $path === '') {
            return false;
        }

        $this->fpath = $path;
        $this->myNewDictTxt = $path.'/myNewDict.txt';
        $this->myNewDictXdb = $path.'/myNewDict.xdb';
        $this->myDictXdb = $path.'/mydict.xdb';

        return true;
    }/*}}}*/

    /**
     * Return the words that should go into the custom dictionary.
     *
     * @return array list of words
     */
    public function getMyWordData()
    {/*{{{*/
        $words = array('里约奥运','洪荒之力');
        return $words;
    }/*}}}*/

    /**
     * Remove leftovers of a previous run (temporary word list and compiled
     * file) so that this run starts from a clean state.
     *
     * @return void
     */
    public function deleteOldFile()
    {/*{{{*/
        $this->msgLog('INFO',"清除老文件");
        $this->removeFile($this->myNewDictTxt);
        $this->removeFile($this->myNewDictXdb);
    }/*}}}*/

    /**
     * Write the word list in the text format read by scws-gen-dict.
     *
     * Each non-blank word becomes one tab-separated line:
     * word, 10.00, 10.00, "n" (word / TF / IDF / attribute in the SCWS
     * dictionary text format). The lines are written with a single append.
     *
     * @param array  $words words to write; they are converted from GBK
     * @param string $path  target text file
     * @return bool true when at least one line was written successfully
     */
    private function write2File(array $words, $path)
    {/*{{{*/
        $lines = '';
        foreach ($words as $word) {
            $utf8Word = trim(mb_convert_encoding($word, 'utf-8', 'gbk'));
            if ($utf8Word === '') {
                continue;
            }

            $line = sprintf("%s\t%.2f\t%.2f\t%.2s\n", $utf8Word, 10.00, 10.00, "n");
            // The log line is converted back to GBK; the file itself stays UTF-8.
            $this->msgLog("INFO",mb_convert_encoding($line, 'gbk', 'utf-8'));
            $lines .= $line;
        }

        if ($lines === '') {
            $this->msgLog('WARNING', "没有可写入的词条");
            return false;
        }

        if (file_put_contents($path, $lines, FILE_APPEND) === false) {
            $this->msgLog('ERROR', "写入词典文本失败");
            return false;
        }

        return true;
    }/*}}}*/

    /**
     * Compile the text word list with scws-gen-dict and replace mydict.xdb
     * with the result.
     *
     * The live dictionary is only replaced when the tool exits with status 0
     * and actually produced the output file; temporary files are removed in
     * every case.
     *
     * @return bool true when mydict.xdb was replaced
     */
    private function genMyDict()
    {/*{{{*/
        $this->msgLog('INFO',"生成myNewDict.xdb");

        // Every argument is shell-escaped so paths with spaces or shell
        // metacharacters cannot break (or inject into) the command line.
        $cmd = sprintf(
            '%s -c utf8 -i %s -o %s',
            escapeshellarg($this->fpath.'/../bin/scws-gen-dict'),
            escapeshellarg($this->myNewDictTxt),
            escapeshellarg($this->myNewDictXdb)
        );
        $output = array();
        $status = 0;
        exec($cmd, $output, $status);

        $replaced = false;
        if ($status !== 0 || !is_file($this->myNewDictXdb)) {
            $this->msgLog('ERROR', "生成myNewDict.xdb失败, exit=".$status." ".implode(' ', $output));
        } else {
            $this->msgLog('INFO',"替换词典");
            $replaced = rename($this->myNewDictXdb, $this->myDictXdb);
            if (!$replaced) {
                $this->msgLog('ERROR', "替换词典失败");
            }
        }

        $this->msgLog('INFO',"清除临时文件");
        $this->removeFile($this->myNewDictTxt);
        $this->removeFile($this->myNewDictXdb);

        return $replaced;
    }/*}}}*/

    /**
     * Delete a file if it exists; a missing file is not an error
     * (same contract as `rm -f`).
     *
     * @param string $path file to delete
     * @return void
     */
    private function removeFile($path)
    {/*{{{*/
        if (is_file($path) && !unlink($path)) {
            $this->msgLog('WARNING', "无法删除文件: ".$path);
        }
    }/*}}}*/

    /**
     * Echo a log line of the form "<unix time>, [<level>]: <message>\r\n".
     * Nothing is printed when $isLogStdOut is false.
     *
     * @param string $level  severity: INFO/WARNING/ERROR
     * @param string $logStr message text
     * @access public
     * @return void
     */
    public function msgLog($level,$logStr)
    {/*{{{*/
        if($this->isLogStdOut)
        {
            $t = time();
            $logHdr = $t.", [".$level."]: ";
            $logStr = $logHdr.$logStr."\r\n";
            echo $logStr;
        }
    }/*}}}*/

}/*}}}*/

/**
 * Script entry point: build the custom dictionary.
 */
function start()
{
    $myDict = new MyDict();
    $myDict->run();
}

start();
?>
生成词典后的结果
去掉parseWord.php中 $cws->add_dict($myDictPath); 这一行前面的注释,再次执行 php parseWord.php。结果如下,“里约奥运”和“洪荒之力”都被当成了完整的词。
<p>输入:里约奥运洪荒之力</p> <p>分词:Array ( [0] => Array ( [0] => Array ( [word] => 里约奥运 [off] => 0 [len] => 12 [idf] => 10 [attr] => n ) [1] => Array ( [word] => 洪荒之力 [off] => 12 [len] => 12 [idf] => 10 [attr] => n ) ) )