CleverCode最近在研究sphinx使用rt实时索引,总结了一下php调用的过程,并且总结了一下rt分布式架构设计。
安装详解请查看:http://blog.csdn.net/clevercode/article/details/52204124。
vim /usr/local/sphinx2/etc/realtime.conf
index username
{
# 实时索引类型
type = rt
# 索引保存路径,平时都是保存在内存内,数据量超过内存量的时候会保存在文件内,这里随便存了下没放到data目录下
path =/usr/local/sphinx2/var/data/username
# 字符表, 以下为 'utf-8' 编码下的默认值
charset_table = 0..9, A..Z->a..z, _, a..z,U+410..U+42F->U+430..U+44F, U+430..U+44F
# 对于非字母型数据的切分长度(默认以字符和数字切分,设置为1表示按每个字符切分)
ngram_len = 1
ngram_chars = U+3000..U+2FA1F
# 全文检索字段声明,这里把实时索引的索引字段都声明出来
rt_field = name
rt_field = spell
rt_field = shortspell
# 其他属性字段,可以用来查询
rt_attr_uint = isvalid
rt_attr_timestamp = ctime
rt_attr_timestamp = utime
# 内存保存大小限制,超过这个就会保存到硬盘中
rt_mem_limit = 64M
}
indexer
{
max_iops= 40
max_iosize= 1048576
}
searchd
{
listen = 9312
listen = 9306:mysql41
log = /usr/local/sphinx2/var/log/searchd.log
query_log = /usr/local/sphinx2/var/log/query.log
max_children = 1024
pid_file = /usr/local/sphinx2/var/log/searchd.pid
query_log_format = sphinxql
read_timeout = 5
rt_flush_period = 172800
seamless_rotate = 1
# ondisk_dict_default = 1
workers = threads
mva_updates_pool = 1M
max_packet_size = 64M
max_filters = 256
binlog_path = /tmp
binlog_max_log_size = 1024M
read_buffer = 32M
# read_unhinted = 32K
max_batch_queries = 32
subtree_docs_cache = 64M
subtree_hits_cache = 64M
dist_threads = 24
thread_stack = 128K
client_timeout = 300
}
# pkill searchd
# /usr/local/sphinx2/bin/searchd --config /usr/local/sphinx2/etc/realtime.conf
5.1 SphinxRt类的封装。这个类是根据 http://www.sphinxsearch.org/sphinx-realtime-api 提供的代码做的简单改版。
_link = mysql_connect($host);
if(!$this->_link)
{
throw new Exception('sphinx 实时索引服务器连接失败!');
}
if($rt !='')
{
$this->rt = $this->_sql['rt'] = $rt;
}
}
catch (Exception $e)
{
$this->error = $e->getMessage();
}
}
/**
 +----------------------------------------------------------
 * Select the real-time index that later statements operate on.
 *
 * The name is kept both on $this->rt and in the pending statement
 * parts, where it is used as the table name.
 *
 * @access public
 * @param mixed $rt index name
 * @return $this the same instance, so calls can be chained
 +----------------------------------------------------------
 */
public function rt($rt)
{
    $this->rt = $rt;
    $this->_sql['rt'] = $rt;

    return $this;
}
/**
 +----------------------------------------------------------
 * Set the condition clause used by the next search().
 *
 * The text is placed verbatim after the index name, so the caller
 * must include the WHERE keyword itself, for example
 * "where match('keyword')". A bare "where 1" style condition must
 * not be used.
 *
 * @access public
 * @param mixed $where complete condition clause, keyword included
 * @return $this the same instance, so calls can be chained
 +----------------------------------------------------------
 */
public function where($where)
{
    $this->_sql['where'] = $where;

    return $this;
}
/**
 +----------------------------------------------------------
 * Set the row limit used by the next search().
 *
 * search() prefixes the value with the "limit" keyword, so pass
 * only the numbers, for example "0,10".
 *
 * @access public
 * @param mixed $limit limit expression without the keyword
 * @return $this the same instance, so calls can be chained
 +----------------------------------------------------------
 */
public function limit($limit)
{
    $this->_sql['limit'] = $limit;

    return $this;
}
/**
 +----------------------------------------------------------
 * Set the OPTION clause (ranker, weights, ...) for the next search().
 *
 * search() prefixes the value with the "OPTION" keyword, so pass
 * only the option list, for example "ranker=bm25,max_matches=3".
 *
 * @access public
 * @param mixed $option option list without the keyword
 * @return $this the same instance, so calls can be chained
 +----------------------------------------------------------
 */
public function option($option)
{
    $this->_sql['option'] = $option;

    // Return the instance like every other setter does; returning the
    // option string made ->option(...)->search() fail.
    return $this;
}
/**
 +----------------------------------------------------------
 * Set the column list selected by the next search().
 *
 * When nothing (or an empty value) is set, search() selects "*".
 *
 * @access public
 * @param mixed $field column list placed after SELECT
 * @return $this the same instance, so calls can be chained
 +----------------------------------------------------------
 */
public function field($field)
{
    $this->_sql['field'] = $field;

    return $this;
}
/**
 +----------------------------------------------------------
 * Set the ordering used by the next search().
 *
 * search() prefixes the value with "ORDER BY", so pass only the
 * expression, for example "gid desc".
 *
 * @access public
 * @param mixed $order ordering expression without the keyword
 * @return $this the same instance, so calls can be chained
 +----------------------------------------------------------
 */
public function order($order)
{
    $this->_sql['order'] = $order;

    return $this;
}
/**
 +----------------------------------------------------------
 * Set the grouping used by the next search().
 *
 * search() emits "GROUP BY $group" and, when $withGroup is not
 * empty, appends "WITHIN GROUP ORDER BY $withGroup".
 *
 * @access public
 * @param mixed $group     grouping expression without the keyword
 * @param mixed $withGroup optional in-group ordering expression;
 *                         only stored when $group is truthy
 * @return $this the same instance, so calls can be chained
 +----------------------------------------------------------
 */
public function group($group,$withGroup='')
{
    $this->_sql['group'] = $group;

    // The in-group ordering is meaningless without a grouping, so it is
    // only recorded together with one.
    if ($group) {
        $this->_sql['withGroup'] = $withGroup;
    }

    return $this;
}
/**
 +----------------------------------------------------------
 * Run a SELECT built from the parts set through rt(), field(),
 * where(), group(), order(), limit() and option().
 *
 * @access public
 * @example select * from rt where match('keyword') group by gid WITHIN GROUP ORDER BY @weight DESC
 * order by gid desc limit 0,1 option ranker=bm25,max_matches=3,field_weights=(title=10,content=3);
 * @return array|false list of rows (numeric keys) plus the output of
 *                     getMeta() under the 'meta' key, or false when
 *                     the query failed
 +----------------------------------------------------------
 */
public function search()
{
    // Start every part from '' so that parts which were never set do not
    // raise undefined index / undefined variable notices below.
    $parts = array(
        'rt'        => '',
        'field'     => '',
        'where'     => '',
        'group'     => '',
        'withGroup' => '',
        'order'     => '',
        'limit'     => '',
        'option'    => '',
    );
    if (is_array($this->_sql)) {
        $parts = array_merge($parts, $this->_sql);
    }

    // Ordering
    $orderSql = '';
    if ($parts['order'] != '') {
        $orderSql = ' ORDER BY '.$parts['order'];
    }

    // Grouping, with optional ordering inside each group
    $groupSql = '';
    if ($parts['group'] != '') {
        $groupSql = ' GROUP BY '.$parts['group'];
        if ($parts['withGroup'] != '') {
            $groupSql .= ' WITHIN GROUP ORDER BY '.$parts['withGroup'];
        }
    }

    // Extra options
    $optionSql = '';
    if ($parts['option'] != '') {
        $optionSql = ' OPTION '.$parts['option'];
    }

    // Row limit
    $limitSql = '';
    if ($parts['limit'] != '') {
        $limitSql = 'limit '.$parts['limit'];
    }

    // Columns default to everything; the condition is used verbatim.
    $field = ($parts['field'] == '') ? '*' : $parts['field'];
    $where = ($parts['where'] != '') ? $parts['where'] : '';

    $this->queryStr = sprintf("SELECT %s FROM %s %s %s %s %s %s",$field,$parts['rt'],$where,$groupSql,$orderSql,$limitSql,$optionSql);
    $rs = $this->query();
    if (!$rs) {
        return false;
    }

    $resArr = array();
    while ($row = mysql_fetch_assoc($rs)) {
        $resArr[] = $row;
    }
    // The statistics of this query travel with the rows under 'meta'.
    $resArr['meta'] = $this->getMeta();

    return $resArr;
}
/**
 +----------------------------------------------------------
 * Insert one document into the current index.
 *
 * Note: concurrent writers are not considered; two callers can pick
 * the same id through getLastId() and collide on the Sphinx side.
 *
 * @access public
 * @param mixed $data   map of column name => value to insert
 * @param int   $lastId document id to use; with the default 0 the id
 *                      is taken from getLastId()
 * @return mixed result of query(), or false when $data is empty
 +----------------------------------------------------------
 */
public function insert($data,$lastId=0)
{
    if (empty($data)) {
        $this->error = '插入数据不能为空';
        return false;
    }

    if ($lastId === 0) {
        $lastId = $this->getLastId();
    }

    $fields = $values = '';
    foreach ($data as $k => $v) {
        $fields .= ','.$k;
        // Escape the value so quotes or backslashes inside it cannot break
        // out of the string literal.
        $escaped = is_resource($this->_link)
            ? mysql_real_escape_string($v, $this->_link)
            : addslashes($v);
        $values .= ",'".$escaped."'";
    }

    $this->queryStr = "insert into ".$this->_sql['rt']."(id".$fields.") values ($lastId {$values})";

    return $this->query();
}
/**
 +----------------------------------------------------------
 * Insert several documents, numbering them consecutively from
 * getLastId().
 *
 * @access public
 * @param mixed   $datas list of rows, each a map of column => value;
 *                       in batch mode the column list is taken from
 *                       the first row only
 * @param boolean $asStr true: send one multi-row INSERT;
 *                       false: call insert() once per row
 * @return mixed batch mode: result of query(); row mode: true when
 *               every insert succeeded; false when $datas is empty
 +----------------------------------------------------------
 */
public function insertAll($datas,$asStr=true)
{
    if (empty($datas)) {
        $this->error = '无效数据!';
        return false;
    }

    $lastId = $this->getLastId();
    $i = 0;

    if (!$asStr) {
        // Row-by-row mode: every row needs its own id. Passing the same
        // $lastId each time made all rows after the first collide.
        $allInserted = true;
        foreach ($datas as $v) {
            if (!$this->insert($v, $lastId + $i)) {
                $allInserted = false;
            }
            $i++;
        }
        return $allInserted;
    }

    // Batch mode: build "(id,'v1','v2'),(id,'v1','v2'),..."
    $fields = 'id';
    $values = '';
    foreach ($datas as $v) {
        $values .= ',('.($i+$lastId);
        foreach ($v as $kk => $va) {
            // Column names are collected once, from the first row.
            if ($i == 0) {
                $fields .= ','.$kk;
            }
            // Escape the value so quotes inside it cannot break the statement.
            $escaped = is_resource($this->_link)
                ? mysql_real_escape_string($va, $this->_link)
                : addslashes($va);
            $values .= ",'".$escaped."'";
        }
        $i++;
        $values .= ')';
    }

    $values = ltrim($values,',');
    // The index name is passed as an argument so a '%' in it cannot be
    // misread as a format directive.
    $this->queryStr = sprintf("insert into %s(%s) values %s",$this->_sql['rt'],$fields,$values);

    return $this->query();
}
/**
 +----------------------------------------------------------
 * Write a document with REPLACE INTO under the given id.
 *
 * NOTE(review): REPLACE presumably rewrites the whole document, so
 * columns missing from $data would not keep their old values;
 * confirm against the Sphinx documentation.
 *
 * @access public
 * @param mixed $data   map of column name => value to store
 * @param int   $id     id of the document to write
 * @param bool  $insert when false and getById($id) finds nothing,
 *                      nothing is written and true is returned
 * @return mixed result of query(), true when skipped, or false when
 *               $data is empty or $id is not a positive number
 +----------------------------------------------------------
 */
public function update($data,$id,$insert=true)
{
    // Both the data and a positive id are required. The previous "||"
    // let an empty $data through, which emitted a REPLACE with only the id.
    if (empty($data) || !is_numeric($id) || $id <= 0) {
        $this->error = '无效更新数据!';
        return false;
    }

    // Record not found and the caller does not want it created.
    if ($insert === false && $this->getById($id) === false) {
        return true;
    }

    $fields = $values = '';
    foreach ($data as $k => $v) {
        $fields .= ','.$k;
        // Escape the value so quotes inside it cannot break the statement.
        $escaped = is_resource($this->_link)
            ? mysql_real_escape_string($v, $this->_link)
            : addslashes($v);
        $values .= ",'".$escaped."'";
    }

    // REPLACE also covers the case where the document does not exist yet.
    $this->queryStr = "replace into ".$this->_sql['rt']."(id".$fields.") values ($id{$values})";

    return $this->query();
}
/**
 +----------------------------------------------------------
 * Delete the documents matched by a condition, e.g. by an external id.
 *
 * The condition is stored through where() and stays set afterwards.
 * NOTE(review): only the rows returned by search() are deleted; if
 * no limit() is set the server's default row limit presumably
 * applies, so confirm large matches are fully removed.
 *
 * @access public
 * @param mixed $condition complete condition clause, WHERE included
 * @return bool true when the search succeeded, false otherwise
 +----------------------------------------------------------
 */
public function delBy($condition)
{
    $rs = $this->where($condition)->search();
    if (!$rs) {
        return false;
    }

    // search() stores the query statistics under 'meta' next to the rows;
    // that entry is not a document and must not be scanned for an id.
    unset($rs['meta']);

    $idArr = array();
    foreach ($rs as $v) {
        if (!empty($v['id'])) {
            $idArr[] = $v['id'];
        }
    }

    // Only call delete() when something matched; delete() given nothing
    // would otherwise issue "where id=0".
    if (!empty($idArr)) {
        $this->delete($idArr);
    }

    return true;
}
/**
 +----------------------------------------------------------
 * Delete one document, or several, by id.
 *
 * One DELETE statement is issued per id.
 *
 * @access public
 * @param mixed $id a single id or an array of ids
 * @return mixed array input: true when every statement succeeded
 *               (also for an empty array); single id: result of query()
 +----------------------------------------------------------
 */
public function delete($id)
{
    if (is_array($id)) {
        // An empty list means there is nothing to delete; it must not fall
        // through to the single-id branch, which would target id=0.
        $allDeleted = true;
        foreach ($id as $v) {
            $this->queryStr = sprintf("delete from %s where id=%d",$this->_sql['rt'],$v);
            // Run every statement even after a failure, then report overall.
            if (!$this->query()) {
                $allDeleted = false;
            }
        }
        return $allDeleted;
    }

    $this->queryStr = sprintf("delete from %s where id=%d",$this->_sql['rt'],$id);

    return $this->query();
}
/**
 +----------------------------------------------------------
 * Empty the index by deleting every id from 1 up to getLastId().
 *
 * One delete() call is made per id in that range, whether or not a
 * document with that id exists.
 *
 * @access public
 * @return bool always true
 +----------------------------------------------------------
 */
public function truncate()
{
    $upperBound = $this->getLastId();

    $current = 1;
    while ($current <= $upperBound) {
        $this->delete($current);
        $current++;
    }

    return true;
}
/**
 +----------------------------------------------------------
 * Count the documents in the current index.
 *
 * Runs a SELECT and reads 'total_found' from getMeta().
 *
 * @access public
 * @return mixed the 'total_found' value, or false when the query
 *               failed or no statistics are available
 +----------------------------------------------------------
 */
public function countAll()
{
    // Braces are required: "$this->_sql['rt']" without them interpolates
    // only $this->_sql and yields "Array['rt']" as the table name.
    $this->queryStr = "SELECT * FROM {$this->_sql['rt']} ";
    if (!$this->query()) {
        return false;
    }

    $meta = $this->getMeta();
    if ($meta && isset($meta['total_found'])) {
        return $meta['total_found'];
    }

    return false;
}
/**
 +----------------------------------------------------------
 * Return the next free document id (highest existing id + 1),
 * standing in for MySQL's auto_increment.
 *
 * @access public
 * @return int highest id + 1, or 1 when the index is empty or the
 *             query failed
 +----------------------------------------------------------
 */
public function getLastId()
{
    $this->queryStr = "select * from {$this->_sql['rt']} order by id desc limit 1";
    $rs = $this->query();

    // A failed query returns false; fetching from it would only raise a
    // warning, so fall back to the first id directly.
    if (!$rs) {
        return 1;
    }

    $row = mysql_fetch_assoc($rs);
    $lastId = 1;
    if ($row && isset($row['id'])) {
        $lastId = $row['id']+1;
    }

    // Never hand out a falsy id (e.g. 0).
    return $lastId ? $lastId : 1;
}
/**
 +----------------------------------------------------------
 * Fetch the statistics of the previous query via "show meta".
 *
 * @access protected
 * @return array map of Variable_name => Value; empty when the
 *               statement failed or returned no rows
 +----------------------------------------------------------
 */
protected function getMeta()
{
    $metaArr = array();
    $metaSql = "show meta";

    // Use this object's own connection: without an explicit link the
    // statement goes to whichever MySQL link was opened last.
    $meta = is_resource($this->_link)
        ? mysql_query($metaSql, $this->_link)
        : mysql_query($metaSql);
    if (!$meta) {
        return $metaArr;
    }

    while ($row = mysql_fetch_assoc($meta)) {
        $metaArr[$row['Variable_name']] = $row['Value'];
    }

    return $metaArr;
}
/**
 +----------------------------------------------------------
 * Fetch one document by id from the index named in $this->rt.
 *
 * @access public
 * @param int $id document id; must be greater than 0
 * @return array|false the row, or false when $id is not positive,
 *                     the query failed or no row matched
 +----------------------------------------------------------
 */
public function getById($id)
{
    if (!($id > 0)) {
        return false;
    }

    // The statement used to start with a stray single quote, which made
    // it invalid. %d also keeps non-numeric text out of the statement.
    $sql = sprintf("select * from %s where id=%d", $this->rt, $id);

    // Run on this object's own connection when there is one.
    $rs = is_resource($this->_link)
        ? mysql_query($sql, $this->_link)
        : mysql_query($sql);
    if (!$rs) {
        return false;
    }

    return mysql_fetch_assoc($rs);
}
/**
 +----------------------------------------------------------
 * List the column names of an index by reading one document.
 *
 * The index must hold at least one document: the names are taken
 * from the keys of a fetched row.
 *
 * @access public
 * @param mixed $rt index name; when empty, $this->rt is used
 * @return array|false column names (original positions kept as
 *                     keys), or false when no row could be read
 +----------------------------------------------------------
 */
public function _getField($rt='')
{
    $rt = $rt ? $rt : $this->rt;
    $this->queryStr = "select * from {$rt} limit 1";
    $res = $this->query();

    // A successful query on an empty index yields a result with no rows,
    // so the fetched row has to be checked as well as the result.
    $row = $res ? mysql_fetch_assoc($res) : false;
    if (!$row) {
        $this->error = '实时索引'.$rt.'没有任何记录,无法获取索引字段';
        return false;
    }

    $field = array_keys($row);
    // Drop the second column, which holds Sphinx's weight value.
    // NOTE(review): this assumes weight is always at position 1 - confirm
    // for the server version in use.
    unset($field[1]);

    return $field;
}
/**
 +----------------------------------------------------------
 * Send a statement over this object's connection.
 *
 * On failure the text of mysql_error() is stored in $this->error.
 * triggerDebug() is called after every statement, and additionally
 * beforehand when there is no connection.
 *
 * @access public
 * @param string $sql statement to run; when empty, $this->queryStr
 *                    is used
 * @return mixed whatever mysql_query() returned
 +----------------------------------------------------------
 */
public function query($sql = '')
{
    $statement = ($sql == '') ? $this->queryStr : $sql;

    if (!$this->_link) {
        $this->triggerDebug($this->debug);
    }

    $result = mysql_query($statement, $this->_link);
    if (!$result) {
        $this->error = mysql_error();
    }

    $this->triggerDebug($this->debug);

    return $result;
}
/**
 +----------------------------------------------------------
 * Return the error message currently stored on this object.
 *
 * @access public
 * @return string value of $this->error
 +----------------------------------------------------------
 */
public function getError()
{
    $message = $this->error;

    return $message;
}
/**
 +----------------------------------------------------------
 * Return the most recently built statement.
 *
 * @access public
 * @return string value of $this->queryStr
 +----------------------------------------------------------
 */
public function getLastSql()
{
    $statement = $this->queryStr;

    return $statement;
}
/**
 +----------------------------------------------------------
 * Print debugging details about the latest statement.
 *
 * When $debugMode is truthy, the calling file and line, the stored
 * statement and the stored error are reported; if an error is
 * stored the script is terminated with that report, otherwise the
 * report is echoed. When $debugMode is falsy nothing happens.
 *
 * @access public
 * @param mixed $debugMode whether debugging output is enabled
 * @return void
 +----------------------------------------------------------
 */
public function triggerDebug($debugMode=false)
{
    if (!$debugMode) {
        return ;
    }

    // Frame 0 of the backtrace describes the call to this method: the
    // place it was called from and the object it was called on.
    $trace = debug_backtrace();
    $frame = $trace[0];

    // The line breaks inside the literals are part of the output.
    $report = 'file:'.$frame['file'].'
line:'.$frame['line'].'
sql:'.$frame['object']->queryStr.'
error:'.$frame['object']->error.'';

    if ($frame['object']->error != '') {
        die($report);
    }
    echo ($report);

    return ;
}
}
vim modifySource.php
insert($data);
}
/**
 * Build one sample document and insert it through insert().
 *
 * Prints whatever insert() returned.
 */
function start()
{
    $data = array();
    $name = '张三';
    // NOTE(review): converting from GBK assumes this script file itself is
    // saved in GBK - confirm.
    $utf8Name = iconv("GBK","UTF-8//IGNORE",$name);
    $data['name'] = $utf8Name;
    $data['spell'] = 'zhangsan';
    $data['shortspell'] = 'zs';
    $data['isvalid'] = 1;
    // ctime/utime are timestamp attributes, which hold Unix timestamps.
    // A date string is cut down to its leading number (2016).
    $data['ctime'] = strtotime('2016-08-17 12:00:00');
    $data['utime'] = strtotime('2016-08-17 12:00:00');
    $ret = insert($data);
    print_r($ret);
}
start();
?>
5.2 查询数据
vim search.php
set_charset('utf-8');
//默认词库
$so->add_dict(ini_get('scws.default.fpath') . '/dict.utf8.xdb');
//自定义词库
// $so->add_dict('./dd.txt',SCWS_XDICT_TXT);
//默认规则
$so->set_rule(ini_get('scws.default.fpath') . '/rules.utf8.ini');
//设定分词返回结果时是否去除一些特殊的标点符号
$so->set_ignore(true);
//设定分词返回结果时是否复式分割,如“中国人”返回“中国+人+中国人”三个词。
// 按位异或的 1 | 2 | 4 | 8 分别表示: 短词 | 二元 | 主要单字 | 所有单字
//1,2,4,8 分别对应常量 SCWS_MULTI_SHORT SCWS_MULTI_DUALITY SCWS_MULTI_ZMAIN SCWS_MULTI_ZALL
$so->set_multi(false);
//设定是否将闲散文字自动以二字分词法聚合
$so->set_duality(false);
//设定搜索词
$utf8Key = iconv("GBK","UTF-8//IGNORE",$word);
$so->send_text($utf8Key);
$words_array = $so->get_result();
$so->close();
return $words_array;
}
/**
 * Run a query against the local searchd and return its result.
 *
 * @param string $words query text; may use extended query syntax
 *                      such as "(a)|(b)"
 * @return mixed whatever SphinxClient::Query() returned
 */
function search($words)
{
    $sc = new SphinxClient();
    $sc->SetServer('127.0.0.1',9312);
    // The caller joins words with "|" to mean OR. Only the extended
    // matching mode interprets that operator; SPH_MATCH_ALL ignores it
    // and requires every word to match.
    $sc->SetMatchMode(SPH_MATCH_EXTENDED);
    $sc->SetArrayResult(TRUE);
    $res = $sc->Query($words);
    return $res;
}
/**
 * Segment a sample keyword, search for the resulting words and print
 * the keyword, the query text and the raw search result.
 */
function start()
{
    $key = '张三';

    // Word segmentation
    $words_array = parseWord($key);
    if (!is_array($words_array) || count($words_array) < 1) {
        echo "words_array is empty!";
        return;
    }

    // Wrap every word in parentheses and join the groups with "|".
    $groups = array();
    foreach ($words_array as $v) {
        $groups[] = '('.$v['word'].')';
    }
    $words = implode('|', $groups);

    // Search
    $res = search($words);
    $str = print_r($res,true);

    // Output; the line breaks inside the literals are part of the output.
    echo '输入:'.$key.'
'."\r\n";
    echo '分词:'.iconv("UTF-8","GBK//IGNORE",$words).'
'."\r\n";
    echo iconv("UTF-8","GBK//IGNORE",$str);
}
start();
?>
输入:张三
分词:(张三)
Array
(
[error] =>
[warning] =>
[status] => 0
[fields] => Array
(
[0] => name
[1] => spell
[2] => shortspell
)
[attrs] => Array
(
[isvalid] => 1
[ctime] => 2
[utime] => 2
)
[matches] => Array
(
[0] => Array
(
[id] => 1
[weight] => 2
[attrs] => Array
(
[isvalid] => 1
[ctime] => 2016
[utime] => 2016
)
)
)
[total] => 1
[total_found] => 1
[time] => 0.001
[words] => Array
(
[张] => Array
(
[docs] => 1
[hits] => 1
)
[三] => Array
(
[docs] => 1
[hits] => 1
)
)
)
当username的索引足够大的时候,以及并发量特别高的时候,可以考虑以下架构设计。
6.1 更新数据源
当需要更新数据源(modifySource)的时候,把所有需要更新的机器都更新一遍,即192.168.100,192.168.101,192.168.102。
6.2 查询
当需要查询的时候(search),可以根据192.168.100,192.168.101,192.168.102处理能力的权重选择一台机器进行检索。
http://download.csdn.net/download/clevercode/9605832。