coreseek是基于sphinx全文检索引擎的,与sphinx相比就是支持中文分词。
在搜索这块有挺不错的性能,亲测50多万的数据,用coreseek的搜索方案比原始mysql的like查询性能提高10倍左右。
一、安装(前提安装好LAMP)
1、下载解压coreseek
tar -zxvf coreseek-3.2.14.tar.gz
2、先安装mmseg中文分词
cd coreseek-3.2.14/mmesg-3.2.14
./configure --prefix=/usr/local/mmseg
(如果报错误error: cannot find input file: src/Makefile.in
运行
aclocal
libtoolize --force
automake --add-missing
autoconf
autoheader
make clean
再configure && make && make install )
3、安装coreseek
cd coreseek-3.2.14/csft-3.2.14/
./configure --prefix=/usr/local/coreseek --with-mysql=/usr/local/mysql --with-mmseg=/usr/local/mmseg --with-mmseg-includes=/usr/local/mmseg/include/mmseg/ --with-mmseg-libs=/usr/local/mmseg/lib/
make && make install
二、配置测试coreseek
4、配置coreseek
cd /usr/local/coreseek/etc
cp sphinx.conf.dist csft.conf
vi csft.conf
csft.conf的结构为
source src1{ } 主数据源
source src1throttled : src1{ } 增量数据源
index test1{ } 主数据索引
index test1stemmed : test1{ } 增量数据索引
index dist1{ } 分布式索引
indexer 索引器
searchd 服务进程
4.1 在主数据源source src1{}里
更改数据库主机,用户名,密码,sql_sock
更改
# sql_query_pre = SET NAMES utf8
# sql_query_pre = SET SESSION query_cache_type=OFF
为
# sql_query_pre = SET NAMES utf8
sql_query_pre = SET SESSION query_cache_type=OFF
更改
sql_query = \
SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content \
FROM documents
为
sql_query = select id,title,content from post;
更改
sql_attr_uint = group_id
为
#sql_attr_uint = group_id
更改
sql_attr_timestamp = date_added
为
#sql_attr_timestamp = date_added
更改
sql_query_info = SELECT * FROM document WHERE id=$id
为
sql_query_info = SELECT * FROM post WHERE id=$id
4.2 在增量数据源source src1throttled : src1{ }里
注释整个source src1throttled : src1{ },包括source src1throttled : src1{ }
4.3 在主数据索引index test1{ }里
更改
index test1{ }
为
index src1{ }
更改
source = test1
为
source = src1
更改
path = /usr/local/sphinx/var/data/test1
为
path = /usr/local/sphinx/var/data/src1
更改
stopwords = G:\data\stopwords.txt
为
#stopwords = G:\data\stopwords.txt
更改
wordforms = G:\data\wordforms.txt
为
#wordforms = G:\data\wordforms.txt
更改
exceptions = /data/exceptions.txt
为
#exceptions = /data/exceptions.txt
更改
charset_type = sbcs
为
charset_type = zh_cn.utf-8
charset_dictpath = /usr/local/mmseg/etc/
4.4 注释全部 index test1stemmed : test1{ } 增量数据索引
4.5 注释全部 index dist1{ } 分布式索引
4.6 在索引器indexer里
更改
mem_limit = 32M
为
mem_limit = 128M
4.7 在服务进程searchd里不需更改
或者参照
</pre><pre name="code" class="html"># # Sphinx configuration file sample # # WARNING! While this sample file mentions all available options, # it contains (very) short helper descriptions only. Please refer to # doc/sphinx.html for details. # ############################################################################# ## data source definition ############################################################################# source src1 { # data source type. mandatory, no default value # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc type = mysql ##################################################################### ## SQL settings (for 'mysql' and 'pgsql' types) ##################################################################### # some straightforward parameters for SQL source types sql_host = localhost sql_user = root sql_pass = 111111 sql_db = mraz sql_port = 3306 # optional, default is 3306 # UNIX socket name # optional, default is empty (reuse client library defaults) # usually '/var/lib/mysql/mysql.sock' on Linux # usually '/tmp/mysql.sock' on FreeBSD # # sql_sock = /tmp/mysql.sock # MySQL specific client connection flags # optional, default is 0 # # mysql_connect_flags = 32 # enable compression # MySQL specific SSL certificate settings # optional, defaults are empty # # mysql_ssl_cert = /etc/ssl/client-cert.pem # mysql_ssl_key = /etc/ssl/client-key.pem # mysql_ssl_ca = /etc/ssl/cacert.pem # MS SQL specific Windows authentication mode flag # MUST be in sync with charset_type index-level setting # optional, default is 0 # # mssql_winauth = 1 # use currently logged on user credentials # MS SQL specific Unicode indexing flag # optional, default is 0 (request SBCS data) # # mssql_unicode = 1 # request Unicode data from server # ODBC specific DSN (data source name) # mandatory for odbc source type, no default value # # odbc_dsn = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)}; # sql_query = SELECT id, data FROM documents.csv # pre-query, executed before the main fetch query # multi-value, optional, default is empty list of queries # # sql_query_pre = SET NAMES utf8 sql_query_pre = SET SESSION query_cache_type=OFF # main document fetch query # mandatory, integer document ID field MUST be the first selected column #sql_query = \ # SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content \ # FROM documents sql_query = select id,title,content from tbContent; # range query setup, query that must return min and max ID values # optional, default is empty # # sql_query will need to reference $start and $end boundaries # if using ranged query: # # sql_query = \ # SELECT doc.id, doc.id AS group, doc.title, doc.data \ # FROM documents doc \ # WHERE id>=$start AND id<=$end # # sql_query_range = SELECT MIN(id),MAX(id) FROM documents # range query step # optional, default is 1024 # # sql_range_step = 1000 # unsigned integer attribute declaration # multi-value (an arbitrary number of attributes is allowed), optional # optional bit size can be specified, default is 32 # # sql_attr_uint = author_id # sql_attr_uint = forum_id:9 # 9 bits for forum_id #sql_attr_uint = group_id # boolean attribute declaration # multi-value (an arbitrary number of attributes is allowed), optional # equivalent to sql_attr_uint with 1-bit size # # sql_attr_bool = is_deleted # bigint attribute declaration # multi-value (an arbitrary number of attributes is allowed), optional # declares a signed (unlike uint!) 64-bit attribute # # sql_attr_bigint = my_bigint_id # UNIX timestamp attribute declaration # multi-value (an arbitrary number of attributes is allowed), optional # similar to integer, but can also be used in date functions # # sql_attr_timestamp = posted_ts # sql_attr_timestamp = last_edited_ts #sql_attr_timestamp = date_added # string ordinal attribute declaration # multi-value (an arbitrary number of attributes is allowed), optional # sorts strings (bytewise), and stores their indexes in the sorted list # sorting by this attr is equivalent to sorting by the original strings # # sql_attr_str2ordinal = author_name # floating point attribute declaration # multi-value (an arbitrary number of attributes is allowed), optional # values are stored in single precision, 32-bit IEEE 754 format # # sql_attr_float = lat_radians # sql_attr_float = long_radians # multi-valued attribute (MVA) attribute declaration # multi-value (an arbitrary number of attributes is allowed), optional # MVA values are variable length lists of unsigned 32-bit integers # # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY] # ATTR-TYPE is 'uint' or 'timestamp' # SOURCE-TYPE is 'field', 'query', or 'ranged-query' # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range' # # sql_attr_multi = uint tag from query; SELECT id, tag FROM tags # sql_attr_multi = uint tag from ranged-query; \ # SELECT id, tag FROM tags WHERE id>=$start AND id<=$end; \ # SELECT MIN(id), MAX(id) FROM tags # post-query, executed on sql_query completion # optional, default is empty # # sql_query_post = # post-index-query, executed on successful indexing completion # optional, default is empty # $maxid expands to max document ID actually fetched from DB # # sql_query_post_index = REPLACE INTO counters ( id, val ) \ # VALUES ( 'max_indexed_id', $maxid ) # ranged query throttling, in milliseconds # optional, default is 0 which means no delay # enforces given delay before each query step sql_ranged_throttle = 0 # document info query, ONLY for CLI search (ie. testing and debugging) # optional, default is empty # must contain $id macro and must fetch the document by that id #sql_query_info = SELECT * FROM documents WHERE id=$id sql_query_info = SELECT * FROM tbContent WHERE id=$id # kill-list query, fetches the document IDs for kill-list # k-list will suppress matches from preceding indexes in the same query # optional, default is empty # # sql_query_killlist = SELECT id FROM documents WHERE edited>=@last_reindex # columns to unpack on indexer side when indexing # multi-value, optional, default is empty list # # unpack_zlib = zlib_column # unpack_mysqlcompress = compressed_column # unpack_mysqlcompress = compressed_column_2 # maximum unpacked length allowed in MySQL COMPRESS() unpacker # optional, default is 16M # # unpack_mysqlcompress_maxsize = 16M ##################################################################### ## xmlpipe settings ##################################################################### # type = xmlpipe # shell command to invoke xmlpipe stream producer # mandatory # # xmlpipe_command = cat /usr/local/coreseek/var/test.xml ##################################################################### ## xmlpipe2 settings ##################################################################### # type = xmlpipe2 # xmlpipe_command = cat /usr/local/coreseek/var/test2.xml # xmlpipe2 field declaration # multi-value, optional, default is empty # # xmlpipe_field = subject # xmlpipe_field = content # xmlpipe2 attribute declaration # multi-value, optional, default is empty # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX # # xmlpipe_attr_timestamp = published # xmlpipe_attr_uint = author_id # perform UTF-8 validation, and filter out incorrect codes # avoids XML parser choking on non-UTF-8 documents # optional, default is 0 # # xmlpipe_fixup_utf8 = 1 } # inherited source example # # all the parameters are copied from the parent source, # and may then be overridden in this source definition #source src1throttled : src1 #{ # sql_ranged_throttle = 100 #} ############################################################################# ## index definition ############################################################################# # local index example # # this is an index which is stored locally in the filesystem # # all indexing-time options (such as morphology and charsets) # are configured per local index index src1 { # document source(s) to index # multi-value, mandatory # document IDs must be globally unique across all sources source = src1 # index files path and file name, without extension # mandatory, path must be writable, extensions will be auto-appended path = /usr/local/coreseek/var/data/src1 # document attribute values (docinfo) storage mode # optional, default is 'extern' # known values are 'none', 'extern' and 'inline' docinfo = extern # memory locking for cached data (.spa and .spi), to prevent swapping # optional, default is 0 (do not mlock) # requires searchd to be run from root mlock = 0 # a list of morphology preprocessors to apply # optional, default is empty # # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru', # 'soundex', and 'metaphone'; additional preprocessors available from # libstemmer are 'libstemmer_XXX', where XXX is algorithm code # (see libstemmer_c/libstemmer/modules.txt) # # morphology = stem_en, stem_ru, soundex # morphology = libstemmer_german # morphology = libstemmer_sv morphology = none # minimum word length at which to enable stemming # optional, default is 1 (stem everything) # # min_stemming_len = 1 # stopword files list (space separated) # optional, default is empty # contents are plain text, charset_table and stemming are both applied # #stopwords = G:\data\stopwords.txt # wordforms file, in "mapfrom > mapto" plain text format # optional, default is empty # #wordforms = G:\data\wordforms.txt # tokenizing exceptions file # optional, default is empty # # plain text, case sensitive, space insensitive in map-from part # one "Map Several Words => ToASingleOne" entry per line # #exceptions = /data/exceptions.txt # minimum indexed word length # default is 1 (index everything) min_word_len = 1 # charset encoding type # optional, default is 'sbcs' # known types are 'sbcs' (Single Byte CharSet) and 'utf-8' charset_type = zh_cn.utf-8 charset_dictpath = /usr/local/mmseg/etc/ # charset definition and case folding rules "table" # optional, default value depends on charset_type # # defaults are configured to include English and Russian characters only # you need to change the table to include additional ones # this behavior MAY change in future versions # # 'sbcs' default value is # charset_table = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF # # 'utf-8' default value is # charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F # ignored characters list # optional, default value is empty # # ignore_chars = U+00AD # minimum word prefix length to index # optional, default is 0 (do not index prefixes) # # min_prefix_len = 0 # minimum word infix length to index # optional, default is 0 (do not index infixes) # # min_infix_len = 0 # list of fields to limit prefix/infix indexing to # optional, default value is empty (index all fields in prefix/infix mode) # # prefix_fields = filename # infix_fields = url, domain # enable star-syntax (wildcards) when searching prefix/infix indexes # known values are 0 and 1 # optional, default is 0 (do not use wildcard syntax) # # enable_star = 1 # n-gram length to index, for CJK indexing # only supports 0 and 1 for now, other lengths to be implemented # optional, default is 0 (disable n-grams) # # ngram_len = 1 # n-gram characters list, for CJK indexing # optional, default is empty # # ngram_chars = U+3000..U+2FA1F # phrase boundary characters list # optional, default is empty # # phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis # phrase boundary word position increment # optional, default is 0 # # phrase_boundary_step = 100 # whether to strip HTML tags from incoming documents # known values are 0 (do not strip) and 1 (do strip) # optional, default is 0 html_strip = 0 # what HTML attributes to index if stripping HTML # optional, default is empty (do not index anything) # # html_index_attrs = img=alt,title; a=title; # what HTML elements contents to strip # optional, default is empty (do not strip element contents) # # html_remove_elements = style, script # whether to preopen index data files on startup # optional, default is 0 (do not preopen), searchd-only # # preopen = 1 # whether to keep dictionary (.spi) on disk, or cache it in RAM # optional, default is 0 (cache in RAM), searchd-only # # ondisk_dict = 1 # whether to enable in-place inversion (2x less disk, 90-95% speed) # optional, default is 0 (use separate temporary files), indexer-only # # inplace_enable = 1 # in-place fine-tuning options # optional, defaults are listed below # # inplace_hit_gap = 0 # preallocated hitlist gap size # inplace_docinfo_gap = 0 # preallocated docinfo gap size # inplace_reloc_factor = 0.1 # relocation buffer size within arena # inplace_write_factor = 0.1 # write buffer size within arena # whether to index original keywords along with stemmed versions # enables "=exactform" operator to work # optional, default is 0 # # index_exact_words = 1 # position increment on overshort (less that min_word_len) words # optional, allowed values are 0 and 1, default is 1 # # overshort_step = 1 # position increment on stopword # optional, allowed values are 0 and 1, default is 1 # # stopword_step = 1 } # inherited index example # # all the parameters are copied from the parent index, # and may then be overridden in this index definition #index test1stemmed : test1 #{ # path = /usr/local/coreseek/var/data/test1stemmed # morphology = stem_en #} # distributed index example # # this is a virtual index which can NOT be directly indexed, # and only contains references to other local and/or remote indexes #index dist1 #{ # 'distributed' index type MUST be specified # type = distributed # local index to be searched # there can be many local indexes configured # local = test1 # local = test1stemmed # remote agent # multiple remote agents may be specified # syntax for TCP connections is 'hostname:port:index1,[index2[,...]]' # syntax for local UNIX connections is '/path/to/socket:index1,[index2[,...]]' # agent = localhost:9313:remote1 # agent = localhost:9314:remote2,remote3 # agent = /var/run/searchd.sock:remote4 # blackhole remote agent, for debugging/testing # network errors and search results will be ignored # # agent_blackhole = testbox:9312:testindex1,testindex2 # remote agent connection timeout, milliseconds # optional, default is 1000 ms, ie. 1 sec # agent_connect_timeout = 1000 # remote agent query timeout, milliseconds # optional, default is 3000 ms, ie. 3 sec # agent_query_timeout = 3000 #} ############################################################################# ## indexer settings ############################################################################# indexer { # memory limit, in bytes, kiloytes (16384K) or megabytes (256M) # optional, default is 32M, max is 2047M, recommended is 256M to 1024M mem_limit = 128M # maximum IO calls per second (for I/O throttling) # optional, default is 0 (unlimited) # # max_iops = 40 # maximum IO call size, bytes (for I/O throttling) # optional, default is 0 (unlimited) # # max_iosize = 1048576 # maximum xmlpipe2 field length, bytes # optional, default is 2M # # max_xmlpipe2_field = 4M # write buffer size, bytes # several (currently up to 4) buffers will be allocated # write buffers are allocated in addition to mem_limit # optional, default is 1M # # write_buffer = 1M } ############################################################################# ## searchd settings ############################################################################# searchd { # hostname, port, or hostname:port, or /unix/socket/path to listen on # multi-value, multiple listen points are allowed # optional, default is 0.0.0.0:9312 (listen on all interfaces, port 9312) # # listen = 127.0.0.1 # listen = 192.168.0.1:9312 # listen = 9312 # listen = /var/run/searchd.sock # log file, searchd run info is logged here # optional, default is 'searchd.log' log = /usr/local/coreseek/var/log/searchd.log # query log file, all search queries are logged here # optional, default is empty (do not log queries) query_log = /usr/local/coreseek/var/log/query.log # client read timeout, seconds # optional, default is 5 read_timeout = 5 # request timeout, seconds # optional, default is 5 minutes client_timeout = 300 # maximum amount of children to fork (concurrent searches to run) # optional, default is 0 (unlimited) max_children = 30 # PID file, searchd process ID file name # mandatory pid_file = /usr/local/coreseek/var/log/searchd.pid # max amount of matches the daemon ever keeps in RAM, per-index # WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL # default is 1000 (just like Google) max_matches = 999999 # seamless rotate, prevents rotate stalls if precaching huge datasets # optional, default is 1 seamless_rotate = 1 # whether to forcibly preopen all indexes on startup # optional, default is 0 (do not preopen) preopen_indexes = 0 # whether to unlink .old index copies on succesful rotation. # optional, default is 1 (do unlink) unlink_old = 1 # attribute updates periodic flush timeout, seconds # updates will be automatically dumped to disk this frequently # optional, default is 0 (disable periodic flush) # # attr_flush_period = 900 # instance-wide ondisk_dict defaults (per-index value take precedence) # optional, default is 0 (precache all dictionaries in RAM) # # ondisk_dict_default = 1 # MVA updates pool size # shared between all instances of searchd, disables attr flushes! # optional, default size is 1M mva_updates_pool = 1M # max allowed network packet size # limits both query packets from clients, and responses from agents # optional, default size is 8M max_packet_size = 8M # crash log path # searchd will (try to) log crashed query to 'crash_log_path.PID' file # optional, default is empty (do not create crash logs) # # crash_log_path = /usr/local/coreseek/var/log/crash # max allowed per-query filter count # optional, default is 256 max_filters = 256 # max allowed per-filter values count # optional, default is 4096 max_filter_values = 4096 # socket listen queue length # optional, default is 5 # # listen_backlog = 5 # per-keyword read buffer size # optional, default is 256K # # read_buffer = 256K # unhinted read size (currently used when reading hits) # optional, default is 32K # # read_unhinted = 32K } # --eof--
insert into tbContent(title,content) values('hello123','php旅游无限好');
insert into tbContent(title,content) values('hello123','我们和你们一起去旅游吧');
insert into tbContent(title,content) select title,content from tbContent;##翻倍新增数据,增至50多万
/usr/local/coreseek/bin/search -c /usr/local/coreseek/etc/csft.conf 欢迎
能搜索处理结果表示配置成功
5、在/usr/local/apache/htdocs下,新建test_with_mysql.php和testCoreseek_with_mysql.php,以便测试用coreseek的搜索方案比原始mysql的like查询性能提高多少
test_with_mysql.php内容如下
<?php $start_time = microtime(); header("content-type:text/html;charset=utf-8;"); $keyword = $_REQUEST['keyword']; $mysql_server_name='localhost'; $mysql_username='root'; $mysql_password='111111'; $mysql_database='mraz'; $conn=mysql_connect($mysql_server_name,$mysql_username,$mysql_password) or die("error connecting") ; mysql_query("set names utf8"); mysql_select_db($mysql_database); $sql ="select * from tbContent where title like '%".$keyword."%' or content like '%".$keyword."%' limit 1000"; //var_dump($sql); $result = mysql_query($sql,$conn); //var_dump($result); if($result){ $end_time = microtime(); var_dump( $end_time - $start_time ); } //while($row = mysql_fetch_array($result)){ // echo $row['title'] . "--" . $row['content']; // echo "<br />"; //} mysql_close($conn);
testCoreseek_with_mysql.php内容如下
<?php $start_time = microtime(); header("content-type:text/html;charset=utf-8;"); require_once("sphinxapi.php"); $sphinx = new SphinxClient(); $sphinx->SetServer("localhost",9312); $sphinx->SetMatchMode(SPH_MATCH_ANY); $keyword = $_REQUEST['keyword']; $result = $sphinx->query($keyword,"*"); $id_arr = array(); $id_str = ''; foreach($result['matches'] as $id => $ret){ $id_arr[] = $id; $id_str .= $id.","; } $id_str .= "0"; //var_dump($id_arr);//根据id找数据表对应内容 $mysql_server_name='localhost'; $mysql_username='root'; $mysql_password='111111'; $mysql_database='mraz'; $conn=mysql_connect($mysql_server_name,$mysql_username,$mysql_password) or die("error connecting") ; mysql_query("set names utf8"); mysql_select_db($mysql_database); $sql ="select * from tbContent where id in (".$id_str.")"; //var_dump($sql); $result = mysql_query($sql,$conn); //var_dump($result); if($result){ $end_time = microtime(); var_dump( $end_time - $start_time ); } //while($row = mysql_fetch_array($result)){ // echo $row['title'] . "--" . $row['content']; // echo "<br />"; //} mysql_close($conn);
<?php // // $Id: sphinxapi.php 2055 2009-11-06 23:09:58Z shodan $ // // // Copyright (c) 2001-2008, Andrew Aksyonoff. All rights reserved. // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License. You should have // received a copy of the GPL license along with this program; if you // did not, you can find it at http://www.gnu.org/ // ///////////////////////////////////////////////////////////////////////////// // PHP version of Sphinx searchd client (PHP API) ///////////////////////////////////////////////////////////////////////////// /// known searchd commands define ( "SEARCHD_COMMAND_SEARCH", 0 ); define ( "SEARCHD_COMMAND_EXCERPT", 1 ); define ( "SEARCHD_COMMAND_UPDATE", 2 ); define ( "SEARCHD_COMMAND_KEYWORDS",3 ); define ( "SEARCHD_COMMAND_PERSIST", 4 ); define ( "SEARCHD_COMMAND_STATUS", 5 ); define ( "SEARCHD_COMMAND_QUERY", 6 ); /// current client-side command implementation versions define ( "VER_COMMAND_SEARCH", 0x116 ); define ( "VER_COMMAND_EXCERPT", 0x100 ); define ( "VER_COMMAND_UPDATE", 0x102 ); define ( "VER_COMMAND_KEYWORDS", 0x100 ); define ( "VER_COMMAND_STATUS", 0x100 ); define ( "VER_COMMAND_QUERY", 0x100 ); /// known searchd status codes define ( "SEARCHD_OK", 0 ); define ( "SEARCHD_ERROR", 1 ); define ( "SEARCHD_RETRY", 2 ); define ( "SEARCHD_WARNING", 3 ); /// known match modes define ( "SPH_MATCH_ALL", 0 ); define ( "SPH_MATCH_ANY", 1 ); define ( "SPH_MATCH_PHRASE", 2 ); define ( "SPH_MATCH_BOOLEAN", 3 ); define ( "SPH_MATCH_EXTENDED", 4 ); define ( "SPH_MATCH_FULLSCAN", 5 ); define ( "SPH_MATCH_EXTENDED2", 6 ); // extended engine V2 (TEMPORARY, WILL BE REMOVED) /// known ranking modes (ext2 only) define ( "SPH_RANK_PROXIMITY_BM25", 0 ); ///< default mode, phrase proximity major factor and BM25 minor one define ( "SPH_RANK_BM25", 1 ); ///< statistical mode, BM25 ranking only (faster but worse quality) define ( "SPH_RANK_NONE", 2 ); ///< no ranking, all matches get a weight of 1 define ( "SPH_RANK_WORDCOUNT", 3 ); ///< simple word-count weighting, rank is a weighted sum of per-field keyword occurence counts define ( "SPH_RANK_PROXIMITY", 4 ); define ( "SPH_RANK_MATCHANY", 5 ); define ( "SPH_RANK_FIELDMASK", 6 ); /// known sort modes define ( "SPH_SORT_RELEVANCE", 0 ); define ( "SPH_SORT_ATTR_DESC", 1 ); define ( "SPH_SORT_ATTR_ASC", 2 ); define ( "SPH_SORT_TIME_SEGMENTS", 3 ); define ( "SPH_SORT_EXTENDED", 4 ); define ( "SPH_SORT_EXPR", 5 ); /// known filter types define ( "SPH_FILTER_VALUES", 0 ); define ( "SPH_FILTER_RANGE", 1 ); define ( "SPH_FILTER_FLOATRANGE", 2 ); /// known attribute types define ( "SPH_ATTR_INTEGER", 1 ); define ( "SPH_ATTR_TIMESTAMP", 2 ); define ( "SPH_ATTR_ORDINAL", 3 ); define ( "SPH_ATTR_BOOL", 4 ); define ( "SPH_ATTR_FLOAT", 5 ); define ( "SPH_ATTR_BIGINT", 6 ); define ( "SPH_ATTR_MULTI", 0x40000000 ); /// known grouping functions define ( "SPH_GROUPBY_DAY", 0 ); define ( "SPH_GROUPBY_WEEK", 1 ); define ( "SPH_GROUPBY_MONTH", 2 ); define ( "SPH_GROUPBY_YEAR", 3 ); define ( "SPH_GROUPBY_ATTR", 4 ); define ( "SPH_GROUPBY_ATTRPAIR", 5 ); // important properties of PHP's integers: // - always signed (one bit short of PHP_INT_SIZE) // - conversion from string to int is saturated // - float is double // - div converts arguments to floats // - mod converts arguments to ints // the packing code below works as follows: // - when we got an int, just pack it // if performance is a problem, this is the branch users should aim for // // - otherwise, we got a number in string form // this might be due to different reasons, but we assume that this is // because it didn't fit into PHP int // // - factor the string into high and low ints for packing // - if we have bcmath, then it is used // - if we don't, we have to do it manually (this is the fun part) // // - x64 branch does factoring using ints // - x32 (ab)uses floats, since we can't fit unsigned 32-bit number into an int // // unpacking routines are pretty much the same. // - return ints if we can // - otherwise format number into a string /// pack 64-bit signed function sphPackI64 ( $v ) { assert ( is_numeric($v) ); // x64 if ( PHP_INT_SIZE>=8 ) { $v = (int)$v; return pack ( "NN", $v>>32, $v&0xFFFFFFFF ); } // x32, int if ( is_int($v) ) return pack ( "NN", $v < 0 ? -1 : 0, $v ); // x32, bcmath if ( function_exists("bcmul") ) { if ( bccomp ( $v, 0 ) == -1 ) $v = bcadd ( "18446744073709551616", $v ); $h = bcdiv ( $v, "4294967296", 0 ); $l = bcmod ( $v, "4294967296" ); return pack ( "NN", (float)$h, (float)$l ); // conversion to float is intentional; int would lose 31st bit } // x32, no-bcmath $p = max(0, strlen($v) - 13); $lo = abs((float)substr($v, $p)); $hi = abs((float)substr($v, 0, $p)); $m = $lo + $hi*1316134912.0; // (10 ^ 13) % (1 << 32) = 1316134912 $q = floor($m/4294967296.0); $l = $m - ($q*4294967296.0); $h = $hi*2328.0 + $q; // (10 ^ 13) / (1 << 32) = 2328 if ( $v<0 ) { if ( $l==0 ) $h = 4294967296.0 - $h; else { $h = 4294967295.0 - $h; $l = 4294967296.0 - $l; } } return pack ( "NN", $h, $l ); } /// pack 64-bit unsigned function sphPackU64 ( $v ) { assert ( is_numeric($v) ); // x64 if ( PHP_INT_SIZE>=8 ) { assert ( $v>=0 ); // x64, int if ( is_int($v) ) return pack ( "NN", $v>>32, $v&0xFFFFFFFF ); // x64, bcmath if ( function_exists("bcmul") ) { $h = bcdiv ( $v, 4294967296, 0 ); $l = bcmod ( $v, 4294967296 ); return pack ( "NN", $h, $l ); } // x64, no-bcmath $p = max ( 0, strlen($v) - 13 ); $lo = (int)substr ( $v, $p ); $hi = (int)substr ( $v, 0, $p ); $m = $lo + $hi*1316134912; $l = $m % 4294967296; $h = $hi*2328 + (int)($m/4294967296); return pack ( "NN", $h, $l ); } // x32, int if ( is_int($v) ) return pack ( "NN", 0, $v ); // x32, bcmath if ( function_exists("bcmul") ) { $h = bcdiv ( $v, "4294967296", 0 ); $l = bcmod ( $v, "4294967296" ); return pack ( "NN", (float)$h, (float)$l ); // conversion to float is intentional; int would lose 31st bit } // x32, no-bcmath $p = max(0, strlen($v) - 13); $lo = (float)substr($v, $p); $hi = (float)substr($v, 0, $p); $m = $lo + $hi*1316134912.0; $q = floor($m / 4294967296.0); $l = $m - ($q * 4294967296.0); $h = $hi*2328.0 + $q; return pack ( "NN", $h, $l ); } // unpack 64-bit unsigned function sphUnpackU64 ( $v ) { list ( $hi, $lo ) = array_values ( unpack ( "N*N*", $v ) ); if ( PHP_INT_SIZE>=8 ) { if ( $hi<0 ) $hi += (1<<32); // because php 5.2.2 to 5.2.5 is totally fucked up again if ( $lo<0 ) $lo += (1<<32); // x64, int if ( $hi<=2147483647 ) return ($hi<<32) + $lo; // x64, bcmath if ( function_exists("bcmul") ) return bcadd ( $lo, bcmul ( $hi, "4294967296" ) ); // x64, no-bcmath $C = 100000; $h = ((int)($hi / $C) << 32) + (int)($lo / $C); $l = (($hi % $C) << 32) + ($lo % $C); if ( $l>$C ) { $h += (int)($l / $C); $l = $l % $C; } if ( $h==0 ) return $l; return sprintf ( "%d%05d", $h, $l ); } // x32, int if ( $hi==0 ) { if ( $lo>0 ) return $lo; return sprintf ( "%u", $lo ); } $hi = sprintf ( "%u", $hi ); $lo = sprintf ( "%u", $lo ); // x32, bcmath if ( function_exists("bcmul") ) return bcadd ( $lo, bcmul ( $hi, "4294967296" ) ); // x32, no-bcmath $hi = (float)$hi; $lo = (float)$lo; $q = floor($hi/10000000.0); $r = $hi - $q*10000000.0; $m = $lo + $r*4967296.0; $mq = floor($m/10000000.0); $l = $m - $mq*10000000.0; $h = $q*4294967296.0 + $r*429.0 + $mq; $h = sprintf ( "%.0f", $h ); $l = sprintf ( "%07.0f", $l ); if ( $h=="0" ) return sprintf( "%.0f", (float)$l ); return $h . $l; } // unpack 64-bit signed function sphUnpackI64 ( $v ) { list ( $hi, $lo ) = array_values ( unpack ( "N*N*", $v ) ); // x64 if ( PHP_INT_SIZE>=8 ) { if ( $hi<0 ) $hi += (1<<32); // because php 5.2.2 to 5.2.5 is totally fucked up again if ( $lo<0 ) $lo += (1<<32); return ($hi<<32) + $lo; } // x32, int if ( $hi==0 ) { if ( $lo>0 ) return $lo; return sprintf ( "%u", $lo ); } // x32, int elseif ( $hi==-1 ) { if ( $lo<0 ) return $lo; return sprintf ( "%.0f", $lo - 4294967296.0 ); } $neg = ""; $c = 0; if ( $hi<0 ) { $hi = ~$hi; $lo = ~$lo; $c = 1; $neg = "-"; } $hi = sprintf ( "%u", $hi ); $lo = sprintf ( "%u", $lo ); // x32, bcmath if ( function_exists("bcmul") ) return $neg . bcadd ( bcadd ( $lo, bcmul ( $hi, "4294967296" ) ), $c ); // x32, no-bcmath $hi = (float)$hi; $lo = (float)$lo; $q = floor($hi/10000000.0); $r = $hi - $q*10000000.0; $m = $lo + $r*4967296.0; $mq = floor($m/10000000.0); $l = $m - $mq*10000000.0 + $c; $h = $q*4294967296.0 + $r*429.0 + $mq; if ( $l==10000000 ) { $l = 0; $h += 1; } $h = sprintf ( "%.0f", $h ); $l = sprintf ( "%07.0f", $l ); if ( $h=="0" ) return $neg . sprintf( "%.0f", (float)$l ); return $neg . $h . $l; } function sphFixUint ( $value ) { if ( PHP_INT_SIZE>=8 ) { // x64 route, workaround broken unpack() in 5.2.2+ if ( $value<0 ) $value += (1<<32); return $value; } else { // x32 route, workaround php signed/unsigned braindamage return sprintf ( "%u", $value ); } } /// sphinx searchd client class class SphinxClient { var $_host; ///< searchd host (default is "localhost") var $_port; ///< searchd port (default is 9312) var $_offset; ///< how many records to seek from result-set start (default is 0) var $_limit; ///< how many records to return from result-set starting at offset (default is 20) var $_mode; ///< query matching mode (default is SPH_MATCH_ALL) var $_weights; ///< per-field weights (default is 1 for all fields) var $_sort; ///< match sorting mode (default is SPH_SORT_RELEVANCE) var $_sortby; ///< attribute to sort by (defualt is "") var $_min_id; ///< min ID to match (default is 0, which means no limit) var $_max_id; ///< max ID to match (default is 0, which means no limit) var $_filters; ///< search filters var $_groupby; ///< group-by attribute name var $_groupfunc; ///< group-by function (to pre-process group-by attribute value with) var $_groupsort; ///< group-by sorting clause (to sort groups in result set with) var $_groupdistinct;///< group-by count-distinct attribute var $_maxmatches; ///< max matches to retrieve var $_cutoff; ///< cutoff to stop searching at (default is 0) var $_retrycount; ///< distributed retries count var $_retrydelay; ///< distributed retries delay var $_anchor; ///< geographical anchor point var $_indexweights; ///< per-index weights var $_ranker; ///< ranking mode (default is SPH_RANK_PROXIMITY_BM25) var $_maxquerytime; ///< max query time, milliseconds (default is 0, do not limit) var $_fieldweights; ///< per-field-name weights var $_overrides; ///< per-query attribute values overrides var $_select; ///< select-list (attributes or expressions, with optional aliases) var $_error; ///< last error message var $_warning; ///< last warning message var $_connerror; ///< connection error vs remote error flag var $_reqs; ///< requests array for multi-query var $_mbenc; ///< stored mbstring encoding var $_arrayresult; ///< whether $result["matches"] should be a hash or an array var $_timeout; ///< connect timeout ///////////////////////////////////////////////////////////////////////////// // common stuff ///////////////////////////////////////////////////////////////////////////// /// create a new client object and fill defaults function SphinxClient () { // per-client-object settings $this->_host = "localhost"; $this->_port = 9312; $this->_path = false; $this->_socket = false; // per-query settings $this->_offset = 0; $this->_limit = 999999; $this->_mode = SPH_MATCH_ALL; $this->_weights = array (); $this->_sort = SPH_SORT_RELEVANCE; $this->_sortby = ""; $this->_min_id = 0; $this->_max_id = 0; $this->_filters = array (); $this->_groupby = ""; $this->_groupfunc = SPH_GROUPBY_DAY; $this->_groupsort = "@group desc"; $this->_groupdistinct= ""; $this->_maxmatches = 1000; $this->_cutoff = 0; $this->_retrycount = 0; $this->_retrydelay = 0; $this->_anchor = array (); $this->_indexweights= array (); $this->_ranker = SPH_RANK_PROXIMITY_BM25; $this->_maxquerytime= 0; $this->_fieldweights= array(); $this->_overrides = array(); $this->_select = "*"; $this->_error = ""; // per-reply fields (for single-query case) $this->_warning = ""; $this->_connerror = false; $this->_reqs = array (); // requests storage (for multi-query case) $this->_mbenc = ""; $this->_arrayresult = false; $this->_timeout = 0; } function __destruct() { if ( $this->_socket !== false ) fclose ( $this->_socket ); } /// get last error message (string) function GetLastError () { return $this->_error; } /// get last warning message (string) function GetLastWarning () { return $this->_warning; } /// get last error flag (to tell network connection errors from searchd errors or broken responses) function IsConnectError() { return $this->_connerror; } /// set searchd host name (string) and port (integer) function SetServer ( $host, $port = 0 ) { assert ( is_string($host) ); if ( $host[0] == '/') { $this->_path = 'unix://' . $host; return; } if ( substr ( $host, 0, 7 )=="unix://" ) { $this->_path = $host; return; } assert ( is_int($port) ); $this->_host = $host; $this->_port = $port; $this->_path = ''; } /// set server connection timeout (0 to remove) function SetConnectTimeout ( $timeout ) { assert ( is_numeric($timeout) ); $this->_timeout = $timeout; } function _Send ( $handle, $data, $length ) { if ( feof($handle) || fwrite ( $handle, $data, $length ) !== $length ) { $this->_error = 'connection unexpectedly closed (timed out?)'; $this->_connerror = true; return false; } return true; } ///////////////////////////////////////////////////////////////////////////// /// enter mbstring workaround mode function _MBPush () { $this->_mbenc = ""; if ( ini_get ( "mbstring.func_overload" ) & 2 ) { $this->_mbenc = mb_internal_encoding(); mb_internal_encoding ( "latin1" ); } } /// leave mbstring workaround mode function _MBPop () { if ( $this->_mbenc ) mb_internal_encoding ( $this->_mbenc ); } /// connect to searchd server function _Connect () { if ( $this->_socket!==false ) { // we are in persistent connection mode, so we have a socket // however, need to check whether it's still alive if ( !@feof ( $this->_socket ) ) return $this->_socket; // force reopen $this->_socket = false; } $errno = 0; $errstr = ""; $this->_connerror = false; if ( $this->_path ) { $host = $this->_path; $port = 0; } else { $host = $this->_host; $port = $this->_port; } if ( $this->_timeout<=0 ) $fp = @fsockopen ( $host, $port, $errno, $errstr ); else $fp = @fsockopen ( $host, $port, $errno, $errstr, $this->_timeout ); if ( !$fp ) { if ( $this->_path ) $location = $this->_path; else $location = "{$this->_host}:{$this->_port}"; $errstr = trim ( $errstr ); $this->_error = "connection to $location failed (errno=$errno, msg=$errstr)"; $this->_connerror = true; return false; } // send my version // this is a subtle part. we must do it before (!) reading back from searchd. // because otherwise under some conditions (reported on FreeBSD for instance) // TCP stack could throttle write-write-read pattern because of Nagle. if ( !$this->_Send ( $fp, pack ( "N", 1 ), 4 ) ) { fclose ( $fp ); $this->_error = "failed to send client protocol version"; return false; } // check version list(,$v) = unpack ( "N*", fread ( $fp, 4 ) ); $v = (int)$v; if ( $v<1 ) { fclose ( $fp ); $this->_error = "expected searchd protocol version 1+, got version '$v'"; return false; } return $fp; } /// get and check response packet from searchd server function _GetResponse ( $fp, $client_ver ) { $response = ""; $len = 0; $header = fread ( $fp, 8 ); if ( strlen($header)==8 ) { list ( $status, $ver, $len ) = array_values ( unpack ( "n2a/Nb", $header ) ); $left = $len; while ( $left>0 && !feof($fp) ) { $chunk = fread ( $fp, $left ); if ( $chunk ) { $response .= $chunk; $left -= strlen($chunk); } } } if ( $this->_socket === false ) fclose ( $fp ); // check response $read = strlen ( $response ); if ( !$response || $read!=$len ) { $this->_error = $len ? "failed to read searchd response (status=$status, ver=$ver, len=$len, read=$read)" : "received zero-sized searchd response"; return false; } // check status if ( $status==SEARCHD_WARNING ) { list(,$wlen) = unpack ( "N*", substr ( $response, 0, 4 ) ); $this->_warning = substr ( $response, 4, $wlen ); return substr ( $response, 4+$wlen ); } if ( $status==SEARCHD_ERROR ) { $this->_error = "searchd error: " . substr ( $response, 4 ); return false; } if ( $status==SEARCHD_RETRY ) { $this->_error = "temporary searchd error: " . substr ( $response, 4 ); return false; } if ( $status!=SEARCHD_OK ) { $this->_error = "unknown status code '$status'"; return false; } // check version if ( $ver<$client_ver ) { $this->_warning = sprintf ( "searchd command v.%d.%d older than client's v.%d.%d, some options might not work", $ver>>8, $ver&0xff, $client_ver>>8, $client_ver&0xff ); } return $response; } ///////////////////////////////////////////////////////////////////////////// // searching ///////////////////////////////////////////////////////////////////////////// /// set offset and count into result set, /// and optionally set max-matches and cutoff limits function SetLimits ( $offset, $limit, $max=0, $cutoff=0 ) { assert ( is_int($offset) ); assert ( is_int($limit) ); assert ( $offset>=0 ); assert ( $limit>0 ); assert ( $max>=0 ); $this->_offset = $offset; $this->_limit = $limit; if ( $max>0 ) $this->_maxmatches = $max; if ( $cutoff>0 ) $this->_cutoff = $cutoff; } /// set maximum query time, in milliseconds, per-index /// integer, 0 means "do not limit" function SetMaxQueryTime ( $max ) { assert ( is_int($max) ); assert ( $max>=0 ); $this->_maxquerytime = $max; } /// set matching mode function SetMatchMode ( $mode ) { assert ( $mode==SPH_MATCH_ALL || $mode==SPH_MATCH_ANY || $mode==SPH_MATCH_PHRASE || $mode==SPH_MATCH_BOOLEAN || $mode==SPH_MATCH_EXTENDED || $mode==SPH_MATCH_FULLSCAN || $mode==SPH_MATCH_EXTENDED2 ); $this->_mode = $mode; } /// set ranking mode function SetRankingMode ( $ranker ) { assert ( $ranker==SPH_RANK_PROXIMITY_BM25 || $ranker==SPH_RANK_BM25 || $ranker==SPH_RANK_NONE || $ranker==SPH_RANK_WORDCOUNT || $ranker==SPH_RANK_PROXIMITY ); $this->_ranker = $ranker; } /// set matches sorting mode function SetSortMode ( $mode, $sortby="" ) { assert ( $mode==SPH_SORT_RELEVANCE || $mode==SPH_SORT_ATTR_DESC || $mode==SPH_SORT_ATTR_ASC || $mode==SPH_SORT_TIME_SEGMENTS || $mode==SPH_SORT_EXTENDED || $mode==SPH_SORT_EXPR ); assert ( is_string($sortby) ); assert ( $mode==SPH_SORT_RELEVANCE || strlen($sortby)>0 ); $this->_sort = $mode; $this->_sortby = $sortby; } /// bind per-field weights by order /// DEPRECATED; use SetFieldWeights() instead function SetWeights ( $weights ) { assert ( is_array($weights) ); foreach ( $weights as $weight ) assert ( is_int($weight) ); $this->_weights = $weights; } /// bind per-field weights by name function SetFieldWeights ( $weights ) { assert ( is_array($weights) ); foreach ( $weights as $name=>$weight ) { assert ( is_string($name) ); assert ( is_int($weight) ); } $this->_fieldweights = $weights; } /// bind per-index weights by name function SetIndexWeights ( $weights ) { assert ( is_array($weights) ); foreach ( $weights as $index=>$weight ) { assert ( is_string($index) ); assert ( is_int($weight) ); } $this->_indexweights = $weights; } /// set IDs range to match /// only match records if document ID is beetwen $min and $max (inclusive) function SetIDRange ( $min, $max ) { assert ( is_numeric($min) ); assert ( is_numeric($max) ); assert ( $min<=$max ); $this->_min_id = $min; $this->_max_id = $max; } /// set values set filter /// only match records where $attribute value is in given set function SetFilter ( $attribute, $values, $exclude=false ) { assert ( is_string($attribute) ); assert ( is_array($values) ); assert ( count($values) ); if ( is_array($values) && count($values) ) { foreach ( $values as $value ) assert ( is_numeric($value) ); $this->_filters[] = array ( "type"=>SPH_FILTER_VALUES, "attr"=>$attribute, "exclude"=>$exclude, "values"=>$values ); } } /// set range filter /// only match records if $attribute value is beetwen $min and $max (inclusive) function SetFilterRange ( $attribute, $min, $max, $exclude=false ) { assert ( is_string($attribute) ); assert ( is_numeric($min) ); assert ( is_numeric($max) ); assert ( $min<=$max ); $this->_filters[] = array ( "type"=>SPH_FILTER_RANGE, "attr"=>$attribute, "exclude"=>$exclude, "min"=>$min, "max"=>$max ); } /// set float range filter /// only match records if $attribute value is beetwen $min and $max (inclusive) function SetFilterFloatRange ( $attribute, $min, $max, $exclude=false ) { assert ( is_string($attribute) ); assert ( is_float($min) ); assert ( is_float($max) ); assert ( $min<=$max ); $this->_filters[] = array ( "type"=>SPH_FILTER_FLOATRANGE, "attr"=>$attribute, "exclude"=>$exclude, "min"=>$min, "max"=>$max ); } /// setup anchor point for geosphere distance calculations /// required to use @geodist in filters and sorting /// latitude and longitude must be in radians function SetGeoAnchor ( $attrlat, $attrlong, $lat, $long ) { assert ( is_string($attrlat) ); assert ( is_string($attrlong) ); assert ( is_float($lat) ); assert ( is_float($long) ); $this->_anchor = array ( "attrlat"=>$attrlat, "attrlong"=>$attrlong, "lat"=>$lat, "long"=>$long ); } /// set grouping attribute and function function SetGroupBy ( $attribute, $func, $groupsort="@group desc" ) { assert ( is_string($attribute) ); assert ( is_string($groupsort) ); assert ( $func==SPH_GROUPBY_DAY || $func==SPH_GROUPBY_WEEK || $func==SPH_GROUPBY_MONTH || $func==SPH_GROUPBY_YEAR || $func==SPH_GROUPBY_ATTR || $func==SPH_GROUPBY_ATTRPAIR ); $this->_groupby = $attribute; $this->_groupfunc = $func; $this->_groupsort = $groupsort; } /// set count-distinct attribute for group-by queries function SetGroupDistinct ( $attribute ) { assert ( is_string($attribute) ); $this->_groupdistinct = $attribute; } /// set distributed retries count and delay function SetRetries ( $count, $delay=0 ) { assert ( is_int($count) && $count>=0 ); assert ( is_int($delay) && $delay>=0 ); $this->_retrycount = $count; $this->_retrydelay = $delay; } /// set result set format (hash or array; hash by default) /// PHP specific; needed for group-by-MVA result sets that may contain duplicate IDs function SetArrayResult ( $arrayresult ) { assert ( is_bool($arrayresult) ); $this->_arrayresult = $arrayresult; } /// set attribute values override /// there can be only one override per attribute /// $values must be a hash that maps document IDs to attribute values function SetOverride ( $attrname, $attrtype, $values ) { assert ( is_string ( $attrname ) ); assert ( in_array ( $attrtype, array ( SPH_ATTR_INTEGER, SPH_ATTR_TIMESTAMP, SPH_ATTR_BOOL, SPH_ATTR_FLOAT, SPH_ATTR_BIGINT ) ) ); assert ( is_array ( $values ) ); $this->_overrides[$attrname] = array ( "attr"=>$attrname, "type"=>$attrtype, "values"=>$values ); } /// set select-list (attributes or expressions), SQL-like syntax function SetSelect ( $select ) { assert ( is_string ( $select ) ); $this->_select = $select; } ////////////////////////////////////////////////////////////////////////////// /// clear all filters (for multi-queries) function ResetFilters () { $this->_filters = array(); $this->_anchor = array(); } /// clear groupby settings (for multi-queries) function ResetGroupBy () { $this->_groupby = ""; $this->_groupfunc = SPH_GROUPBY_DAY; $this->_groupsort = "@group desc"; $this->_groupdistinct= ""; } /// clear all attribute value overrides (for multi-queries) function ResetOverrides () { $this->_overrides = array (); } ////////////////////////////////////////////////////////////////////////////// /// connect to searchd server, run given search query through given indexes, /// and return the search results function Query ( $query, $index="*", $comment="" ) { assert ( empty($this->_reqs) ); $this->AddQuery ( $query, $index, $comment ); $results = $this->RunQueries (); $this->_reqs = array (); // just in case it failed too early if ( !is_array($results) ) return false; // probably network error; error message should be already filled $this->_error = $results[0]["error"]; $this->_warning = $results[0]["warning"]; if ( $results[0]["status"]==SEARCHD_ERROR ) return false; else return $results[0]; } /// helper to pack floats in network byte order function _PackFloat ( $f ) { $t1 = pack ( "f", $f ); // machine order list(,$t2) = unpack ( "L*", $t1 ); // int in machine order return pack ( "N", $t2 ); } /// add query to multi-query batch /// returns index into results array from RunQueries() call function AddQuery ( $query, $index="*", $comment="" ) { // mbstring workaround $this->_MBPush (); // build request $req = pack ( "NNNNN", $this->_offset, $this->_limit, $this->_mode, $this->_ranker, $this->_sort ); // mode and limits $req .= pack ( "N", strlen($this->_sortby) ) . $this->_sortby; $req .= pack ( "N", strlen($query) ) . $query; // query itself $req .= pack ( "N", count($this->_weights) ); // weights foreach ( $this->_weights as $weight ) $req .= pack ( "N", (int)$weight ); $req .= pack ( "N", strlen($index) ) . $index; // indexes $req .= pack ( "N", 1 ); // id64 range marker $req .= sphPackU64 ( $this->_min_id ) . sphPackU64 ( $this->_max_id ); // id64 range // filters $req .= pack ( "N", count($this->_filters) ); foreach ( $this->_filters as $filter ) { $req .= pack ( "N", strlen($filter["attr"]) ) . $filter["attr"]; $req .= pack ( "N", $filter["type"] ); switch ( $filter["type"] ) { case SPH_FILTER_VALUES: $req .= pack ( "N", count($filter["values"]) ); foreach ( $filter["values"] as $value ) $req .= sphPackI64 ( $value ); break; case SPH_FILTER_RANGE: $req .= sphPackI64 ( $filter["min"] ) . sphPackI64 ( $filter["max"] ); break; case SPH_FILTER_FLOATRANGE: $req .= $this->_PackFloat ( $filter["min"] ) . $this->_PackFloat ( $filter["max"] ); break; default: assert ( 0 && "internal error: unhandled filter type" ); } $req .= pack ( "N", $filter["exclude"] ); } // group-by clause, max-matches count, group-sort clause, cutoff count $req .= pack ( "NN", $this->_groupfunc, strlen($this->_groupby) ) . $this->_groupby; $req .= pack ( "N", $this->_maxmatches ); $req .= pack ( "N", strlen($this->_groupsort) ) . $this->_groupsort; $req .= pack ( "NNN", $this->_cutoff, $this->_retrycount, $this->_retrydelay ); $req .= pack ( "N", strlen($this->_groupdistinct) ) . $this->_groupdistinct; // anchor point if ( empty($this->_anchor) ) { $req .= pack ( "N", 0 ); } else { $a =& $this->_anchor; $req .= pack ( "N", 1 ); $req .= pack ( "N", strlen($a["attrlat"]) ) . $a["attrlat"]; $req .= pack ( "N", strlen($a["attrlong"]) ) . $a["attrlong"]; $req .= $this->_PackFloat ( $a["lat"] ) . $this->_PackFloat ( $a["long"] ); } // per-index weights $req .= pack ( "N", count($this->_indexweights) ); foreach ( $this->_indexweights as $idx=>$weight ) $req .= pack ( "N", strlen($idx) ) . $idx . pack ( "N", $weight ); // max query time $req .= pack ( "N", $this->_maxquerytime ); // per-field weights $req .= pack ( "N", count($this->_fieldweights) ); foreach ( $this->_fieldweights as $field=>$weight ) $req .= pack ( "N", strlen($field) ) . $field . pack ( "N", $weight ); // comment $req .= pack ( "N", strlen($comment) ) . $comment; // attribute overrides $req .= pack ( "N", count($this->_overrides) ); foreach ( $this->_overrides as $key => $entry ) { $req .= pack ( "N", strlen($entry["attr"]) ) . $entry["attr"]; $req .= pack ( "NN", $entry["type"], count($entry["values"]) ); foreach ( $entry["values"] as $id=>$val ) { assert ( is_numeric($id) ); assert ( is_numeric($val) ); $req .= sphPackU64 ( $id ); switch ( $entry["type"] ) { case SPH_ATTR_FLOAT: $req .= $this->_PackFloat ( $val ); break; case SPH_ATTR_BIGINT: $req .= sphPackI64 ( $val ); break; default: $req .= pack ( "N", $val ); break; } } } // select-list $req .= pack ( "N", strlen($this->_select) ) . $this->_select; // mbstring workaround $this->_MBPop (); // store request to requests array $this->_reqs[] = $req; return count($this->_reqs)-1; } /// connect to searchd, run queries batch, and return an array of result sets function RunQueries () { if ( empty($this->_reqs) ) { $this->_error = "no queries defined, issue AddQuery() first"; return false; } // mbstring workaround $this->_MBPush (); if (!( $fp = $this->_Connect() )) { $this->_MBPop (); return false; } // send query, get response $nreqs = count($this->_reqs); $req = join ( "", $this->_reqs ); $len = 4+strlen($req); $req = pack ( "nnNN", SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, $len, $nreqs ) . $req; // add header if ( !( $this->_Send ( $fp, $req, $len+8 ) ) || !( $response = $this->_GetResponse ( $fp, VER_COMMAND_SEARCH ) ) ) { $this->_MBPop (); return false; } // query sent ok; we can reset reqs now $this->_reqs = array (); // parse and return response return $this->_ParseSearchResponse ( $response, $nreqs ); } /// parse and return search query (or queries) response function _ParseSearchResponse ( $response, $nreqs ) { $p = 0; // current position $max = strlen($response); // max position for checks, to protect against broken responses $results = array (); for ( $ires=0; $ires<$nreqs && $p<$max; $ires++ ) { $results[] = array(); $result =& $results[$ires]; $result["error"] = ""; $result["warning"] = ""; // extract status list(,$status) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; $result["status"] = $status; if ( $status!=SEARCHD_OK ) { list(,$len) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; $message = substr ( $response, $p, $len ); $p += $len; if ( $status==SEARCHD_WARNING ) { $result["warning"] = $message; } else { $result["error"] = $message; continue; } } // read schema $fields = array (); $attrs = array (); list(,$nfields) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; while ( $nfields-->0 && $p<$max ) { list(,$len) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; $fields[] = substr ( $response, $p, $len ); $p += $len; } $result["fields"] = $fields; list(,$nattrs) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; while ( $nattrs-->0 && $p<$max ) { list(,$len) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; $attr = substr ( $response, $p, $len ); $p += $len; list(,$type) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; $attrs[$attr] = $type; } $result["attrs"] = $attrs; // read match count list(,$count) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; list(,$id64) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; // read matches $idx = -1; while ( $count-->0 && $p<$max ) { // index into result array $idx++; // parse document id and weight if ( $id64 ) { $doc = sphUnpackU64 ( substr ( $response, $p, 8 ) ); $p += 8; list(,$weight) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; } else { list ( $doc, $weight ) = array_values ( unpack ( "N*N*", substr ( $response, $p, 8 ) ) ); $p += 8; $doc = sphFixUint($doc); } $weight = sprintf ( "%u", $weight ); // create match entry if ( $this->_arrayresult ) $result["matches"][$idx] = array ( "id"=>$doc, "weight"=>$weight ); else $result["matches"][$doc]["weight"] = $weight; // parse and create attributes $attrvals = array (); foreach ( $attrs as $attr=>$type ) { // handle 64bit ints if ( $type==SPH_ATTR_BIGINT ) { $attrvals[$attr] = sphUnpackI64 ( substr ( $response, $p, 8 ) ); $p += 8; continue; } // handle floats if ( $type==SPH_ATTR_FLOAT ) { list(,$uval) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; list(,$fval) = unpack ( "f*", pack ( "L", $uval ) ); $attrvals[$attr] = $fval; continue; } // handle everything else as unsigned ints list(,$val) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; if ( $type & SPH_ATTR_MULTI ) { $attrvals[$attr] = array (); $nvalues = $val; while ( $nvalues-->0 && $p<$max ) { list(,$val) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; $attrvals[$attr][] = sphFixUint($val); } } else { $attrvals[$attr] = sphFixUint($val); } } if ( $this->_arrayresult ) $result["matches"][$idx]["attrs"] = $attrvals; else $result["matches"][$doc]["attrs"] = $attrvals; } list ( $total, $total_found, $msecs, $words ) = array_values ( unpack ( "N*N*N*N*", substr ( $response, $p, 16 ) ) ); $result["total"] = sprintf ( "%u", $total ); $result["total_found"] = sprintf ( "%u", $total_found ); $result["time"] = sprintf ( "%.3f", $msecs/1000 ); $p += 16; while ( $words-->0 && $p<$max ) { list(,$len) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; $word = substr ( $response, $p, $len ); $p += $len; list ( $docs, $hits ) = array_values ( unpack ( "N*N*", substr ( $response, $p, 8 ) ) ); $p += 8; $result["words"][$word] = array ( "docs"=>sprintf ( "%u", $docs ), "hits"=>sprintf ( "%u", $hits ) ); } } $this->_MBPop (); return $results; } ///////////////////////////////////////////////////////////////////////////// // excerpts generation ///////////////////////////////////////////////////////////////////////////// /// connect to searchd server, and generate exceprts (snippets) /// of given documents for given query. returns false on failure, /// an array of snippets on success function BuildExcerpts ( $docs, $index, $words, $opts=array() ) { assert ( is_array($docs) ); assert ( is_string($index) ); assert ( is_string($words) ); assert ( is_array($opts) ); $this->_MBPush (); if (!( $fp = $this->_Connect() )) { $this->_MBPop(); return false; } ///////////////// // fixup options ///////////////// if ( !isset($opts["before_match"]) ) $opts["before_match"] = "<b>"; if ( !isset($opts["after_match"]) ) $opts["after_match"] = "</b>"; if ( !isset($opts["chunk_separator"]) ) $opts["chunk_separator"] = " ... "; if ( !isset($opts["limit"]) ) $opts["limit"] = 256; if ( !isset($opts["around"]) ) $opts["around"] = 5; if ( !isset($opts["exact_phrase"]) ) $opts["exact_phrase"] = false; if ( !isset($opts["single_passage"]) ) $opts["single_passage"] = false; if ( !isset($opts["use_boundaries"]) ) $opts["use_boundaries"] = false; if ( !isset($opts["weight_order"]) ) $opts["weight_order"] = false; ///////////////// // build request ///////////////// // v.1.0 req $flags = 1; // remove spaces if ( $opts["exact_phrase"] ) $flags |= 2; if ( $opts["single_passage"] ) $flags |= 4; if ( $opts["use_boundaries"] ) $flags |= 8; if ( $opts["weight_order"] ) $flags |= 16; $req = pack ( "NN", 0, $flags ); // mode=0, flags=$flags $req .= pack ( "N", strlen($index) ) . $index; // req index $req .= pack ( "N", strlen($words) ) . $words; // req words // options $req .= pack ( "N", strlen($opts["before_match"]) ) . $opts["before_match"]; $req .= pack ( "N", strlen($opts["after_match"]) ) . $opts["after_match"]; $req .= pack ( "N", strlen($opts["chunk_separator"]) ) . $opts["chunk_separator"]; $req .= pack ( "N", (int)$opts["limit"] ); $req .= pack ( "N", (int)$opts["around"] ); // documents $req .= pack ( "N", count($docs) ); foreach ( $docs as $doc ) { assert ( is_string($doc) ); $req .= pack ( "N", strlen($doc) ) . $doc; } //////////////////////////// // send query, get response //////////////////////////// $len = strlen($req); $req = pack ( "nnN", SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, $len ) . $req; // add header if ( !( $this->_Send ( $fp, $req, $len+8 ) ) || !( $response = $this->_GetResponse ( $fp, VER_COMMAND_EXCERPT ) ) ) { $this->_MBPop (); return false; } ////////////////// // parse response ////////////////// $pos = 0; $res = array (); $rlen = strlen($response); for ( $i=0; $i<count($docs); $i++ ) { list(,$len) = unpack ( "N*", substr ( $response, $pos, 4 ) ); $pos += 4; if ( $pos+$len > $rlen ) { $this->_error = "incomplete reply"; $this->_MBPop (); return false; } $res[] = $len ? substr ( $response, $pos, $len ) : ""; $pos += $len; } $this->_MBPop (); return $res; } ///////////////////////////////////////////////////////////////////////////// // keyword generation ///////////////////////////////////////////////////////////////////////////// /// connect to searchd server, and generate keyword list for a given query /// returns false on failure, /// an array of words on success function BuildKeywords ( $query, $index, $hits ) { assert ( is_string($query) ); assert ( is_string($index) ); assert ( is_bool($hits) ); $this->_MBPush (); if (!( $fp = $this->_Connect() )) { $this->_MBPop(); return false; } ///////////////// // build request ///////////////// // v.1.0 req $req = pack ( "N", strlen($query) ) . $query; // req query $req .= pack ( "N", strlen($index) ) . $index; // req index $req .= pack ( "N", (int)$hits ); //////////////////////////// // send query, get response //////////////////////////// $len = strlen($req); $req = pack ( "nnN", SEARCHD_COMMAND_KEYWORDS, VER_COMMAND_KEYWORDS, $len ) . $req; // add header if ( !( $this->_Send ( $fp, $req, $len+8 ) ) || !( $response = $this->_GetResponse ( $fp, VER_COMMAND_KEYWORDS ) ) ) { $this->_MBPop (); return false; } ////////////////// // parse response ////////////////// $pos = 0; $res = array (); $rlen = strlen($response); list(,$nwords) = unpack ( "N*", substr ( $response, $pos, 4 ) ); $pos += 4; for ( $i=0; $i<$nwords; $i++ ) { list(,$len) = unpack ( "N*", substr ( $response, $pos, 4 ) ); $pos += 4; $tokenized = $len ? substr ( $response, $pos, $len ) : ""; $pos += $len; list(,$len) = unpack ( "N*", substr ( $response, $pos, 4 ) ); $pos += 4; $normalized = $len ? substr ( $response, $pos, $len ) : ""; $pos += $len; $res[] = array ( "tokenized"=>$tokenized, "normalized"=>$normalized ); if ( $hits ) { list($ndocs,$nhits) = array_values ( unpack ( "N*N*", substr ( $response, $pos, 8 ) ) ); $pos += 8; $res [$i]["docs"] = $ndocs; $res [$i]["hits"] = $nhits; } if ( $pos > $rlen ) { $this->_error = "incomplete reply"; $this->_MBPop (); return false; } } $this->_MBPop (); return $res; } function EscapeString ( $string ) { $from = array ( '\\', '(',')','|','-','!','@','~','"','&', '/', '^', '$', '=' ); $to = array ( '\\\\', '\(','\)','\|','\-','\!','\@','\~','\"', '\&', '\/', '\^', '\$', '\=' ); return str_replace ( $from, $to, $string ); } ///////////////////////////////////////////////////////////////////////////// // attribute updates ///////////////////////////////////////////////////////////////////////////// /// batch update given attributes in given rows in given indexes /// returns amount of updated documents (0 or more) on success, or -1 on failure function UpdateAttributes ( $index, $attrs, $values, $mva=false ) { // verify everything assert ( is_string($index) ); assert ( is_bool($mva) ); assert ( is_array($attrs) ); foreach ( $attrs as $attr ) assert ( is_string($attr) ); assert ( is_array($values) ); foreach ( $values as $id=>$entry ) { assert ( is_numeric($id) ); assert ( is_array($entry) ); assert ( count($entry)==count($attrs) ); foreach ( $entry as $v ) { if ( $mva ) { assert ( is_array($v) ); foreach ( $v as $vv ) assert ( is_int($vv) ); } else assert ( is_int($v) ); } } // build request $req = pack ( "N", strlen($index) ) . $index; $req .= pack ( "N", count($attrs) ); foreach ( $attrs as $attr ) { $req .= pack ( "N", strlen($attr) ) . $attr; $req .= pack ( "N", $mva ? 1 : 0 ); } $req .= pack ( "N", count($values) ); foreach ( $values as $id=>$entry ) { $req .= sphPackU64 ( $id ); foreach ( $entry as $v ) { $req .= pack ( "N", $mva ? count($v) : $v ); if ( $mva ) foreach ( $v as $vv ) $req .= pack ( "N", $vv ); } } // connect, send query, get response if (!( $fp = $this->_Connect() )) return -1; $len = strlen($req); $req = pack ( "nnN", SEARCHD_COMMAND_UPDATE, VER_COMMAND_UPDATE, $len ) . $req; // add header if ( !$this->_Send ( $fp, $req, $len+8 ) ) return -1; if (!( $response = $this->_GetResponse ( $fp, VER_COMMAND_UPDATE ) )) return -1; // parse response list(,$updated) = unpack ( "N*", substr ( $response, 0, 4 ) ); return $updated; } ///////////////////////////////////////////////////////////////////////////// // persistent connections ///////////////////////////////////////////////////////////////////////////// function Open() { if ( $this->_socket !== false ) { $this->_error = 'already connected'; return false; } if ( !$fp = $this->_Connect() ) return false; // command, command version = 0, body length = 4, body = 1 $req = pack ( "nnNN", SEARCHD_COMMAND_PERSIST, 0, 4, 1 ); if ( !$this->_Send ( $fp, $req, 12 ) ) return false; $this->_socket = $fp; return true; } function Close() { if ( $this->_socket === false ) { $this->_error = 'not connected'; return false; } fclose ( $this->_socket ); $this->_socket = false; return true; } ////////////////////////////////////////////////////////////////////////// // status ////////////////////////////////////////////////////////////////////////// function Status () { $this->_MBPush (); if (!( $fp = $this->_Connect() )) { $this->_MBPop(); return false; } $req = pack ( "nnNN", SEARCHD_COMMAND_STATUS, VER_COMMAND_STATUS, 4, 1 ); // len=4, body=1 if ( !( $this->_Send ( $fp, $req, 12 ) ) || !( $response = $this->_GetResponse ( $fp, VER_COMMAND_STATUS ) ) ) { $this->_MBPop (); return false; } $res = substr ( $response, 4 ); // just ignore length, error handling, etc $p = 0; list ( $rows, $cols ) = array_values ( unpack ( "N*N*", substr ( $response, $p, 8 ) ) ); $p += 8; $res = array(); for ( $i=0; $i<$rows; $i++ ) for ( $j=0; $j<$cols; $j++ ) { list(,$len) = unpack ( "N*", substr ( $response, $p, 4 ) ); $p += 4; $res[$i][] = substr ( $response, $p, $len ); $p += $len; } $this->_MBPop (); return $res; } } // // $Id: sphinxapi.php 2055 2009-11-06 23:09:58Z shodan $ //
亲测性能提高10倍左右~效果还是挺显著的~