android sqlite 分词,sqlite3自定义分词器

sqlite3通过使用fts3虚表支持全文搜索,默认支持simple和porter两种分词器,并提供了接口来自定义分词器。这里我们利用mmseg来构造自定义的中文分词器。

虽然sqlite在fts3_tokenizer.h中提供了各种接口供用户自定义分词器,但其并未提供c函数供用户来注册自定义的分词器,分词器的注册必须使用sql语句来完成。

SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);

其中tokenizer-name是分词器的名称,sqlite3_tokenizer_module ptr是一个指向sqlite3_tokenizer_module结构的指针,并且编码为SQL blob。下面是官方给出的注册函数:

/*
** Register a tokenizer module with an SQLite connection by running
** "SELECT fts3_tokenizer(?, ?)".
**
**   db     Connection on which the tokenizer is registered.
**   zName  Name under which the tokenizer is registered.
**   p      Tokenizer module to register. The pointer value itself (not the
**          structure it points to) is bound as a blob of sizeof(p) bytes.
**
** Returns the error code from preparing or binding the statement if either
** fails; otherwise returns the result of sqlite3_finalize().
*/
int registerTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module *p
){
  int rc;
  sqlite3_stmt *pStmt;
  const char *zSql = "SELECT fts3_tokenizer(?, ?)";

  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  if( rc==SQLITE_OK ){
    rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  }
  if( rc!=SQLITE_OK ){
    /* Do not run the statement with missing parameters. */
    sqlite3_finalize(pStmt);
    return rc;
  }

  /* The step result is not inspected here; the value returned by
  ** sqlite3_finalize() below is what the caller sees. */
  sqlite3_step(pStmt);

  return sqlite3_finalize(pStmt);
}

要想实现自定义的分词器,最关键的是得到指向sqlite3_tokenizer_module结构的一个指针,sqlite3_tokenizer_module结构体定义如下:

/*
** Table of callbacks that together implement one tokenizer.
*/
struct sqlite3_tokenizer_module {
  /* Version number; must be set to 0. */
  int iVersion;

  /* Called automatically when the virtual table is created; creates the
  ** tokenizer. */
  int (*xCreate)(
    int argc, const char *const *argv, sqlite3_tokenizer **ppTokenizer
  );

  /* Called automatically when the database connection is closed; releases
  ** the tokenizer's resources. */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /* Called automatically when data is inserted or searched, to start
  ** tokenizing the input. */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes,
    sqlite3_tokenizer_cursor **ppCursor
  );

  /* Called automatically once all tokens have been extracted. */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /* Extracts the tokens one at a time. */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes,
    int *piStartOffset, int *piEndOffset, int *piPosition
  );
};

有几点需要注意的是:

1分词引擎使用sql语句注册意味着每建立一个sqlite连接都必须注册一次分词器,对于需要使用词库的中文分词器来说也意味着巨大的内存消耗。

2在检索时分词结果的提取和语义的解析是交替进行的。例如我们搜索"kanif OR sqlite"的时候,引擎先将整个查询串传入到分词器,在调用一次next获取到词kanif后,再将词sqlite传入到分词器,直到全部解析完毕。

3由于中文分词本身的特殊性,例如"北京市"很有可能视为一个完整的词,这样在搜索"北京"的时候就无法获取到结果。如果分词器支持将"北京市"切分为"北京市"和"北京"或者将十一月切分为"11月"和"十一",那么需注意(*xNext)函数中的piStartOffset和piEndOffset参数。经测试在插入数据的时候这两个参数无实际用途,但在查询的时候这两个参数决定了下一次的输入串。

附:

#include

#include

#include

#include

#include

#include

#include"fts3_tokenizer.h"

#include "mmseg/mmseg.cpp"

// Starts as true; initmmseg() checks this flag and returns without loading
// anything once it is false.
static bool loadDic = true;

// Tokenizer object handed back to SQLite by cusCreate(). It carries no state
// beyond the sqlite3_tokenizer base, which is the first member so that
// cusCreate() can return &t->base and cusDestroy() can free that pointer.
typedef struct cus_tokenizer{

sqlite3_tokenizer base;

} cus_tokenizer;

// Per-tokenization cursor created by cusOpen() and released by cusClose().
typedef struct cus_tokenizer_cursor{

// Base cursor; first member so the callbacks can cast between the two types.
sqlite3_tokenizer_cursor base;

// Set to NULL in cusOpen() and freed in cusClose() when non-NULL; no code
// shown here stores anything else in it.
char *pInput;

// Input length in bytes as computed by cusOpen().
int nBytes;

// Running counter; cusNext() reports it as *piPosition and then increments it.
int iToken;

// NUL-terminated heap copy of the token most recently returned by cusNext();
// freed on the next cusNext() call or in cusClose().
char *pToken;

// Segmenter handle obtained from mmseg_algor_create() in cusOpen().
rmmseg::Algorithm *pAlgor;

} cus_tokenizer_cursor;

// Load the mmseg dictionaries "chars.dic" and "words.dic" the first time this
// is called. Later calls return immediately because loadDic has been cleared.
// NOTE(review): the result of mmseg_load_words() is not checked here; confirm
// whether it reports failures (e.g. a missing dictionary file).
void initmmseg(void){
  if( !loadDic ){
    return;
  }

  mmseg_load_words("chars.dic");
  mmseg_load_words("words.dic");

  // Was `False`, which is not a defined identifier; the keyword is `false`.
  loadDic = false;
}

static int cusCreate(

int argc, const char * const *argv,

sqlite3_tokenizer **ppTokenizer

){

cus_tokenizer *t;

t = (cus_tokenizer *) sqlite3_malloc(sizeof(*t));

if( t==NULL ) return SQLITE_NOMEM;

memset(t, 0, sizeof(*t));

initmmseg();

*ppTokenizer = &t->base;

return SQLITE_OK;

}

// xDestroy callback: release the tokenizer allocated by cusCreate().
// Always returns SQLITE_OK.
static int cusDestroy(sqlite3_tokenizer *pTokenizer){
  sqlite3_free(pTokenizer);
  return SQLITE_OK;
}

static int cusOpen(

sqlite3_tokenizer*pTokenizer, const char *pInput, intnBytes, sqlite3_tokenizer_cursor**ppCursor ){

cus_tokenizer_cursor *c;

if(pInput == 0){

nBytes =0;

}else if(nBytes < 0)

nBytes = (int)strlen(pInput);

c = (cus_tokenizer_cursor *)sqlite3_malloc(sizeof(*c));

if(c == NULL)

return SQLITE_NOMEM;

c->iToken =c->nBytes = 0;

c->pInput = c->pToken =NULL;

c->pAlgor = mmseg_algor_create(pInput,nBytes);

c->nBytes = nBytes;

*ppCursor = &c->base;

return SQLITE_OK;

}

// xClose callback: release everything owned by a cursor created in cusOpen(),
// then the cursor itself. Always returns SQLITE_OK.
static int cusClose(sqlite3_tokenizer_cursor *pCursor){
  cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;

  // sqlite3_free() is a harmless no-op on NULL, so no guards are needed.
  sqlite3_free(c->pInput);
  sqlite3_free(c->pToken);

  if( c->pAlgor!=NULL ){
    mmseg_algor_destroy(c->pAlgor);
  }

  sqlite3_free(c);
  return SQLITE_OK;
}

static int cusNext(

sqlite3_tokenizer_cursor *pCursor,

const char**ppToken, int*pnBytes, int*piStartOffset, int*piEndOffset, int*piPosition ){

cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;

cus_tokenizer *t = (cus_tokenizer *)pCursor->pTokenizer;

if(c->pToken != NULL){

sqlite3_free(c->pToken);

c->pToken = NULL;

}

struct Token token =mmseg_next_token(c->pAlgor);

if(token.length != 0 ){

int l =token.length;

c->pToken = (char *)sqlite3_malloc(l+1);

if(c->pToken == NULL)

return SQLITE_NOMEM;

c->pToken[l] = 0;

memcpy(c->pToken, token.text, l);

*ppToken =c->pToken;

*pnBytes =l;

*piStartOffset = token.offset;

*piEndOffset= token.offset + token.length;

*piPosition= c->iToken++;

returnSQLITE_OK;

}

//一般来说只有插入数据时才会进入到这里

return SQLITE_DONE;

}

// Callback table for the custom tokenizer. Entries are positional and follow
// the member order of struct sqlite3_tokenizer_module.
static const sqlite3_tokenizer_module cusTokenizerModule ={

0, // iVersion

cusCreate, // xCreate

cusDestroy, // xDestroy

cusOpen, // xOpen

cusClose, // xClose

cusNext, // xNext

};

// Register a tokenizer module with an SQLite connection by running
// "SELECT fts3_tokenizer(?, ?)".
//
//   db     Connection on which the tokenizer is registered.
//   zName  Name under which the tokenizer is registered.
//   p      Tokenizer module to register. The pointer value itself (not the
//          structure it points to) is bound as a blob of sizeof(p) bytes.
//
// Returns the error code from preparing or binding the statement if either
// fails; otherwise returns the result of sqlite3_finalize().
int registerTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module *p
){
  int rc;
  sqlite3_stmt *pStmt;
  const char *zSql = "SELECT fts3_tokenizer(?, ?)";

  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  if( rc==SQLITE_OK ){
    rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  }
  if( rc!=SQLITE_OK ){
    // Do not run the statement with missing parameters.
    sqlite3_finalize(pStmt);
    return rc;
  }

  // The step result is not inspected here; the value returned by
  // sqlite3_finalize() below is what the caller sees.
  sqlite3_step(pStmt);

  return sqlite3_finalize(pStmt);
}

int main(){

constsqlite3_tokenizer_module *ptr =&cusTokenizerModule;

sqlite3*pDB;

sqlite3_stmt* stmt;

char *errMsg = NULL;

const char*zTail;

int rc =sqlite3_open("test.sqlite3", &pDB);

if(rc){

printf("create error. %s\n",sqlite3_errmsg(pDB));

return rc;

}

chartoken_name[] = "custoken";

registerTokenizer(pDB, token_name, ptr);

rc =sqlite3_exec(pDB, "CREATE VIRTUAL TABLE foo USINGfts3(tokenize=custoken)", 0, 0, &errMsg);     if(rc !=SQLITE_OK){        printf("create virtual error, %s\n", errMsg);     if(rc !=SQLITE_OK){        printf("create virtual error, %s\n", errMsg);        return rc;     }     rc =sqlite3_exec(pDB, "INSERT INTO fooVALUES('\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82')", 0, 0,&errMsg);     if(rc !=SQLITE_OK){        printf("insert value error, %s\n", errMsg);        return rc;     }     int nrow =0, ncolumn = 0;     char**azResult; //二维数组存放结果    sqlite3_get_table(pDB , "SELECT * FROM foo WHERE content MATCH'\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82'" , &azResult, &nrow , &ncolumn ,&errMsg );     int i = 0;     printf("row:%d column=%d \n" , nrow , ncolumn );     printf("\nThe result of querying is : \n" );     for( i=0 ;i

你可能感兴趣的:(android,sqlite,分词)