// Sunday algorithm (substring search), implemented by the first three functions below
/*
 * Search - locate nextchar in str, scanning from right to left.
 *
 * str       NUL-terminated string to scan.
 * nextchar  character to look for.
 *
 * Returns the index of the right-most match among the indices
 * wstrlen(str)-2 .. 0, or -1 when there is none.
 *
 * The previous loop decremented the length first and then indexed with
 * "length - 1", so it walked this same range and finally read str[-1],
 * which lies outside the string.  A hit at index -1 also produced -1, so
 * dropping that read leaves every return value unchanged.
 *
 * NOTE(review): the last character of str is never examined.  That looks
 * like an off-by-one, but it is kept here so existing callers get the
 * results they were written against; confirm before widening the range.
 */
int Search(wchar *str, wchar nextchar)
{
    int idx = wstrlen(str) - 2;

    for (; idx >= 0; --idx) {
        if (str[idx] == nextchar)
            return idx;
    }
    return -1;
}
/*
 * Jud - test whether str1 matches str2 at a given alignment.
 *
 * str1  NUL-terminated pattern.
 * n1    index inside the pattern.
 * n2    index inside str2 that n1 is lined up with.
 * str2  text being examined.
 *
 * The comparison starts at str2[n2 - n1] and covers the whole of str1.
 * Returns 1 when every character of str1 matches, 0 on the first mismatch
 * or when str1 is empty.  The offset n2 - n1 is not range-checked.
 */
int Jud(wchar * str1 , int n1, wchar *str2 , int n2 )
{
    int offset = n2 - n1;
    int pattern_len = wstrlen(str1);

    if (pattern_len == 0)
        return 0;

    for (int k = 0; k < pattern_len; ++k) {
        if (str1[k] != str2[offset + k])
            return 0;
    }
    return 1;
}
/*
 * Select - report whether str1 occurs anywhere inside str2.
 *
 * str1  NUL-terminated pattern.
 * str2  NUL-terminated text.
 *
 * Returns 1 when the pattern is found, 0 otherwise.  A NULL argument or an
 * empty pattern yields 0.
 *
 * The search is a self-contained Sunday scan: after a failed comparison the
 * character just past the current window decides the shift.  All indexing
 * stays within [0, wstrlen(str2)], so short or empty texts are safe, and
 * every alignment (including offsets 1 and 2 and a match that ends exactly
 * at the end of the text) is reachable.
 */
int Select(wchar * str1, wchar *str2 )
{
    if (str1 == 0 || str2 == 0)
        return 0;

    int pat_len = wstrlen(str1);
    int text_len = wstrlen(str2);
    if (pat_len == 0)
        return 0;

    int pos = 0;
    while (pos + pat_len <= text_len) {
        /* Compare the pattern with the window that starts at pos. */
        int k = 0;
        while (k < pat_len && str1[k] == str2[pos + k])
            k++;
        if (k == pat_len)
            return 1;

        /* Character immediately after the window; NUL means the window was
         * already flush with the end of the text, so nothing is left. */
        wchar nextchar = str2[pos + pat_len];
        if (nextchar == '\0')
            return 0;

        /* Right-most occurrence of that character in the pattern
         * (-1 when absent, which moves the window completely past it). */
        int j = pat_len - 1;
        while (j >= 0 && str1[j] != nextchar)
            j--;

        /* Line str1[j] up with the character after the window. */
        pos += pat_len - j;
    }
    return 0;
}
/****************************************
Dictionary membership test (double-array based)

Walks str through the base[] / check[] tables.  The state starts at
getIndex(first character); each step moves to base[state] + getIndex(next
character) and is accepted only if check[] of the new state equals the
state it came from.

Returns 1 when a state carrying the end marker (slot base[state] + 1 whose
check[] points back to the state and whose base[] is -1) is reached exactly
as the string runs out; returns 0 in every other case.

getIndex, base and check are defined elsewhere in the project.
NOTE(review): on the pass that starts at the last character, getIndex is
called on the terminating NUL - confirm the tables are built for that.
****************************************/
int isword(wchar *str)
{
    wchar *cursor = str;
    int state = getIndex(*cursor);

    while (*cursor) {
        int row = base[state];
        int code = getIndex(*(++cursor));
        int next_state = row + code;

        /* No such transition in the table: not a word. */
        if (check[next_state] != state)
            return 0;

        /* Does the new state carry the end marker? */
        int marker = base[next_state] + 1;
        if (check[marker] == next_state && base[marker] == -1) {
            /* Accept only when the input is exhausted as well. */
            if (!(*(++cursor)))
                return 1;
            cursor--;   /* more input follows: undo the look-ahead */
        }

        state = next_state;
    }

    /* Input consumed without stopping on an end marker. */
    return 0;
}
/*
 * wordcopy_pre - copy the first i characters of str into str_p.
 *
 * No terminator is written; callers in this file store it themselves at
 * str_p[i] after the call.  Neither buffer is range-checked.
 */
void wordcopy_pre(wchar *str_p,wchar *str,int i)
{
    for (int k = 0; k < i; ++k)
        str_p[k] = str[k];
}
/*
 * wordcopy_suf - copy what follows the first i characters of str into
 * str_suf.
 *
 * str_suf  destination; receives the suffix characters only.
 * str      NUL-terminated source.
 * i        number of leading characters to skip.
 *
 * No terminator is written, so the destination has to be zero-filled
 * beforehand (the callers in this file allocate it with calloc).
 *
 * The skip stops at the terminator, so an i larger than the length of str
 * produces an empty suffix instead of reading past the end of the source.
 */
void wordcopy_suf(wchar *str_suf,wchar *str,int i)
{
    wchar *src = str;
    wchar *dst = str_suf;
    int skipped = 0;

    while (skipped < i && *src) {
        src++;
        skipped++;
    }
    while (*src)
        *dst++ = *src++;
}
/************************************
Append function

Appends word to the end of the string held in p_re, followed by one space
(0x0020) and a terminating NUL.

word  NUL-terminated text to append.
p_re  NUL-terminated destination; needs room for the appended characters,
      the space and the terminator.

The terminator used to be "p_re = '\0'", which only nulled the local
pointer and left the result unterminated unless the buffer happened to be
zero-filled; it is now stored through the pointer.

A space is always appended.  An earlier variant, used for reverse maximum
matching, skipped it when the last copied character was already a space.
************************************/
void wordCat(wchar *word,wchar *p_re)
{
    /* Advance to the current end of the destination. */
    while (*p_re)
        p_re++;

    while (*word)
        *p_re++ = *word++;

    *p_re++ = 0x0020;
    *p_re = '\0';
}
/* Bookkeeping counter for isCom: incremented on every call and reset to 0
 * each time a candidate prefix is rejected. */
int wordCount = 0;
/*****************************************
Compound-word test

Decides whether str can be split into dictionary words: a prefix of two or
more characters accepted by isword, followed by a remainder that is empty
or is itself splittable (checked recursively).

str       NUL-terminated candidate.
word_len  prefix length at which the search gives up; the visible caller
          passes the length of the full word, so the word as a whole is
          never taken as its own single component.  The same value is
          handed down unchanged to the recursive calls.

Returns 1 for NULL or empty input (nothing left to split), 0 for a single
character, 1 when a split exists, 0 otherwise or when memory cannot be
allocated.

The prefix buffer is sized from the input (it used to be a fixed 32
elements) and is released on every path.  The remainder is passed as a
pointer into str rather than copied, since it is only read.
*****************************************/
int isCom(wchar *str,int word_len)
{
    wordCount++;

    if (str == 0)
        return 1;

    int len = wstrlen(str);
    if (len == 0)
        return 1;
    if (len < 2)
        return 0;

    wchar *str_pre = (wchar *)calloc((size_t)len + 1, sizeof(wchar));
    if (str_pre == NULL)
        return 0;

    int result = 0;
    for (int i = 2; i <= len; i++) {
        /* Stop once the prefix would be as long as the original word. */
        if (i == word_len)
            break;

        wordcopy_pre(str_pre, str, i);
        str_pre[i] = '\0';

        /* isCom runs only when the prefix is a word, as before. */
        if (isword(str_pre) && isCom(str + i, word_len)) {
            result = 1;
            break;
        }
        wordCount = 0;
    }

    free(str_pre);
    return result;
}
/*************************************
Split a compound word into its element words

For every prefix of str (length 2 up to the whole string) that isword
accepts and whose remainder can itself be cut, the prefix is appended to
pstr through wordCat unless Select already finds it there.  The remainder
is processed first, so its elements are stored before the prefix.

str   NUL-terminated word to split.
pstr  NUL-terminated output buffer that collects the elements; its
      capacity is not checked here.

Returns 1 for NULL or empty input, 0 for a single character, 1 when at
least one prefix/remainder split succeeded, 0 otherwise or when memory
cannot be allocated.

The prefix buffer is sized from the input (it used to be a fixed 32
elements) and allocated once; the remainder is passed as a pointer into
str instead of being copied.
*************************************/
int cut(wchar *str,wchar *pstr)
{
    if (str == 0)
        return 1;

    int len = wstrlen(str);
    if (len == 0)
        return 1;
    if (len < 2)
        return 0;

    wchar *str_pre = (wchar *)calloc((size_t)len + 1, sizeof(wchar));
    if (str_pre == NULL)
        return 0;

    int isComMark = 0;
    for (int i = 2; i <= len; i++) {
        wordcopy_pre(str_pre, str, i);
        str_pre[i] = '\0';

        if (isword(str_pre) && cut(str + i, pstr)) {
            isComMark = 1;
            /* Store the prefix only if it is not in the output yet. */
            if (!Select(str_pre, pstr))
                wordCat(str_pre, pstr);
        }
    }

    free(str_pre);
    return isComMark;
}
/**********************************
Compound-word splitting: main entry point

Str holds words separated by single spaces (0x0020).  Each word of four or
more characters that isCom reports as a compound is replaced by the element
words produced by cut; every other word is copied through unchanged.  Each
stored item is followed by a space (added by wordCat).  The result is
written back into Str.

Str  NUL-terminated input, overwritten with the result.
     NOTE(review): the result can be longer than the input, so Str must
     have room for it plus a terminator - confirm at the call sites.

Returns NULL in every case, as before (the work buffer is released before
returning); the output is delivered only through Str.  NULL or empty input
and allocation failure leave Str untouched.

Changes from the earlier version:
 - a final word without a trailing space ends at the terminator instead of
   the scan running past it;
 - an empty token (leading or doubled space) is passed on as an empty word
   instead of moving the word pointer in front of its buffer;
 - the word buffer is sized from the input rather than fixed at 32;
 - the result in Str is explicitly terminated.

NOTE(review): the element buffer stays at 200 characters and the result
buffer at six times the input length; cut does not check either bound.
**********************************/
wchar * cutComToEle(wchar *Str)
{
    if (Str == NULL)
        return NULL;

    int len_Str = wstrlen(Str);
    if (len_Str == 0)
        return NULL;

    wchar *p_re = (wchar *)calloc((size_t)len_Str * 6 + 1, sizeof(wchar));
    wchar *saveComEle = (wchar *)calloc(200, sizeof(wchar));
    wchar *word = (wchar *)calloc((size_t)len_Str + 1, sizeof(wchar));

    if (p_re != NULL && saveComEle != NULL && word != NULL) {
        wchar *str_temp = Str;

        while (*str_temp) {
            /* Collect one token, up to the next space or the end. */
            int len = 0;
            while (*str_temp && *str_temp != 0x0020)
                word[len++] = *str_temp++;
            word[len] = '\0';
            if (*str_temp == 0x0020)
                str_temp++;             /* consume the separator */

            if (len >= 4 && isCom(word, len)) {
                /* Compound: gather its elements, then store them. */
                cut(word, saveComEle);
                wordCat(saveComEle, p_re);
                memset(saveComEle, 0, sizeof(wchar) * 200);
            } else {
                wordCat(word, p_re);
            }
        }

        int len_re = wstrlen(p_re);
        memset(Str, 0, sizeof(wchar) * len_re);
        wstrncpy(Str, p_re, len_re);
        Str[len_re] = '\0';
    }

    free(p_re);
    free(saveComEle);
    free(word);
    return NULL;
}