很久没有在CSDN上写文章了,在这样那样的事情发生之后。最近用到了strtok这个函数,但它只支持单字节(ASCII)分隔符,而我想用它按中文标点分词,或者按中文字符分割字符串,于是只好自己写一个。不过我想strtok函数本身肯定不是用最朴素的方法实现的,因为最朴素的方法复杂度很高:每读到字符串中的一个字符,都需要和整个分隔符列表逐一比较。
这样看来,更好的办法是使用查找表(可以看作一种最简单的哈希),直接定位某个字符是不是分隔符。微软就是这么实现的:strtok函数源码中使用了unsigned char map[32]作为分隔符列表的位图映射。这个map一共有32*8=256位,正好覆盖一个unsigned char的全部256种取值(ASCII编码只占其中前128个),所以拿字节值直接做下标非常自然。源码如下:
#include <cruntime.h>
#include <string.h>

#ifdef _MT
#include <mtdll.h>
#endif  /* _MT */

/***
*char *strtok(string, control) - tokenize string with delimiter in control
*
*Purpose:
*       strtok considers the string to consist of a sequence of zero or more
*       text tokens separated by spans of one or more control chars. the first
*       call, with string specified, returns a pointer to the first char of the
*       first token, and will write a null char into string immediately
*       following the returned token. subsequent calls with zero for the first
*       argument (string) will work thru the string until no tokens remain. the
*       control string may be different from call to call. when no tokens remain
*       in string a NULL pointer is returned. remember the control chars with a
*       bit map, one bit per possible byte value (32 * 8 = 256 bits). the null
*       char is always a control char.
*
*Entry:
*       char *string - string to tokenize, or NULL to get next token
*       char *control - string of characters to use as delimiters
*
*Exit:
*       returns pointer to first token in string, or if string
*       was NULL, to next token
*       returns NULL when no more tokens remain.
*
*Uses:
*       Without _MT the scan position is kept in a function-local static
*       pointer, so interleaved tokenizations overwrite each other's state.
*
*Exceptions:
*
*******************************************************************************/

char * __cdecl strtok(char * string, const char * control)
{
    unsigned char *str;
    const unsigned char *ctrl = (const unsigned char *) control;

    unsigned char map[32];
    int count;

#ifdef _MT
    _ptiddata ptd = _getptd();
#else  /* _MT */
    static char *nextoken;
#endif  /* _MT */

    /* Clear control map */
    for (count = 0; count < 32; count++)
        map[count] = 0;

    /* Set bits in delimiter table: the upper five bits of the byte select
     * the map element, the lower three bits select the bit inside it.
     * The do/while also records the terminating null as a delimiter. */
    do {
        map[*ctrl >> 3] |= (1 << (*ctrl & 7));
    } while (*ctrl++);

    /* Initialize str. If string is NULL, set str to the saved
     * pointer (i.e., continue breaking tokens out of the string
     * from the last strtok call) */
    if (string)
        str = (unsigned char *) string;
    else
#ifdef _MT
        str = (unsigned char *) ptd->_token;
#else  /* _MT */
        str = (unsigned char *) nextoken;
#endif  /* _MT */

    /* Find beginning of token (skip over leading delimiters). Note that
     * there is no token iff this loop sets str to point to the terminal
     * null (*str == '\0') */
    while ((map[*str >> 3] & (1 << (*str & 7))) && *str)
        str++;

    string = (char *) str;

    /* Find the end of the token. If it is not the end of the string,
     * put a null there. */
    for ( ; *str; str++)
        if (map[*str >> 3] & (1 << (*str & 7))) {
            *str++ = '\0';
            break;
        }

    /* Update nextoken (or the corresponding field in the per-thread data
     * structure */
#ifdef _MT
    ptd->_token = (char *) str;
#else  /* _MT */
    nextoken = (char *) str;
#endif  /* _MT */

    /* Determine if a token has been found. */
    if (string == (char *) str)
        return NULL;
    else
        return string;
}
do { map[*ctrl >> 3] |= (1 << (*ctrl & 7)); } while (*ctrl++);这个地方做了hash映射:字节值的高5位(*ctrl >> 3)选择map中的某个字节,低3位(*ctrl & 7)选择该字节中的某一位。
while ((map[*str >> 3] & (1 << (*str & 7))) && *str) str++;
for (; *str; str++) if (map[*str >> 3] & (1 << (*str & 7))) { *str++ = '\0'; break; }
其他部分中,_MT是与线程安全有关的宏:未定义_MT时,函数使用一个static指针保存下一次开始扫描的位置,这样是线程不安全的,不过目前先不管它。
下面是我修改的针对中文字符有效的函数:
/*
 * Report whether the double-byte character (lead, trail) appears in the
 * delimiter list. `control` must already be validated: every byte with the
 * high bit set is followed by a non-null trail byte, so stepping by two is
 * safe.
 */
static int cstrtok_is_pair_delim(const unsigned char *control,
                                 unsigned char lead, unsigned char trail)
{
    while (*control != '\0') {
        if (*control & 0x80) {
            if (control[0] == lead && control[1] == trail)
                return 1;
            control += 2;
        } else {
            control++;
        }
    }
    return 0;
}

/**
 * cstrtok - strtok-style tokenizer that also accepts double-byte delimiters.
 *
 * Usage is the same as strtok: pass the string on the first call and NULL on
 * later calls to continue with the same string. The string is modified in
 * place: the delimiter that ends a token is overwritten with null bytes (one
 * for a single-byte delimiter, two for a double-byte one).
 *
 * Any byte with the high bit set is treated as the first byte of a two-byte
 * character, both in `control` and in the string being split. The text is
 * expected to be GBK encoded.
 *
 * Differences from a byte-wise scan:
 *   - a double-byte delimiter matches only as the exact (lead, trail) pair
 *     listed in `control`, never as a mix of bytes from two delimiters;
 *   - the string is walked one character at a time, so the second byte of a
 *     non-delimiter double-byte character is never tested as a delimiter.
 *
 * @param string   string to tokenize, or NULL to continue the previous one
 * @param control  delimiter characters (single-byte and/or double-byte)
 * @return pointer to the next token, or NULL when no token remains, when
 *         `control` ends in the middle of a double-byte character (a message
 *         is written to stderr), or when NULL is passed with no saved string.
 *
 * Without _MT the scan position lives in a function-local static pointer, so
 * interleaved tokenizations overwrite each other's state.
 */
char* cstrtok(char* string, const char* control)
{
    const unsigned char *delims = (const unsigned char *) control;
    const unsigned char *ctrl;
    unsigned char *str;
    unsigned char *token;
    unsigned char map[32];       /* one bit per single-byte delimiter */
    unsigned char lead_map[32];  /* one bit per lead byte of a double-byte delimiter */
    int count;

#ifdef _MT
    _ptiddata ptd = _getptd();
#else  /* _MT */
    static char *nextoken;
#endif  /* _MT */

    /* Clear both bitmaps */
    for (count = 0; count < 32; count++) {
        map[count] = 0;
        lead_map[count] = 0;
    }

    /* Validate the delimiter list and fill the bitmaps in a single pass.
     * The lead byte is tested before the trail byte is read, so the byte
     * after the terminating null is never touched. */
    for (ctrl = delims; *ctrl != '\0'; ) {
        if (*ctrl & 0x80) {
            if (ctrl[1] == '\0') {
                fprintf(stderr, "Some weird symbol occurred in delimiter\n"
                        "Maybe Chinese word GBK have been truncated.\n");
                return NULL;
            }
            lead_map[*ctrl >> 3] |= (unsigned char) (1u << (*ctrl & 7));
            ctrl += 2;
        } else {
            map[*ctrl >> 3] |= (unsigned char) (1u << (*ctrl & 7));
            ctrl++;
        }
    }

    /* Initialize str. If string is NULL, continue from the position saved
     * by the previous call. */
    if (string)
        str = (unsigned char *) string;
    else
#ifdef _MT
        str = (unsigned char *) ptd->_token;
#else  /* _MT */
        str = (unsigned char *) nextoken;
#endif  /* _MT */

    /* Continuation requested but nothing has been saved yet */
    if (str == NULL)
        return NULL;

    /* Skip leading delimiters, one character at a time */
    while (*str != '\0') {
        if (*str & 0x80) {
            /* lead_map is only a quick reject; the pair is confirmed exactly */
            if (str[1] != '\0'
                && (lead_map[*str >> 3] & (1u << (*str & 7)))
                && cstrtok_is_pair_delim(delims, str[0], str[1])) {
                str += 2;
                continue;
            }
            break;
        }
        if (map[*str >> 3] & (1u << (*str & 7))) {
            str++;
            continue;
        }
        break;
    }

    token = str;

    /* Find the end of the token. If it is not the end of the string,
     * overwrite the delimiter with null bytes. */
    while (*str != '\0') {
        if (*str & 0x80) {
            if (str[1] == '\0') {
                /* Truncated double-byte character at the very end: keep the
                 * stray byte in the token and stop at the terminator. */
                str++;
                continue;
            }
            if ((lead_map[*str >> 3] & (1u << (*str & 7)))
                && cstrtok_is_pair_delim(delims, str[0], str[1])) {
                str[0] = '\0';
                str[1] = '\0';
                str += 2;
                break;
            }
            /* Ordinary double-byte character: step over both bytes */
            str += 2;
        } else {
            if (map[*str >> 3] & (1u << (*str & 7))) {
                *str++ = '\0';
                break;
            }
            str++;
        }
    }

    /* Save the scan position for the next call */
#ifdef _MT
    ptd->_token = (char *) str;
#else  /* _MT */
    nextoken = (char *) str;
#endif  /* _MT */

    /* An empty span means no token was found */
    if (token == str)
        return NULL;
    else
        return (char *) token;
}
应该可以再优化,过段时间再看看。
今天天气不好,下了中雨,在这个炎热的夏天里应该是好事,但是加班的孩子坐在这里还是很冷的,虽是周六,但是空调很足。
外面天黑黑,白天下成了黑夜。
不做梦了,赶紧看另一个RLSI程序,最近最好能出论文