字符串查找

最近需要改进一个字符串查找的算法。
我改用了与KMP类似的Sunday算法。与逐个字符比较的朴素实现相比,效率提高了大约25倍。

代码

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Capacity for lookup tables indexed by character value; it must stay larger
   than the largest possible byte value (255). */
const int maxNum = 1005;



/**
 * Build a random alphanumeric C string.
 *
 * @param length  Total buffer size in bytes, including the terminating '\0';
 *                the returned string therefore holds length - 1 characters.
 * @return        A malloc'ed string the caller must free(), or NULL when
 *                length is not positive or the allocation fails.
 */
char* genRandomString(int length)
{
    /* Seed only once: reseeding with time(NULL) on every call makes two calls
       within the same second replay the same rand() sequence. */
    static int seeded = 0;
    char* string;
    int i;

    /* A non-positive length would write string[length - 1] out of bounds. */
    if (length <= 0)
    {
        return NULL;
    }

    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = 1;
    }

    if ((string = malloc((size_t) length)) == NULL)
    {
        printf("Malloc failed!flag:14\n");
        return NULL;
    }

    for (i = 0; i < length - 1; i++)
    {
        /* Pick one of three character classes for this position. */
        switch (rand() % 3)
        {
            case 0:
                /* Reduce both terms first so the sum cannot overflow int. */
                string[i] = (char) ('A' + (rand() % 26 + length % 26) % 26);
                break;
            case 1:
                string[i] = (char) ('a' + rand() % 26);
                break;
            default: /* 2 */
                string[i] = (char) ('0' + (rand() % 10 + length % 10) % 10);
                break;
        }
    }
    string[length - 1] = '\0';
    return string;
}

/**
 * Copy a randomly positioned substring out of src.
 *
 * @param src     NUL-terminated source string.
 * @param length  Total buffer size in bytes, including the terminating '\0';
 *                length - 1 characters are copied from src.
 * @return        A malloc'ed string the caller must free(), or NULL when src
 *                is NULL, length is not positive, src holds fewer than
 *                length - 1 characters, or the allocation fails.
 */
char* getString(char* src, int length)
{
    /* Seed only once; reseeding on every call replays the same sequence
       whenever two calls fall within the same second. */
    static int seeded = 0;
    char* string;
    size_t srcLen;
    size_t copyLen;
    size_t startNum;

    if (src == NULL || length <= 0)
    {
        return NULL;
    }

    srcLen = strlen(src);
    copyLen = (size_t) length - 1;
    /* The source must be long enough to supply every copied character. */
    if (copyLen > srcLen)
    {
        return NULL;
    }

    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = 1;
    }

    if ((string = malloc((size_t) length)) == NULL)
    {
        printf("Malloc failed!flag:14\n");
        return NULL;
    }

    /* Valid start offsets are 0 .. srcLen - copyLen inclusive, so the modulus
       is never zero. */
    startNum = (size_t) rand() % (srcLen - copyLen + 1);

    memcpy(string, src + startNum, copyLen);
    string[copyLen] = '\0';
    return string;
}

/**
 * Find the first occurrence of a pattern in a text with the Sunday
 * (quick-search) algorithm.
 *
 * @param sr         Text to search; srclen bytes must be readable.
 * @param srclen     Number of bytes of sr to search.
 * @param ta         Pattern; targetlen bytes must be readable.
 * @param targetlen  Number of bytes in the pattern.
 * @return           0-based offset of the first match, 0 for an empty pattern,
 *                   or -1 when there is no match or an argument is invalid
 *                   (NULL pointer or negative length).
 */
int Sunday(char * sr, int srclen, char * ta, int targetlen)
{
    /* One slot per possible byte value. */
    enum { TABLE_SIZE = UCHAR_MAX + 1 };
    int shift[TABLE_SIZE];
    /* Index the table through unsigned char: plain char may be signed and
       would yield negative indices for bytes >= 0x80. */
    const unsigned char *src = (const unsigned char *) sr;
    const unsigned char *target = (const unsigned char *) ta;
    int i;
    int s; /* offset in the text where the pattern is currently aligned */
    int j; /* number of pattern bytes matched at the current alignment */

    if (sr == NULL || ta == NULL || srclen < 0 || targetlen < 0) {
        return -1;
    }
    if (targetlen == 0) {
        return 0;
    }
    if (targetlen > srclen) {
        return -1;
    }

    /* Default: a byte absent from the pattern lets us skip targetlen + 1. */
    for (i = 0; i < TABLE_SIZE; i++) {
        shift[i] = targetlen + 1;
    }
    /* Later occurrences overwrite earlier ones, keeping the smallest shift. */
    for (i = 0; i < targetlen; i++) {
        shift[target[i]] = targetlen - i;
    }

    s = 0;
    while (s <= srclen - targetlen) {
        j = 0;
        while (j < targetlen && src[s + j] == target[j]) {
            j++;
        }
        if (j == targetlen) {
            return s;
        }
        /* The byte just past the window decides the shift; when the window
           already ends at the end of the text there is nothing left to try
           and that byte must not be read. */
        if (s + targetlen >= srclen) {
            break;
        }
        s += shift[src[s + targetlen]];
    }
    return -1;
}

/**
 * Parse the run of leading decimal digits of s as a non-negative integer.
 *
 * Unlike the standard library function of the same name, no leading
 * whitespace or sign is accepted; parsing stops at the first non-digit.
 *
 * The parameter is const-qualified so this definition agrees with the
 * prototype declared in <stdlib.h>.
 *
 * @param s  NUL-terminated string; NULL is treated as an empty string.
 * @return   The parsed value, 0 when s does not start with a digit, or
 *           INT_MAX when the value does not fit in an int.
 */
int atoi(const char *s)
{
    int n = 0;

    if (s == NULL)
    {
        return 0;
    }
    /* Advance the pointer itself; the loop ends at the first non-digit. */
    for (; *s >= '0' && *s <= '9'; ++s)
    {
        const int digit = *s - '0';
        /* Saturate instead of overflowing a signed int. */
        if (n > (INT_MAX - digit) / 10)
        {
            return INT_MAX;
        }
        n = 10 * n + digit;
    }
    return n;
}


/**
 * Naive substring search: try every alignment of b inside a and compare
 * byte by byte.
 *
 * @param a     Buffer to search.
 * @param alen  Number of bytes of a to search.
 * @param b     Pattern to look for.
 * @param blen  Number of bytes in the pattern.
 * @return      0-based offset of the first match, or -1 when none is found.
 */
int myMemmem(char * a, int alen, char * b, int blen)
{
    /* Last offset at which the pattern still fits inside the buffer. */
    const int lastStart = alen - blen;
    int start;

    for (start = 0; start <= lastStart; start++)
    {
        int matched = 0;

        /* Count how many leading pattern bytes agree at this alignment. */
        while (matched < blen && a[start + matched] == b[matched])
        {
            matched++;
        }
        if (matched >= blen)
        {
            return start;
        }
    }
    return -1;
}
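/**
 * Example of the search semantics (result is the 0-based offset of the
 * first match of the pattern in the text):
 *
 * IN
 *   text:    at the thought of
 *   pattern: though
 *
 * OUT
 *   7
 **/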
/**
 * Interactive benchmark: repeatedly reads two buffer sizes, generates a
 * random text T and a random pattern P of those sizes, and times Sunday()
 * against myMemmem() on them.
 *
 * The loop ends on EOF or non-numeric input.
 *
 * @return 0 on normal termination, 1 when a string could not be allocated.
 */
int main() {
    for (;;) {
        int a = 0;
        int b = 0;

        /* A failed scanf leaves the input untouched, so retrying would spin
           forever; stop instead. */
        printf("put T length\n");
        if (scanf("%d", &a) != 1) {
            break;
        }
        printf("put P length\n");
        if (scanf("%d", &b) != 1) {
            break;
        }

        /* The sizes include the terminating '\0', so each must be >= 1. */
        if (a < 1 || b < 1) {
            printf("lengths must be positive\n");
            continue;
        }

        /* Text and pattern. */
        char *T = genRandomString(a);
        char *P = genRandomString(b);
        if (T == NULL || P == NULL) {
            free(T);
            free(P);
            return 1;
        }

        /* The strings hold a - 1 and b - 1 characters respectively. */
        clock_t start = clock();
        int res = Sunday(T, a - 1, P, b - 1);
        clock_t finish = clock();
        double duration = (double)(finish - start) / CLOCKS_PER_SEC;
        printf( "--Sunday time is %f seconds\n", duration );
        if (res == -1) {
            printf("Sunday主串和模式串不匹配\n");
        } else {
            printf("Sunday模式串在主串的位置为:%d\n", res);
        }

        start = clock();
        res = myMemmem(T, a - 1, P, b - 1);
        finish = clock();
        duration = (double)(finish - start) / CLOCKS_PER_SEC;
        printf( "--myMemmem time is %f seconds\n", duration );
        if (res == -1) {
            printf("myMemmem主串和模式串不匹配\n");
        } else {
            printf("myMemmem模式串在主串的位置为:%d\n", res);
        }

        free(T);
        free(P);
    }
    return 0;
}

后续
新版本的memmem函数用的是更高效的算法,比KMP还快。

代码如下

/* Copyright (C) 1991-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* This particular implementation was written by Eric Blake, 2008.  */

#ifndef _LIBC
# include <config.h>
#endif

/* Specification of memmem.  */
#include <string.h>

/* When _LIBC is not defined, make __builtin_expect a plain pass-through of
   its first argument; the hint value is discarded.  */
#ifndef _LIBC
# define __builtin_expect(expr, val)   (expr)
#endif

/* Macros defined ahead of including "str-two-way.h" (presumably consumed by
   that header -- confirm there).  RETURN_TYPE is void *.
   AVAILABLE (h, h_l, j, n_l) is true when J <= H_L - N_L; the H argument
   is not used by this definition.  */
#define RETURN_TYPE void *
#define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
#include "str-two-way.h"

/* Remove any macro named memmem before the name is used again.  */
#undef memmem

/* Locate the first occurrence of the NEEDLE_LEN bytes at NEEDLE_START
   inside the HAYSTACK_LEN bytes at HAYSTACK_START.  The result is
   HAYSTACK_START itself for an empty needle, a pointer to the match when
   one exists, and NULL otherwise.  */
void *
memmem (const void *haystack_start, size_t haystack_len,
    const void *needle_start, size_t needle_len)
{
  /* Work on 'unsigned char', the type ISO C 99 section 6.2.6.1 uses for
     raw object representation, rather than plain 'char'.  */
  const unsigned char *const hs_base = haystack_start;
  const unsigned char *const nd = needle_start;
  const unsigned char *hit;
  size_t remaining;

  /* An empty needle matches at the very start of the haystack.  */
  if (needle_len == 0)
    return (void *) hs_base;

  /* A needle longer than the haystack can never match; rejecting it here
     also keeps the search from running through unrelated memory.  */
  if (__builtin_expect (haystack_len < needle_len, 0))
    return NULL;

  /* Long needles go straight to the long-needle search and skip the
     memchr pre-scan used below.  */
  if (needle_len >= LONG_NEEDLE_THRESHOLD)
    return two_way_long_needle (hs_base, haystack_len, nd, needle_len);

  /* Short needles: let memchr jump to the first byte equal to the
     needle's first byte, shrinking the region left to search.  */
  hit = memchr (hs_base, *nd, haystack_len);
  if (hit == NULL || __builtin_expect (needle_len == 1, 0))
    return (void *) hit;

  remaining = haystack_len - (size_t) (hit - hs_base);
  if (remaining < needle_len)
    return NULL;

  return two_way_short_needle (hit, remaining, nd, needle_len);
}
libc_hidden_def (memmem)

#undef LONG_NEEDLE_THRESHOLD

这个memmem函数用的是Two-Way算法:needle较短时先用memchr定位首字节,再调用two_way_short_needle;needle较长时直接调用two_way_long_needle。已经是比较快的算法了。

你可能感兴趣的:(字符串查找)