跳转至

子串查找算法

leetcode的题目, 实现strstr,即查找子字符串

最简单的算法

int strStr(char * haystack, char * needle){
    char *tmp = haystack;
    while(*tmp != 0){
        if(strncmp(tmp, needle,strlen(needle))==0){
            return tmp - haystack;
        }
        tmp ++;
    }
    return -1;
}

kmp 算法

// C++ program for implementation of KMP pattern searching 
// algorithm 
#include <bits/stdc++.h> 

void computeLPSArray(char* pat, int M, int* lps); 

// Prints occurrences of txt[] in pat[] 
void KMPSearch(char* pat, char* txt) 
{ 
    int M = strlen(pat); 
    int N = strlen(txt); 

    // create lps[] that will hold the longest prefix suffix 
    // values for pattern 
    int lps[M]; 

    // Preprocess the pattern (calculate lps[] array) 
    computeLPSArray(pat, M, lps); 

    int i = 0; // index for txt[] 
    int j = 0; // index for pat[] 
    while (i < N) { 
        if (pat[j] == txt[i]) { 
            j++; 
            i++; 
        } 

        if (j == M) { 
            printf("Found pattern at index %d ", i - j); 
            j = lps[j - 1]; 
        } 

        // mismatch after j matches 
        else if (i < N && pat[j] != txt[i]) { 
            // Do not match lps[0..lps[j-1]] characters, 
            // they will match anyway 
            if (j != 0) 
                j = lps[j - 1]; 
            else
                i = i + 1; 
        } 
    } 
} 

// Fills lps[] for given patttern pat[0..M-1] 
void computeLPSArray(char* pat, int M, int* lps) 
{ 
    // length of the previous longest prefix suffix 
    int len = 0; 

    lps[0] = 0; // lps[0] is always 0 

    // the loop calculates lps[i] for i = 1 to M-1 
    int i = 1; 
    while (i < M) { 
        if (pat[i] == pat[len]) { 
            len++; 
            lps[i] = len; 
            i++; 
        } 
        else // (pat[i] != pat[len]) 
        { 
            // This is tricky. Consider the example. 
            // AAACAAAA and i = 7. The idea is similar 
            // to search step. 
            if (len != 0) { 
                len = lps[len - 1]; 

                // Also, note that we do not increment 
                // i here 
            } 
            else // if (len == 0) 
            { 
                lps[i] = 0; 
                i++; 
            } 
        } 
    } 
} 

// Driver program to test above function 
int main() 
{ 
    char txt[] = "ABABDABACDABABCABAB"; 
    char pat[] = "ABABCABAB"; 
    KMPSearch(pat, txt); 
    return 0; 
} 

使用哈希查找和strstr速度比较

hash在查找字符串时,每次偏移1个字节,并计算hash, 为了不重复计算重叠部分,有个算法'Karp-Rabin with Rolling Hash'。 但是速度都不如glibc的 strstr, 因为这里的hash函数没有用到硬件加速,但是strstr用了。

#define BASE 256         // Alphabet size (ASCII)
#define MOD 101          // A prime number to avoid overflow

// Rolling hash version of strstr()
const char* rabin_karp_strstr(const char* haystack, const char* needle) {
    int n = strlen(haystack);
    int m = strlen(needle);

    if (m == 0) return haystack;
    if (m > n) return NULL;

    int i;
    int hash_needle = 0;  // Hash for needle
    int hash_window = 0;  // Hash for current window in haystack
    int h = 1;

    // The value of h is BASE^(m-1) % MOD
    for (i = 0; i < m - 1; i++)
        h = (h * BASE) % MOD;

    // Calculate the hash value of needle and first window
    for (i = 0; i < m; i++) {
        hash_needle  = (BASE * hash_needle + needle[i]) % MOD;
        hash_window = (BASE * hash_window + haystack[i]) % MOD;
    }

    // Slide the pattern over text
    for (i = 0; i <= n - m; i++) {
        if (hash_needle == hash_window) {
            // Verify characters to avoid false positive
            if (strncmp(&haystack[i], needle, m) == 0)
                return &haystack[i];
        }

        // Calculate hash for next window
        if (i < n - m) {
            hash_window = (BASE * (hash_window - haystack[i] * h) + haystack[i + m]) % MOD;
            if (hash_window < 0)
                hash_window += MOD;  // Ensure non-negative
        }
    }

    return NULL;
}