在这个问题中,我们得到了一个输入字符串和一个数组arr []。我们的任务是查找字符串中数组所有单词的所有匹配项。为此,我们将使用Aho-Corasick算法进行模式搜索。
字符串和模式搜索在编程中很重要。而且在编程中,算法越好,它可以具有更多的实际用途。Aho-Corasick算法是一种非常重要且功能强大的算法,可简化字符串搜索。这是一种字典匹配算法,可以同时匹配所有字符串。该算法将Trie数据结构用于其实现。
Trie是一种前缀树(也称数字搜索树),其中每条边都用某个字母标记(同一节点的各条出边所标的字母互不相同)。
让我们以一个例子来了解Aho-Corasick算法
输入值
string = "bheythisghisanexample" arr[] = {"hey", "this", "is", "an", "example"}
输出结果
Word hey starts from 2 Word this starts from 5 Word is starts from 7 Word is starts from 11 Word an starts from 13 Word example starts from 15
该算法的时间复杂度为O(N + L + Z),其中N = 输入字符串(文本)的长度
L = 所有关键字(数组中的单词)的长度之和
Z = 匹配的次数。
可以通过以下简单步骤构建Aho-Corasick算法
根据所有关键字构造Trie(字典树),然后借助队列按广度优先顺序逐个弹出并处理Trie的节点。
将失败链接(后缀链接)构造为一个数组,为每个状态存储匹配失败时应回退到的状态
将输出链接构造为数组以存储匹配的单词
建立一个遍历函数(FindNextState)以检查所有字符。
失败链接(后缀链接) -当我们读到某个字符、无法沿当前字符串继续匹配时,我们通过跟随后缀链接来回退,以尽可能多地保留已匹配的上下文。简而言之,它记录了当前字符在Trie中没有对应的边时应当转移到的状态。
输出链接-它始终指向与当前状态中存在的最长单词相对应的节点,我们确保使用输出链接将所有模式链接在一起。
#include<iostream> #include <string.h> #include<algorithm> #include<queue> using namespace std; const int MaxStates = 6 * 50 + 10; const int MaxChars = 26; int OccurenceOfWords[MaxStates]; int FF[MaxStates]; int GotoFunction[MaxStates][MaxChars]; int BuildMatchingMachine(const vector<string> &words, char lowestChar = 'a', char highestChar = 'z'){ memset(OccurenceOfWords, 0, sizeof OccurenceOfWords); memset(FF, -1, sizeof FF); memset(GotoFunction, -1, sizeof GotoFunction); int states = 1; for (int i = 0; i < words.size(); ++i){ const string &keyword = words[i]; int currentState = 0; for (int j = 0; j < keyword.size(); ++j){ int c = keyword[j] - lowestChar; if (GotoFunction[currentState][c] == -1){ GotoFunction[currentState][c] = states++; } currentState = GotoFunction[currentState][c]; } OccurenceOfWords[currentState] |= (1 << i); } for (int c = 0; c < MaxChars; ++c){ if (GotoFunction[0][c] == -1){ GotoFunction[0][c] = 0; } } queue<int> q; for (int c = 0; c <= highestChar - lowestChar; ++c){ if (GotoFunction[0][c] != -1 && GotoFunction[0][c] != 0){ FF[GotoFunction[0][c]] = 0; q.push(GotoFunction[0][c]); } } while (q.size()){ int state = q.front(); q.pop(); for (int c = 0; c <= highestChar - lowestChar; ++c){ if (GotoFunction[state][c] != -1){ int failure = FF[state]; while (GotoFunction[failure][c] == -1){ failure = FF[failure]; } failure = GotoFunction[failure][c]; FF[GotoFunction[state][c]] = failure; OccurenceOfWords[GotoFunction[state][c]] |= OccurenceOfWords[failure]; q.push(GotoFunction[state][c]); } } } return states; } int FindNextState(int currentState, char nextInput, char lowestChar = 'a'){ int answer = currentState; int c = nextInput - lowestChar; while (GotoFunction[answer][c] == -1){ answer = FF[answer]; } return GotoFunction[answer][c]; } vector<int> FindWordCount(string str, vector<string> keywords, char lowestChar = 'a', char highestChar = 'z') { BuildMatchingMachine(keywords, lowestChar, highestChar); int currentState = 0; vector<int> retVal; for 
(int i = 0; i < str.size(); ++i){ currentState = FindNextState(currentState, str[i], lowestChar); if (OccurenceOfWords[currentState] == 0) continue; for (int j = 0; j < keywords.size(); ++j){ if (OccurenceOfWords[currentState] & (1 << j)){ retVal.insert(retVal.begin(), i - keywords[j].size() + 1); } } } return retVal; } int main(){ vector<string> keywords; keywords.push_back("All"); keywords.push_back("she"); keywords.push_back("is"); string str = "Allisheall"; cout<<"The occurrences of all words in the string ' "<<str<<" ' are \n"; vector<int> states = FindWordCount(str, keywords); for(int i=0; i < keywords.size(); i++){ cout<<"Word "<<keywords.at(i)<<' '; cout<<"starts at "<<states.at(i)+1<<' '; cout<<"And ends at "<<states.at(i)+keywords.at(i).size()+1<<endl; } }
输出结果
The occurrences of all words in the string ' Allisheall ' are Word All starts at 5 And ends at 8 Word she starts at 4 And ends at 7 Word is starts at 1 And ends at 3