该算法有助于查找所有给定关键字集的所有匹配项。它是一种字典匹配算法。它使用包含所有关键字的树结构。制作完树后,它将尝试将树转换为自动机以在线性时间内进行搜索。Aho-Corasick算法分为三个不同阶段。
这些是转到,失败和输出。在进入阶段,它将使用所有关键字来制作树。在下一阶段或失败阶段,它将尝试查找向后过渡,以获取某些关键字的适当后缀。在输出阶段,对于自动机的每个状态“ s”,它会找到所有以“ s”状态结尾的单词。
该算法的时间复杂度为:O(N + L + Z),其中N是文本的长度,L是关键字的长度,Z是匹配项的数量。
Input: A set of patterns: {their, there, answer, any, bye} The main string: “isthereanyanswerokgoodbye” Output: Word there location: 2 Word any location: 7 Word answer location: 10 Word bye location: 22
buildTree(patternList,size)
输入-所有模式的列表以及列表的大小
输出-生成过渡图以查找模式
Begin set all elements of output array to 0 set all elements of fail array to -1 set all elements of goto matrix to -1 state := 1 //at first there is only one state. for all patterns ‘i’ in the patternList, do word := patternList[i] present := 0 for all character ‘ch’ of word, do if goto[present, ch] = -1 then goto[present, ch] := state state := state + 1 present:= goto[present, ch] done output[present] := output[present] OR (shift left 1 for i times) done for all type of characters ch, do if goto[0, ch] ≠ 0 then fail[goto[0,ch]] := 0 insert goto[0, ch] into a Queue q. done while q is not empty, do newState := first element of q delete from q. for all possible character ch, do if goto[newState, ch] ≠ -1 then failure := fail[newState] while goto[failure, ch] = -1, do failure := goto[failure, ch] done fail[goto[newState, ch]] = failure output[goto[newState, ch]] :=output[goto[newState,ch]] OR output[failure] insert goto[newState, ch] into q. done done return state End
getNextState(presentState,nextChar)
输入-当前状态和下一个字符以确定下一个状态
输出:下一个状态
Begin answer := presentState ch := nextChar while goto[answer, ch] = -41, do answer := fail[answer] done return goto[answer, ch] End
patternSearch(patternList,size,text)
输入- 模式列表,列表大小和正文
输出-找到模式的文本的索引
Begin call buildTree(patternList, size) presentState := 0 for all indexes of the text, do if output[presentState] = 0 ignore next part and go for next iteration for all patterns in the patternList, do if the pattern found using output array, then print the location where pattern is present done done End
#include <iostream> #include <queue> #define MAXS 500 //sum of the length of all patterns #define MAXC 26 //as 26 letters in alphabet using namespace std; int output[MAXS]; int fail[MAXS]; int gotoMat[MAXS][MAXC]; int buildTree(string array[], int size) { for(int i = 0; i<MAXS; i++) output[i] = 0; //all element of output is 0 for(int i = 0; i<MAXS; i++) fail[i] = -1; //all element of failure array is -1 for(int i = 0; i<MAXS; i++) for(int j = 0; j<MAXC; j++) gotoMat[i][j] = -1; //all element of goto matrix is -1 int state = 1; //there is only one state for (int i = 0; i < size; i++) { //make trie structure for all pattern in array //const string &word = array[i]; string word = array[i]; int presentState = 0; for (int j = 0; j < word.size(); ++j) { //all pattern is added int ch = word[j] - 'a'; if (gotoMat[presentState][ch] == -1) //ic ch is not present make new node gotoMat[presentState][ch] = state++; //increase state presentState = gotoMat[presentState][ch]; } output[presentState] |= (1 << i); //current word added in the output } for (int ch = 0; ch < MAXC; ++ch) //if ch is not directly connected to root if (gotoMat[0][ch] == -1) gotoMat[0][ch] = 0; queue<int> q; for (int ch = 0; ch < MAXC; ++ch) { //node goes to previous state when fails if (gotoMat[0][ch] != 0) { fail[gotoMat[0][ch]] = 0; q.push(gotoMat[0][ch]); } } while (q.size()) { int state = q.front(); //remove front node q.pop(); for (int ch = 0; ch <= MAXC; ++ch) { if (gotoMat[state][ch] != -1) { //if goto state is present int failure = fail[state]; while (gotoMat[failure][ch] == -1) //find deepest node with proper suffix failure = fail[failure]; failure = gotoMat[failure][ch]; fail[gotoMat[state][ch]] = failure; output[gotoMat[state][ch]] |= output[failure]; // Merge output values q.push(gotoMat[state][ch]); //add next level node to the queue } } } return state; } int getNextState(int presentState, char nextChar) { int answer = presentState; int ch = nextChar - 'a'; //subtract ascii of 'a' while (gotoMat[answer][ch] == -1) //if go to is not found, use fail function answer = fail[answer]; return gotoMat[answer][ch]; } void patternSearch(string arr[], int size, string text) { buildTree(arr, size); //make the trie structure int presentState = 0; //make current state as 0 for (int i = 0; i < text.size(); i++) { //find all occurances of pattern presentState = getNextState(presentState, text[i]); if (output[presentState] == 0) //if no match continue; for (int j = 0; j < size; ++j) { //matching found and print words if (output[presentState] & (1 << j)) { cout << "Word " << arr[j] << " location: " << i - arr[j].size() + 1 << endl; } } } } int main() { string arr[] = {"their", "there", "answer", "any", "bye"}; string text = "isthereanyanswerokgoodbye"; int k = sizeof(arr)/sizeof(arr[0]); patternSearch(arr, k, text); return 0; }
输出结果
Word there location: 2 Word any location: 7 Word answer location: 10 Word bye location: 22