Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

后缀自动机 #18

Open
RubyLouvre opened this issue Sep 11, 2019 · 5 comments
Open

后缀自动机 #18

RubyLouvre opened this issue Sep 11, 2019 · 5 comments

Comments

@RubyLouvre
Copy link
Owner

function Node(key) {
            this.children = {};
            this.len = 0; //最长子串的长度(该节点的子串数量=this.len - this.fail.len)
            this.isEnd = false;
            this.num = 0; // 该状态子串的数量
            this.key = key;
            this.count = 0; //被后缀链接的个数,方便求节点字符串的个数
            this.fail = null;
        }

        class SuffixAutomaton {
            constructor() {
                var node = (this.root = this.last = new Node(""));
                this.nodes = [node];
            }
            extend(word) {
                for (var i = 0; i < word.length; i++) {
                    this.insert(word[i]);
                }
            }
            insert(c, n) {
                var p = this.last;
                var node = new Node(c);
                node.len = p.len + 1;
                node.num = 1;
                this.last = node;
                this.nodes.push(node);
                while (p && !p.children[c]) {
                    //如果p 没有一条 c 的出边
                    p.children[c] = node; //为a为一个c
                    p = p.fail;
                }
                if (!p) {
                    node.fail = this.root;
                    this.root.count++;
                } else {
                    var q = p.children[c];
                    if (p.len + 1 == q.len) {
                        node.fail = q;
                        q.count++;
                        q.isEnd = true; //主轴上冲突
                    } else {
                        var clone = new Node(c); //克隆节点
                        this.nodes.push(clone);
                        clone.children = q.children;
                        clone.fail = q.fail;
                        clone.len = p.len + 1;
                        clone.isEnd = true;
                        while (p && p.children[c] == q) {
                            p.children[c] = clone; //将树中所有q相关的链接替换成clone
                            p = p.fail;
                        }
                        clone.count += 2;
                        q.fail = node.fail = clone;
                    }
                }
            }
        }
@RubyLouvre
Copy link
Owner Author

求多个字符串的最长公共子串(方法1)

function getLCS(strs) {
    var s = strs.shift();
    var sam = new SuffixAutomaton();
    sam.extend(s);
    var nodes = sam.nodes;
    for (var i = 1; i < nodes.length; i++) {
        var node = nodes[i]
        node.nlcs = node.len; //表示多个串的最小值
        node.lcs = 0 //表示当前匹配的最大值
    }
    //根据节点的len属性进行计数排序
    var count = [];
    for (var i = 0; i <= s.length; i++) {
        count[i] = 0;
    }
    for (var i = 0; i < nodes.length; i++) {
        count[nodes[i].len]++;
    }
    for (var i = 1; i <= s.length; i++) {
        count[i] += count[i - 1];//求前缀和
    }
    var sa = [];
    for (var i = 0; i < nodes.length; i++) {
        var index = --count[nodes[i].len]
        sa[index] = nodes[i];
    }
    function compute(str) {
        var f = []
        var i = 0, len = 0, n = str.length, node = sam.root
        while (i < n) {
            var c = str[i];
            if (node.children[c]) {
                len++;
                node = node.children[c]
            } else {
                while (node && !node.children[c]) {
                    node = node.fail
                }
                if (!node) {
                    len = 0
                    node = sam.root;
                } else {
                    len = node.len + 1;
                    node = node.children[c]
                }
            }
            i++
            node.lcs = Math.max(node.lcs, len)
        }
        for (var i = sa.length - 1; i >= 0; i--) {
            var node = sa[i];
            node.nlcs = Math.min(node.nlcs, node.lcs)
            if (node.fail && node.fail.lcs < node.lcs) {
                node.fail.lcs = node.lcs
            }
            node.lcs = 0
        }

    }
    var ans = 0;
    while (str = strs.shift()) {
        compute(str)
    }
    for (var i = 1; i < nodes.length; i++) {
        ans = Math.max(ans, nodes[i].nlcs)
    }
    console.log(count, sa);//only test
    return ans
}
console.log(getLCS(["uabcdefo", "dabcdui", "eabc87eew"])); //abc

@RubyLouvre
Copy link
Owner Author

RubyLouvre commented Sep 12, 2019

求K小子串

 function findK(str, k, op) {
            //op为0则表示不同位置的相同子串算作一个(不重复子串)。po=1则表示不同位置的相同子串算作多个(重复子串)
            var sam = new SuffixAutomaton();
            sam.extend(str);
            var nodes = sam.nodes;
            var total = nodes.length;
            var indegree = new Array(total).fill(0);
            var rank = [];
            /*
            我们令cnt[i]为i这个状态在原串中出现的次数。再在每个状态维护一个size。
            若T=0,siz[i[siz[i[表示ii的所有儿子的size之和+1(子树大小),
            否则siz[i[siz[i[表示ii的所有儿子的sizsiz之和+cnt[i](类似子树大小的东西)。
            接下来我们用size像平衡树一样查询就好了。 
————————————————*/

            for (var i = 0; i < total; i++) {
                nodes[i].endposCount = 0;
                nodes[i].sum = 0;
            }
            if (op) {
                var p = nodes[0];
                for (var i = 0; i < str.length; i++) {
                    p = p.children[str[i]];
                    p.endposCount++;
                }
            } else {
                for (var i = 0; i < total; i++) {
                    nodes[i].endposCount = 1;
                }
            }

            //计数排序
            for (var i = 0; i < total; i++) {
                indegree[nodes[i].len]++;// 统计相同度数的节点的个数
            }
            for (var i = 1; i < total; i++) {
                indegree[i] += indegree[i - 1];// 统计度数小于等于 i 的节点的总数
            }

            for (var i = 0; i < total; i++) {
                rank[--indegree[nodes[i].len]] = nodes[i];// 为每个节点编号,节点度数越大编号越靠后
            }

            if (op) {
                for (var i = total - 1; i >= 0; i--) {
                    if (rank[i].fail) {
                        rank[i].fail.endposCount += rank[i].endposCount;
                    }
                }
            }

            console.log(rank, "rank");
            // console.log(pos)
            for (var i = total - 1; i >= 0; i--) {
                var node = rank[i];
                node.sum += node.endposCount;
                for (var c in node.children) {
                    node.sum += node.children[c].sum;
                }
            }
            var res = 0;
            var root = nodes[0];
            for (var c in root.children) {
                res += root.children[c].sum;
            }
            if (res < k) {
                console.log(-1, nodes);
                return -1;
            }
            var ok = false,
                ans = [];
            function dfs(node, k) {
                if (ok) {
                    return;
                }
                if (k <= 0) {
                    ok = 1;
                    return;
                }
                for (var c in node.children) {
                    var child = node.children[c];
                    if (ok) {
                        break;
                    }
                    if (k > child.sum) {
                        k -= child.sum;
                    } else {
                        k -= child.endposCount;
                        ans.push(c);
                        dfs(child, k);
                    }
                }
            }
            dfs(root, k);
            if (ok) {
                return ans.join("");
            } else {
                return -1;
            }
        }
        // https://blog.csdn.net/blankcqk/article/details/47337959
        // https://www.cnblogs.com/shuaihui520/p/10695207.html
        console.log(findK("aaa", 2, 0));
        console.log(findK("aabc", 3, 0));

@RubyLouvre
Copy link
Owner Author

@RubyLouvre
Copy link
Owner Author

RubyLouvre commented Sep 14, 2019

function findK(str, k, op) {
            //op为0则表示不同位置的相同子串算作一个(不重复子串)。po=1则表示不同位置的相同子串算作多个(重复子串)
            var sam = new SuffixAutomaton();
            sam.extend(str);
            var nodes = sam.nodes;
            var total = nodes.length;


            //计数排序
            var indegree = new Array(total).fill(0);
            var rank = [];
            for (var i = 0; i < total; i++) {
                nodes[i].index = i;
                indegree[nodes[i].len]++;// 统计相同度数的节点的个数
            }
            for (var i = 1; i < total; i++) {
                indegree[i] += indegree[i - 1];// 统计度数小于等于 i 的节点的总数
            }

            for (var i = 0; i < total; i++) {
                rank[--indegree[nodes[i].len]] = nodes[i]// 为每个节点编号,节点度数越大编号越靠后
            }

            var endpos = new Array(total).fill(1), sum = [0]
            for (var i = total - 1; i >= 1; i--) {
                var node = rank[i]
                if (op === 1) {
                    endpos[node.fail.index] += endpos[node.index]
                }
            }
            endpos[0] = 0;
            for (var i = total - 1; i >= 1; i--) {
                var node = rank[i];
                var index = node.index
                sum[index] = endpos[index]
                for (var c in node.children) {
                    sum[index] += sum[node.children[c].index];
                }
            }



            var ok = false,
                ans = [];
            function dfs(node, k) {
                if (k <= 0) {
                    ok = true
                    return
                }
                for (var c in node.children) {
                    var child = node.children[c];
                    if (ok) {
                        break
                    }
                    if (k > sum[child.index]) {
                        k -= sum[child.index]
                    } else {
                        k -= endpos[child.index];
                        ans.push(c);
                        dfs(child, k);
                    }
                }
            }
            dfs(sam.root, k);
            return ok ? ans.join("") : -1
        }
        // https://blog.csdn.net/blankcqk/article/details/47337959
        // https://www.cnblogs.com/shuaihui520/p/10695207.html
        console.log(findK("aaa", 2, 0));
        console.log(findK("aabc", 3, 0));

@RubyLouvre
Copy link
Owner Author

RubyLouvre commented Sep 18, 2019

统计一个字符串P在另一个字符串T中出现了多少 次

如果在创建后缀自动机时,为狀态添加一个count属性, 默认为1, 当它添加一个失配指针(不为root)加1 或 对它的克隆对象 的count+1 做了改动,可以简化成这样

  function Node(key) {
            this.children = {};
            this.len = 0; //最长子串的长度(该节点的子串数量=this.len - this.fail.len)
            this.isEnd = false;
            this.key = key;
            this.count = 0
            this.fail = null;
        }

        class SuffixAutomaton {
            constructor() {
                var node = (this.root = this.last = new Node(""));
                this.nodes = [node];
            }
            extend(word) {
                for (var i = 0; i < word.length; i++) {
                    this.insert(word[i]);
                }
            }
            insert(c, n) {
                var p = this.last;
                var node = new Node(c);
                node.len = p.len + 1;
                this.last = node;
                this.nodes.push(node);
                node.count++
                while (p && !p.children[c]) {
                    //如果p 没有一条 c 的出边
                    p.children[c] = node; //为a为一个c
                    p = p.fail;
                }

                if (!p) {
                    node.fail = this.root;
                } else {
                    var q = p.children[c];
                    if (p.len + 1 == q.len) {
                        node.fail = q;
                        q.count++
                        q.isEnd = true; //主轴上冲突
                    } else {
                        var clone = new Node(c); //克隆节点
                        clone.count = q.count + 1
                        this.nodes.push(clone);
                        clone.children = q.children;
                        clone.fail = q.fail;
                        clone.len = p.len + 1;
                        clone.isEnd = true;
                        while (p && p.children[c] == q) {
                            p.children[c] = clone; //将树中所有q相关的链接替换成clone
                            p = p.fail;
                        }
                        q.fail = node.fail = clone;
                    }
                }
            }
        }
        function timeOf(t, p) {
            var sam = new SuffixAutomaton()
            sam.extend(t);
            var node = sam.root, time = Infinity;
            for (var i = 0; i < p.length; i++) {
                var c = p[i];
                var node = node.children[c];
                if (node) {
                    time = Math.min(time, node.count)
                } else {
                    time = 0;
                }
            }
            var res = time === Infinity ? 0 : time
            console.log(`${p}${t}中出现了${res}次`)
            return res;
        }

        console.log(timeOf('abcdabcabasab', 'abc'))
        console.log(timeOf('abcdabcabasab', 'ab'))
        console.log(timeOf('abcdabcabasab', 'da'))
每个狀态对应的字符肯定会出现一次,当出现失配时,它会重新指向第一次匹配的同字符的狀态上,因此我们把这个数字加1。圆圈中的数字就是字符出现的次数了。但由于我们并没有在构建自动机时处理出现次数,我们需要曲线求国,通过拓朴排序来求。
```javascript
function topoSort(nodes) {
    var total = nodes.length;
    var indegree = new Array(total).fill(0);
    //统计每个字符出现的次数
    var counts = new Array(total).fill(1);
    var rank = [];
    for (var i = 0; i < total; i++) {
        nodes[i].index = i;//index用于关联counts数组
        indegree[nodes[i].len]++;// 将endpos相同的节点放在同一个桶中
    }
    for (var i = 1; i < total; i++) {
        indegree[i] += indegree[i - 1];// 统计endpos小于等于 i 的节点的总数
    }
    for (var i = 1; i < total; i++) {
        rank[--indegree[nodes[i].len]] = nodes[i]// 为每个节点编号,节点度数越大编号越靠后
    }

    for (var i = total - 1; i >= 1; i--) {
        var node = rank[i] //求得每个字符的出现次数
        counts[node.fail.index] += counts[node.index]
    }
    return counts
}
function timeOf(t, p) {
    var sam = new SuffixAutomaton()
    sam.extend(t);

    var counts = topoSort(sam.nodes)
    var node = sam.root, time = Infinity;
    for (var i = 0; i < p.length; i++) {
        var c = p[i];
        var node = node.children[c];
        if (node) {
            time = Math.min(time, counts[node.index])
        }
    }
    var res = time === Infinity ? 0 : time
    console.log(`${p}在${t}中出现了${res}`); // only test
    return res;
}

console.log(timeOf('abcdababc', 'abc'))
console.log(timeOf('abcdababc', 'ab'))
console.log(timeOf('abcdababc', 'da'))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant