Merge pull request wangzheng0822#185 from jerryderry/bm-python

Boyer-Moore string-search algorithm in python
quinnchenk · Dec 12, 2018 · d869727 · d869727
2 parents fb0c353 + 68b5c02
commit d869727
Showing 1 changed file with 77 additions and 0 deletions.
diff --git a/python/33_bm/bm.py b/python/33_bm/bm.py
@@ -0,0 +1,77 @@
+"""
+    Boyer-Moore string-search algorithm.
+
+    Author: Wenru Dong
+"""
+
+from typing import List, Tuple
+
+SIZE = 256
+
+def _generate_bad_character_table(pattern: str) -> List[int]:
+    """Build the bad-character lookup table for ``pattern``.
+
+    Args:
+        pattern: The string being searched for.
+
+    Returns:
+        A list indexed by code point (``ord(char)``). Each entry holds the
+        index of the last occurrence of that character in ``pattern``, or
+        -1 when the character does not occur. The list always has at least
+        ``SIZE`` entries and is enlarged when ``pattern`` contains a code
+        point >= ``SIZE``.
+    """
+    # Size the table to the largest code point actually present; a fixed
+    # SIZE-entry table raised IndexError for any character with ord >= SIZE.
+    table_size = max(SIZE, max(map(ord, pattern), default=-1) + 1)
+    bc = [-1] * table_size
+    for i, char in enumerate(pattern):
+        # Later occurrences overwrite earlier ones, leaving the rightmost index.
+        bc[ord(char)] = i
+    return bc
+
+
+def _generate_good_suffix_table(pattern: str) -> Tuple[List[bool], List[int]]:
+    """Precompute the two lookup tables used by the good-suffix rule.
+
+    Args:
+        pattern: The string being searched for.
+
+    Returns:
+        A ``(prefix, suffix)`` tuple of lists, each of length ``len(pattern)``:
+
+        * ``prefix[k]`` is True when the first k characters of ``pattern``
+          equal its last k characters.
+        * ``suffix[k]`` is the largest starting index of a substring, ending
+          before the final character of ``pattern``, that equals the last k
+          characters of ``pattern``; -1 when there is none.
+    """
+    length = len(pattern)
+    is_prefix = [False] * length
+    start_of = [-1] * length
+    # For every proper prefix pattern[:end + 1], compare it with the whole
+    # pattern from the right. Visiting ``end`` in increasing order lets later
+    # (further right) matches overwrite earlier ones, so each entry ends up
+    # holding the largest possible starting index.
+    for end in range(length - 1):
+        matched = 0  # length of the common suffix found so far
+        while (
+            matched <= end
+            and pattern[end - matched] == pattern[length - 1 - matched]
+        ):
+            matched += 1
+            start_of[matched] = end - matched + 1
+        # The common suffix consumed all of pattern[:end + 1], so that prefix
+        # of the pattern is also a suffix of the pattern.
+        if matched == end + 1:
+            is_prefix[matched] = True
+    return is_prefix, start_of
+
+
+def _move_by_good_suffix(bad_character_index: int, suffix: List[int], prefix: List[bool]) -> int:
+    """Return how far to shift the pattern according to the good-suffix rule.
+
+    Args:
+        bad_character_index: Index in the pattern where the mismatch occurred.
+        suffix: Table where ``suffix[k]`` is the starting index of an earlier
+            occurrence of the pattern's last k characters, or -1.
+        prefix: Table where ``prefix[k]`` tells whether the pattern's last k
+            characters also form its first k characters.
+
+    Returns:
+        The shift distance; the full pattern length when neither table
+        offers a usable alignment.
+    """
+    pattern_length = len(suffix)
+    # Number of characters that matched to the right of the mismatch.
+    good_length = pattern_length - 1 - bad_character_index
+
+    earlier_start = suffix[good_length]
+    if earlier_start != -1:
+        # The whole good suffix reappears earlier; align that occurrence.
+        return bad_character_index - earlier_start + 1
+
+    # Otherwise try ever shorter tails of the good suffix (good_length - 1
+    # down to 0) and take the first one that is also a prefix of the pattern.
+    shifts = (
+        shift
+        for shift, fits in enumerate(
+            reversed(prefix[:good_length]), bad_character_index + 2
+        )
+        if fits
+    )
+    return next(shifts, pattern_length)
+
+
+def bm(s: str, pattern: str) -> int:
+    """Find ``pattern`` in ``s`` with the Boyer-Moore algorithm.
+
+    Args:
+        s: The text to search in.
+        pattern: The string to search for.
+
+    Returns:
+        The index of the first occurrence of ``pattern`` in ``s``, or -1 when
+        there is none. An empty ``pattern`` matches at index 0.
+    """
+    bc = _generate_bad_character_table(pattern)
+    prefix, suffix = _generate_good_suffix_table(pattern)
+    n, m = len(s), len(pattern)
+    table_size = len(bc)
+    i = 0  # current alignment of pattern[0] within s
+    while i <= n - m:
+        # Compare right to left; j stops at the mismatching pattern index.
+        j = m - 1
+        while j >= 0 and s[i + j] == pattern[j]:
+            j -= 1
+        if j < 0:
+            return i
+
+        # Bad-character rule. A text character whose code point lies beyond
+        # the table cannot occur in the pattern, so treat it as "not found"
+        # (-1) instead of indexing past the end of the table.
+        code = ord(s[i + j])
+        last_seen = bc[code] if code < table_size else -1
+        x = j - last_seen
+
+        # Good-suffix rule applies only when at least one character matched.
+        y = _move_by_good_suffix(j, suffix, prefix) if j < m - 1 else 0
+
+        # x may be non-positive; taking the larger shift keeps i advancing.
+        i += max(x, y)
+    return -1
+
+
+if __name__ == "__main__":
+    # Demo 1: print the index at which the pattern is found in the text.
+    s, pattern = "Here is a simple example", "example"
+    print(bm(s, pattern))
+
+    # Demo 2: print whether bm agrees with the built-in str.find.
+    s, pattern = "abcdcccdc", "cccd"
+    print(s.find(pattern) == bm(s, pattern))