forked from wangzheng0822/algo
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request wangzheng0822#185 from jerryderry/bm-python
Boyer-Moore string-search algorithm in python
- Loading branch information
Showing
1 changed file
with
77 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
""" | ||
Boyer-Moore string-search algorithm.
Author: Wenru Dong
""" | ||
|
||
from typing import List, Tuple | ||
|
||
# Base length of the bad-character table, which is indexed by character
# code (ord(char)).
SIZE = 256
|
||
def _generate_bad_character_table(pattern: str) -> List[int]:
    """Build the bad-character table for ``pattern``.

    The table is indexed by character code (``ord(char)``). Each entry
    holds the index of the last occurrence of that character in
    ``pattern``, or -1 if the character does not occur in it.

    The table has at least ``SIZE`` entries. It grows when ``pattern``
    contains a character whose code is ``SIZE`` or larger, so that such
    patterns no longer raise ``IndexError``. For patterns whose
    characters all have codes below ``SIZE`` the result is unchanged.

    Args:
        pattern: The string to search for.

    Returns:
        A list mapping character code to last index in ``pattern``.
    """
    codes = [ord(char) for char in pattern]
    # max(..., default=-1) keeps the empty pattern at exactly SIZE entries.
    bc = [-1] * max(SIZE, max(codes, default=-1) + 1)
    # Later occurrences overwrite earlier ones, leaving the last index.
    for i, code in enumerate(codes):
        bc[code] = i
    return bc
|
||
|
||
def _generate_good_suffix_table(pattern: str) -> Tuple[List[bool], List[int]]: | ||
m = len(pattern) | ||
# prefix[k] records whether the last k-character suffix of pattern | ||
# can match with the first k-character prefix of pattern. | ||
# suffix[k] records the starting index of the last substring of | ||
# pattern that can match with the last k-character suffix of pattern. | ||
prefix, suffix = [False] * m, [-1] * m | ||
# For each substring patter[:i+1], we find the common suffix with | ||
# pattern, and the starting index of this common suffix. | ||
# This way we can re-write previous suffix[k] to record the index | ||
# as large as possible, hence the last substring. | ||
for i in range(m - 1): | ||
j = i # starting index of the common suffix | ||
k = 0 # length of the common suffix | ||
while j >= 0 and pattern[j] == pattern[~k]: | ||
j -= 1 | ||
k += 1 | ||
suffix[k] = j + 1 | ||
if j == -1: prefix[k] = True | ||
return (prefix, suffix) | ||
|
||
|
||
def _move_by_good_suffix(bad_character_index: int, suffix: List[int], prefix: List[bool]) -> int: | ||
k = len(suffix) - 1 - bad_character_index | ||
if suffix[k] != -1: return bad_character_index - suffix[k] + 1 | ||
# Test from k - 1 | ||
for r, can_match_prefix in enumerate(reversed(prefix[:k]), bad_character_index + 2): | ||
if can_match_prefix: return r | ||
return len(suffix) | ||
|
||
|
||
def bm(s: str, pattern: str) -> int:
    """Search ``s`` for ``pattern`` using the Boyer-Moore algorithm.

    The pattern is compared against the text right-to-left at each
    alignment. On a mismatch the pattern is shifted by the larger of the
    bad-character and good-suffix suggestions.

    Args:
        s: The text to search in.
        pattern: The string to search for.

    Returns:
        The index in ``s`` of the first alignment, scanning left to
        right, at which ``pattern`` matches, or -1 if there is none.
    """
    bc = _generate_bad_character_table(pattern)
    prefix, suffix = _generate_good_suffix_table(pattern)
    n, m = len(s), len(pattern)
    table_size = len(bc)
    i = 0
    while i <= n - m:
        j = m - 1  # bad character index in pattern
        while j >= 0 and s[i + j] == pattern[j]:
            j -= 1
        if j < 0:
            return i

        # A text character whose code lies beyond the table cannot be in
        # the pattern, so treat it as "not present" (-1) instead of
        # letting the lookup raise IndexError.
        code = ord(s[i + j])
        last_seen = bc[code] if code < table_size else -1
        x = j - last_seen
        y = 0
        # The good-suffix rule only applies when at least one character matched.
        if j < m - 1:
            y = _move_by_good_suffix(j, suffix, prefix)
        i += max(x, y)
    return -1
|
||
|
||
if __name__ == "__main__":
    # Demo 1: print the index at which the pattern is found.
    s, pattern = "Here is a simple example", "example"
    print(bm(s, pattern))

    # Demo 2: print whether the result agrees with the built-in str.find.
    s, pattern = "abcdcccdc", "cccd"
    print(bm(s, pattern) == s.find(pattern))