|
@ -2,8 +2,32 @@ from typing import Dict |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_km_table(pattern: str) -> Dict[str, int]:
    """
    Build the skip table used by the search for a given pattern.

    Every character of the pattern except the final one is mapped to the
    distance between its *last* occurrence and the end of the pattern.

    Example:
        pattern = "ABCDAB"
                   543210  <- distance from end of pattern
        table = {
            "A": 1,  # The second 'A'
            "B": 4,  # The first 'B'; the other 'B' is the final character
            "C": 3,
            "D": 2,
        }

    An empty or single-character pattern yields an empty table.
    """
    last_index = len(pattern) - 1

    # The final character is left out on purpose: the search always compares
    # it first, so by the time the table is consulted it has already been
    # tried. If it also appears earlier in the pattern, that earlier
    # occurrence is the one worth aligning with next.
    #
    # A repeated character is written more than once; the later (rightmost)
    # occurrence overwrites the earlier one, leaving the smallest distance.
    return {
        char: last_index - position
        for position, char in enumerate(pattern[:-1])
    }
|
@ -16,22 +40,51 @@ class Bm(object): |
|
|
self.table = make_km_table(pattern) |
|
|
self.table = make_km_table(pattern) |
|
|
|
|
|
|
|
|
def decide_slide_width(self, c: str) -> int:
    """
    Return how far to "slide" the pattern forward after the character 'c'
    failed to match during the search.
    """
    # Only characters that occur in the pattern have an entry in the lookup
    # table. Any other character cannot be part of a match, so the pattern
    # is moved past it by its entire length.
    return self.table.get(c, len(self.pattern))
|
|
|
|
|
|
|
|
def search(self) -> int:
    """
    Return the index of the *first* occurrence of the pattern in the
    text, or '-1' if the pattern is not in the text.

    An empty pattern is considered to match at index 0 (the same answer
    'str.find' gives) instead of raising an IndexError.
    """
    pattern = self.pattern
    text = self.text
    pattern_length = len(pattern)

    # Guard: with no characters to compare there is no "last character" to
    # index, and an empty pattern trivially occurs at the very start.
    if pattern_length == 0:
        return 0

    # The search works on one character at a time in the text. The current
    # location is the "head" (like a read-head on a disk). It starts where
    # the last character of the pattern lines up with the text, and is moved
    # forward until it runs off the end of the text.
    text_length = len(text)
    head = pattern_length - 1
    while head < text_length:
        # Walk backwards from the head, comparing the text against the
        # pattern right-to-left. The first comparison is the pattern's last
        # character, so a mismatch there slides the head immediately.
        for index_from_right in range(pattern_length):
            if text[head - index_from_right] != pattern[-1 - index_from_right]:
                # The slide is always chosen from the text character under
                # the head, wherever the mismatch happened. It is at least
                # 1 for a non-empty pattern, so the loop always advances.
                head += self.decide_slide_width(text[head])
                break
        else:
            # The for loop finished without breaking: a full match.
            # Note: head is at the end of the match, but the search should
            # return the index of the start of the pattern.
            return head - pattern_length + 1

    # If the main loop doesn't return, there was no match
    return -1