diff --git a/src/bm.py b/src/bm.py
index 7763a58..05e2265 100644
--- a/src/bm.py
+++ b/src/bm.py
@@ -2,8 +2,32 @@ from typing import Dict
 
 
 def make_km_table(pattern: str) -> Dict[str, int]:
+    """
+    Build the slide-width lookup table for a pattern string.
+
+    Iterates over the pattern and creates a dictionary where the key is a
+    character and the value is the distance from the end of the pattern to
+    the *last* occurrence of that character, ignoring the final character.
+
+    Example:
+        pattern = "ABCDAB"
+                   543210  <- distance from end of pattern
+        table = {
+            "A": 1,  # The second 'A'
+            "B": 4,  # The first 'B'; the final 'B' is ignored
+            "C": 3,
+            "D": 2,
+        }
+    """
     PATTERN_LENGTH = len(pattern)
     table = {}
+
+    # Note: Because each step of the search starts by comparing the last
+    # character of the pattern, we can leave it out of the table. This is
+    # especially helpful if the last character also appears elsewhere in the
+    # pattern: by the time the table is consulted that last position has
+    # already been tried, so we want the *next* occurrence of the character
+    # (counting from the right) in the pattern, if it exists.
     for index_from_left, char in enumerate(pattern[:-1]):
         table[char] = PATTERN_LENGTH - index_from_left - 1
     return table
@@ -16,22 +40,51 @@ class Bm(object):
         self.table = make_km_table(pattern)
 
     def decide_slide_width(self, c: str) -> int:
+        """
+        Return how far to "slide" the pattern along the text for the next
+        attempt, given 'c', the text character currently under the head.
+        """
+
+        # Note: The lookup table only has characters from the pattern in it
+        # (excluding its final position). If 'c' is not in the table, the
+        # pattern can be slid down by its entire length.
         try:
             return self.table[c]
         except KeyError:
             return len(self.pattern)
 
     def search(self) -> int:
+        """
+        Return the index of the *first* occurrence of the pattern in the
+        text, or -1 if the pattern is not in the text.
+        """
+
+        # The search examines the text one position at a time; the current
+        # position is the "head" (like a read-head on a disk). It starts at the
+        # last pattern character and moves until it runs past the text's end.
+        # NOTE(review): an empty pattern looks like it raises IndexError below.
         PATTERN_LENGTH = len(self.pattern)
         head = PATTERN_LENGTH - 1
         while head < len(self.text):
+            # When the last character of the pattern matches the current head
+            # position, walk backwards through the text and match each
+            # character one by one until either they all match (return) or one
+            # doesn't match (slide the head down).
             if self.text[head] == self.pattern[-1]:
                 for index_from_right, char in enumerate(reversed(self.pattern)):
                     if self.text[head - index_from_right] != char:
                         head += self.decide_slide_width(self.text[head])
                         break
+                # If the for loop finished without breaking, we found a full
+                # match!
                 else:
+                    # Note: head is aligned with the end of the pattern, but the
+                    # search must return the index where the pattern starts.
                     return head - PATTERN_LENGTH + 1
+            # When the last pattern character *doesn't* match the character at
+            # the head, just slide the head down.
             else:
                 head += self.decide_slide_width(self.text[head])
+
+        # If the main loop doesn't return, there was no match.
         return -1