from typing import Dict


def make_km_table(pattern: str) -> Dict[str, int]:
    """
    Build the shift lookup table for a pattern string.

    The key is a character of the pattern and the value is the distance from
    the end of the pattern to the *last* occurrence of that character,
    ignoring the pattern's final character.

    Example:
        pattern = "ABCDAB"
                   543210  <- distance from end of pattern
        table = {
            "A": 1,  # The second 'A'
            "B": 4,  # The first 'B', because the other 'B' is the end of
                     # the pattern
            "C": 3,
            "D": 2,
        }

    An empty or single-character pattern yields an empty table.
    """
    pattern_length = len(pattern)
    # The search always compares the pattern's last character first, so that
    # character is left out of the table (pattern[:-1]). If it also appears
    # earlier in the pattern, the table then points at that *earlier*
    # occurrence, which is the next alignment worth trying.
    # Later occurrences overwrite earlier ones, so each character ends up
    # with its smallest distance.
    return {
        char: pattern_length - index_from_left - 1
        for index_from_left, char in enumerate(pattern[:-1])
    }


class Bm:
    """
    Search for a pattern in a text using a shift table built from the
    pattern (see make_km_table).
    """

    def __init__(self, text: str, pattern: str):
        """Store the text and pattern and precompute the shift table."""
        self.text = text
        self.pattern = pattern
        self.table = make_km_table(pattern)

    def decide_slide_width(self, c: str) -> int:
        """
        Return how far to "slide" the pattern for the next comparison when
        the text character 'c' sits under the end of the pattern.
        """
        # The lookup table only has characters from the pattern in it. If 'c'
        # is not in the table, no alignment can use it, so skip down the
        # entire length of the pattern.
        return self.table.get(c, len(self.pattern))

    def search(self) -> int:
        """
        Return the index of the *first* occurrence of the pattern in the
        text, or '-1' if the pattern is not in the text.

        An empty pattern matches at index 0, mirroring str.find.
        """
        pattern = self.pattern
        pattern_length = len(pattern)

        # Guard: an empty pattern has no last character to compare against;
        # without this check the indexing below would raise IndexError.
        if pattern_length == 0:
            return 0

        text = self.text
        text_length = len(text)
        last_char = pattern[-1]

        # The "head" (like a read-head on a disk) is the text position
        # aligned with the last character of the pattern. It starts at the
        # end of the first possible alignment and moves right until it runs
        # off the end of the text.
        head = pattern_length - 1
        while head < text_length:
            window_start = head - pattern_length + 1
            # Cheap check on the last character first; only when it matches
            # is the whole window compared against the pattern.
            if text[head] == last_char and text.startswith(pattern, window_start):
                # head is at the end of the pattern, but the search returns
                # the index of the start of the pattern.
                return window_start
            # Either the last character or some earlier character did not
            # match: slide based on the text character under the head.
            head += self.decide_slide_width(text[head])

        # If the main loop doesn't return, there was no match
        return -1