Browse Source

Bm: Comments

master
Nathan Bergey 2 years ago
parent
commit
9b59a4be49
  1. 53
      src/bm.py

53
src/bm.py

@ -2,8 +2,32 @@ from typing import Dict
def make_km_table(pattern: str) -> Dict[str, int]:
    """
    Build the shift lookup table for a pattern string.

    Iterates over the pattern and creates a dictionary where the key is a
    search character and the value is the distance from the end of the
    pattern to the *last* occurrence of that character in the pattern. The
    final character of the pattern is ignored.

    Example:

        pattern = "ABCDAB"
                   543210  <- distance from end of pattern

        table = {
            "A": 1,  # The second 'A'
            "B": 4,  # The first 'B', because the other 'B' is the end of the string
            "C": 3,
            "D": 2,
        }

    An empty or single-character pattern produces an empty table.
    """
    pattern_length = len(pattern)
    table: Dict[str, int] = {}

    # Note: Because the search always starts each comparison at the last
    # character of the pattern, that character is left out of the table. This
    # matters when the last character also appears elsewhere in the pattern:
    # by the time the table is consulted the last character has already been
    # tried, so the shift must line up the *next* occurrence of that
    # character in the pattern, if it exists.
    for index_from_left, char in enumerate(pattern[:-1]):
        # A later (further right) occurrence overwrites an earlier one, so
        # each key ends up holding the smallest distance from the end.
        table[char] = pattern_length - index_from_left - 1

    return table
@ -16,22 +40,51 @@ class Bm(object):
self.table = make_km_table(pattern) self.table = make_km_table(pattern)
def decide_slide_width(self, c: str) -> int:
    """
    Return how far to "slide" the pattern along the text for the next
    comparison, given the text character 'c' at the current head position.

    The distance comes from the lookup table stored in 'self.table'.
    """
    # Note: The lookup table only has characters from the pattern in it.
    # If the test character 'c' is not in the table, it cannot be lined up
    # with anything, so skip down the entire length of the pattern.
    return self.table.get(c, len(self.pattern))
def search(self) -> int:
    """
    Return the index of the *first* occurrence of the pattern in the
    text, or '-1' if the pattern is not in the text.

    An empty pattern matches at index 0 (the same answer 'str.find'
    gives) instead of raising 'IndexError'.
    """
    text = self.text
    pattern = self.pattern
    pattern_length = len(pattern)
    text_length = len(text)

    # An empty pattern has no last character to compare against, so handle
    # it up front rather than indexing into an empty string.
    if pattern_length == 0:
        return 0

    # The search works on one character at a time in the text. The current
    # location is the "head" (like a read-head on a disk). It starts lined
    # up with the last character of the pattern and gets moved in the loop
    # until we reach the end of the text.
    head = pattern_length - 1
    while head < text_length:
        # Walk backwards from the head, matching the pattern from its last
        # character to its first, until either they all match (return) or
        # one doesn't match (slide the head down).
        for index_from_right, char in enumerate(reversed(pattern)):
            if text[head - index_from_right] != char:
                break
        else:
            # The for loop finished without breaking: a full match!
            # Note: head is always at the end of the pattern, but the
            # search should return the index of the start of the pattern.
            return head - pattern_length + 1

        # On any mismatch the slide distance is decided by the text
        # character under the head, wherever in the pattern the mismatch
        # happened. The distance is always at least 1, so the loop ends.
        head += self.decide_slide_width(text[head])

    # If the main loop doesn't return, there was no match
    return -1
Loading…
Cancel
Save