5 Commits

  1. 2
      .gitignore
  2. 79
      src/bm.py
  3. 3
      test/test_bm.py

2
.gitignore

@ -1,4 +1,6 @@
*.py[cod]
.pytest_cache/
.venv/
__pycache__
Pipfile.lock

79
src/bm.py

@ -2,8 +2,34 @@ from typing import Dict
def make_km_table(pattern: str) -> Dict[str, int]:
    """
    Build the slide lookup table for a pattern string.

    Each key is a character of the pattern and each value is the distance
    from the end of the pattern to the *last* occurrence of that character,
    ignoring the pattern's final character.

    Example:
        pattern = "ABCDAB"
                   543210  <- distance from end of pattern
        table = {
            "A": 1,  # The second 'A'
            "B": 4,  # The first 'B'; the other 'B' is the final character
            "C": 3,
            "D": 2,
        }

    An empty or single-character pattern yields an empty table.
    """
    last_index = len(pattern) - 1
    # The final character is skipped on purpose: by the time the table is
    # consulted the search has already tried to line up the last character,
    # so for a character that also ends the pattern we want the *next*
    # occurrence further left, if there is one. Later occurrences overwrite
    # earlier ones, so each key ends up with its right-most distance.
    return {
        char: last_index - index_from_left
        for index_from_left, char in enumerate(pattern[:-1])
    }
@ -14,10 +40,51 @@ class Bm(object):
self.table = make_km_table(pattern)
def decide_slide_width(self, c: str) -> int:
    """
    Decide how far to "slide" the pattern after a failed comparison.

    'c' is a single character; the result is the slide distance stored in
    the lookup table for 'c', or the full pattern length when 'c' is not
    in the table.
    """
    assert len(c) == 1
    # The lookup table only holds characters that occur in the pattern.
    # A character missing from it cannot line up with anything, so the
    # pattern can skip ahead by its whole length.
    return self.table.get(c, len(self.pattern))
def search(self) -> int:
    """
    Return the index of the *first* occurrence of the pattern in the
    text, or '-1' if the pattern is not in the text.

    An empty pattern is reported at index 0, mirroring 'str.find'.
    """
    pattern_length = len(self.pattern)
    # Guard: with an empty pattern there is no last character to compare,
    # and indexing 'self.pattern[-1]' would raise IndexError.
    if pattern_length == 0:
        return 0

    text_length = len(self.text)
    # The search works on one character at a time in the text. The current
    # location is the "head" (like a read-head on a disk). It always sits
    # where the *last* character of the pattern is lined up, so it starts
    # at the pattern's final index and moves right until the text runs out.
    head = pattern_length - 1
    while head < text_length:
        # Walk backwards from the head, comparing the pattern right to
        # left. The first comparison is the pattern's last character, so
        # no separate check for it is needed.
        for index_from_right, char in enumerate(reversed(self.pattern)):
            if self.text[head - index_from_right] != char:
                break
        else:
            # The loop finished without breaking: full match. The head is
            # at the end of the pattern, but callers want the start.
            return head - pattern_length + 1
        # Any mismatch slides by the amount chosen for the character
        # currently under the head.
        head += self.decide_slide_width(self.text[head])
    # If the main loop doesn't return, there was no match
    return -1

3
test/test_bm.py

@ -11,6 +11,9 @@ from src.bm import Bm
("ANPANMAN", "PAN", 2),
("ANPANMAN", "ANPAN", 0),
("ANPANMAN", "BIKINMAN", -1),
# Other useful test cases:
("ユニコード", "コード", 2),
])
def test_bm(text, pattern, expected):
bm = Bm(text, pattern)

Loading…
Cancel
Save