You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

90 lines
3.6 KiB

2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
  1. from typing import Dict
  2. def make_km_table(pattern: str) -> Dict[str, int]:
  3. """
  4. Build a lookup table from a pattern string.
  5. Iterates over the string and creates a dictionary where the key is a
  6. search character and the value is the distance from the end of the string
  7. to the *last* occurrence in the pattern. We ignore the final character.
  8. Example:
  9. pattern = "ABCDAB"
  10. 543210 <- distance from end of pattern
  11. table = {
  12. "A": 1, # The second 'A'
  13. "B": 4, # The first 'B', because the other 'B' is the end of the string
  14. "C": 3,
  15. "D": 2,
  16. }
  17. """
  18. PATTERN_LENGTH = len(pattern)
  19. table = {}
  20. # Note: Because we always start each loop in the search by comparing the
  21. # last character in the pattern, we can skip it in the table. This is
  22. # especially helpful if the last character also appears elsewhere in the
  23. # pattern because when using this table we will have already tried matching
  24. # the last character, so we want to try the *next* occurrence of that
  25. # character in the pattern, if it exists.
  26. for index_from_left, char in enumerate(pattern[:-1]):
  27. table[char] = PATTERN_LENGTH - index_from_left - 1
  28. return table
  29. class Bm(object):
  30. def __init__(self, text: str, pattern: str):
  31. self.text = text
  32. self.pattern = pattern
  33. self.table = make_km_table(pattern)
  34. def decide_slide_width(self, c: str) -> int:
  35. """
  36. If a character 'c' doesn't match in the search, this decides how far
  37. down to "slide" the pattern for the next search.
  38. """
  39. # Note: The lookup table only has characters from the pattern in it.
  40. # If the test character 'c' is not in the table, then we should skip
  41. # down the entire length of the pattern
  42. try:
  43. return self.table[c]
  44. except KeyError:
  45. return len(self.pattern)
  46. def search(self) -> int:
  47. """
  48. Return the index of the *first* occurrence of the pattern in the
  49. text, or '-1' if the pattern is not in the text.
  50. """
  51. # The search works on one character at a time in the text. The current
  52. # location is the "head" (like a read-head on a disk)
  53. # It starts at the last character in the pattern, and gets moved in the
  54. # loop until we reach the end of the text
  55. PATTERN_LENGTH = len(self.pattern)
  56. head = PATTERN_LENGTH - 1
  57. while head < len(self.text):
  58. # When the last character of the pattern matches the current head
  59. # position, walk backwards through the text and match each
  60. # character one by one until either they all match (return) or one
  61. # doesn't match (slide the head down)
  62. if self.text[head] == self.pattern[-1]:
  63. for index_from_right, char in enumerate(reversed(self.pattern)):
  64. if self.text[head - index_from_right] != char:
  65. head += self.decide_slide_width(self.text[head])
  66. break
  67. # If the for loop finished without breaking we found a full
  68. # match!
  69. else:
  70. # Note: head is always at the end of the pattern, but the
  71. # search should return the index of the start of the pattern
  72. return head - PATTERN_LENGTH + 1
  73. # When the last character of the pattern *doesn't* match the current
  74. # head, always slide down the head
  75. else:
  76. head += self.decide_slide_width(self.text[head])
  77. # If the main loop doesn't return, there was no match
  78. return -1