You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

92 lines
3.7 KiB

2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
  1. from typing import Dict
  2. def make_km_table(pattern: str) -> Dict[str, int]:
  3. """
  4. Build a lookup table from a pattern string.
  5. Iterates over the string and creates a dictionary where the key is a
  6. search character and the value is the distance from the end of the string
  7. to the *last* occurrence in the pattern. We ignore the final character.
  8. Example:
  9. pattern = "ABCDAB"
  10. 543210 <- distance from end of pattern
  11. table = {
  12. "A": 1, # The second 'A'
  13. "B": 4, # The first 'B', because the other 'B' is the end of the string
  14. "C": 3,
  15. "D": 2,
  16. }
  17. """
  18. PATTERN_LENGTH = len(pattern)
  19. table = {}
  20. # Note: Because we always start each loop in the search by comparing the
  21. # last character in the pattern, we can skip it in the table. This is
  22. # especially helpful if the last character also appears elsewhere in the
  23. # pattern because when using this table we will have already tried matching
  24. # the last character, so we want to try the *next* occurrence of that
  25. # character in the pattern, if it exists.
  26. for index_from_left, char in enumerate(pattern[:-1]):
  27. table[char] = PATTERN_LENGTH - index_from_left - 1
  28. return table
  29. class Bm(object):
  30. def __init__(self, text: str, pattern: str):
  31. self.text = text
  32. self.pattern = pattern
  33. self.table = make_km_table(pattern)
  34. def decide_slide_width(self, c: str) -> int:
  35. """
  36. If a character 'c' doesn't match in the search, this decides how far
  37. down to "slide" the pattern for the next search.
  38. """
  39. # Note: The lookup table only has characters from the pattern in it.
  40. # If the test character 'c' is not in the table, then we should skip
  41. # down the entire length of the pattern
  42. try:
  43. return self.table[c]
  44. except KeyError:
  45. return len(self.pattern)
  46. def search(self) -> int:
  47. """
  48. Return the index of the *first* occurrence of the pattern in the
  49. text, or '-1' if the pattern is not in the text.
  50. """
  51. PATTERN_LENGTH = len(self.pattern)
  52. if not self.pattern or not self.text or len(self.text) < PATTERN_LENGTH:
  53. return -1
  54. # The search works on one character at a time in the text. The current
  55. # location is the "head" (like a read-head on a disk)
  56. # It starts at the last character in the pattern, and gets moved in the
  57. # loop until we reach the end of the text
  58. head = PATTERN_LENGTH - 1
  59. while head < len(self.text):
  60. # When the last character of the pattern matches the current head
  61. # position, walk backwards through the text and match each
  62. # character one by one until either they all match (return) or one
  63. # doesn't match (slide the head down)
  64. if self.text[head] == self.pattern[-1]:
  65. for index_from_right, char in enumerate(reversed(self.pattern)):
  66. if self.text[head - index_from_right] != char:
  67. head += self.decide_slide_width(self.text[head])
  68. break
  69. # If the for loop finished without breaking we found a full
  70. # match!
  71. else:
  72. # Note: head is always at the end of the pattern, but the
  73. # search should return the index of the start of the pattern
  74. return head - PATTERN_LENGTH + 1
  75. # When the last character of the pattern *doesn't* match the current
  76. # head, always slide down the head
  77. else:
  78. head += self.decide_slide_width(self.text[head])
  79. # If the main loop doesn't return, there was no match
  80. return -1