You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

160 lines
4.7 KiB

  1. // Copyright (c) 2012-2016 The go-diff authors. All rights reserved.
  2. // https://github.com/sergi/go-diff
  3. // See the included LICENSE file for license details.
  4. //
  5. // go-diff is a Go implementation of Google's Diff, Match, and Patch library
  6. // Original library is Copyright (c) 2006 Google Inc.
  7. // http://code.google.com/p/google-diff-match-patch/
  8. package diffmatchpatch
  9. import (
  10. "math"
  11. )
  12. // MatchMain locates the best instance of 'pattern' in 'text' near 'loc'.
  13. // Returns -1 if no match found.
  14. func (dmp *DiffMatchPatch) MatchMain(text, pattern string, loc int) int {
  15. // Check for null inputs not needed since null can't be passed in C#.
  16. loc = int(math.Max(0, math.Min(float64(loc), float64(len(text)))))
  17. if text == pattern {
  18. // Shortcut (potentially not guaranteed by the algorithm)
  19. return 0
  20. } else if len(text) == 0 {
  21. // Nothing to match.
  22. return -1
  23. } else if loc+len(pattern) <= len(text) && text[loc:loc+len(pattern)] == pattern {
  24. // Perfect match at the perfect spot! (Includes case of null pattern)
  25. return loc
  26. }
  27. // Do a fuzzy compare.
  28. return dmp.MatchBitap(text, pattern, loc)
  29. }
  30. // MatchBitap locates the best instance of 'pattern' in 'text' near 'loc' using the Bitap algorithm.
  31. // Returns -1 if no match was found.
  32. func (dmp *DiffMatchPatch) MatchBitap(text, pattern string, loc int) int {
  33. // Initialise the alphabet.
  34. s := dmp.MatchAlphabet(pattern)
  35. // Highest score beyond which we give up.
  36. scoreThreshold := dmp.MatchThreshold
  37. // Is there a nearby exact match? (speedup)
  38. bestLoc := indexOf(text, pattern, loc)
  39. if bestLoc != -1 {
  40. scoreThreshold = math.Min(dmp.matchBitapScore(0, bestLoc, loc,
  41. pattern), scoreThreshold)
  42. // What about in the other direction? (speedup)
  43. bestLoc = lastIndexOf(text, pattern, loc+len(pattern))
  44. if bestLoc != -1 {
  45. scoreThreshold = math.Min(dmp.matchBitapScore(0, bestLoc, loc,
  46. pattern), scoreThreshold)
  47. }
  48. }
  49. // Initialise the bit arrays.
  50. matchmask := 1 << uint((len(pattern) - 1))
  51. bestLoc = -1
  52. var binMin, binMid int
  53. binMax := len(pattern) + len(text)
  54. lastRd := []int{}
  55. for d := 0; d < len(pattern); d++ {
  56. // Scan for the best match; each iteration allows for one more error. Run a binary search to determine how far from 'loc' we can stray at this error level.
  57. binMin = 0
  58. binMid = binMax
  59. for binMin < binMid {
  60. if dmp.matchBitapScore(d, loc+binMid, loc, pattern) <= scoreThreshold {
  61. binMin = binMid
  62. } else {
  63. binMax = binMid
  64. }
  65. binMid = (binMax-binMin)/2 + binMin
  66. }
  67. // Use the result from this iteration as the maximum for the next.
  68. binMax = binMid
  69. start := int(math.Max(1, float64(loc-binMid+1)))
  70. finish := int(math.Min(float64(loc+binMid), float64(len(text))) + float64(len(pattern)))
  71. rd := make([]int, finish+2)
  72. rd[finish+1] = (1 << uint(d)) - 1
  73. for j := finish; j >= start; j-- {
  74. var charMatch int
  75. if len(text) <= j-1 {
  76. // Out of range.
  77. charMatch = 0
  78. } else if _, ok := s[text[j-1]]; !ok {
  79. charMatch = 0
  80. } else {
  81. charMatch = s[text[j-1]]
  82. }
  83. if d == 0 {
  84. // First pass: exact match.
  85. rd[j] = ((rd[j+1] << 1) | 1) & charMatch
  86. } else {
  87. // Subsequent passes: fuzzy match.
  88. rd[j] = ((rd[j+1]<<1)|1)&charMatch | (((lastRd[j+1] | lastRd[j]) << 1) | 1) | lastRd[j+1]
  89. }
  90. if (rd[j] & matchmask) != 0 {
  91. score := dmp.matchBitapScore(d, j-1, loc, pattern)
  92. // This match will almost certainly be better than any existing match. But check anyway.
  93. if score <= scoreThreshold {
  94. // Told you so.
  95. scoreThreshold = score
  96. bestLoc = j - 1
  97. if bestLoc > loc {
  98. // When passing loc, don't exceed our current distance from loc.
  99. start = int(math.Max(1, float64(2*loc-bestLoc)))
  100. } else {
  101. // Already passed loc, downhill from here on in.
  102. break
  103. }
  104. }
  105. }
  106. }
  107. if dmp.matchBitapScore(d+1, loc, loc, pattern) > scoreThreshold {
  108. // No hope for a (better) match at greater error levels.
  109. break
  110. }
  111. lastRd = rd
  112. }
  113. return bestLoc
  114. }
  115. // matchBitapScore computes and returns the score for a match with e errors and x location.
  116. func (dmp *DiffMatchPatch) matchBitapScore(e, x, loc int, pattern string) float64 {
  117. accuracy := float64(e) / float64(len(pattern))
  118. proximity := math.Abs(float64(loc - x))
  119. if dmp.MatchDistance == 0 {
  120. // Dodge divide by zero error.
  121. if proximity == 0 {
  122. return accuracy
  123. }
  124. return 1.0
  125. }
  126. return accuracy + (proximity / float64(dmp.MatchDistance))
  127. }
  128. // MatchAlphabet initialises the alphabet for the Bitap algorithm.
  129. func (dmp *DiffMatchPatch) MatchAlphabet(pattern string) map[byte]int {
  130. s := map[byte]int{}
  131. charPattern := []byte(pattern)
  132. for _, c := range charPattern {
  133. _, ok := s[c]
  134. if !ok {
  135. s[c] = 0
  136. }
  137. }
  138. i := 0
  139. for _, c := range charPattern {
  140. value := s[c] | int(uint(1)<<uint((len(pattern)-i-1)))
  141. s[c] = value
  142. i++
  143. }
  144. return s
  145. }