You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

117 lines
3.1 KiB

  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package searcher
  15. import (
  16. "fmt"
  17. "github.com/blevesearch/bleve/index"
  18. "github.com/blevesearch/bleve/search"
  19. )
  20. var MaxFuzziness = 2
  21. func NewFuzzySearcher(indexReader index.IndexReader, term string,
  22. prefix, fuzziness int, field string, boost float64,
  23. options search.SearcherOptions) (search.Searcher, error) {
  24. if fuzziness > MaxFuzziness {
  25. return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
  26. }
  27. if fuzziness < 0 {
  28. return nil, fmt.Errorf("invalid fuzziness, negative")
  29. }
  30. // Note: we don't byte slice the term for a prefix because of runes.
  31. prefixTerm := ""
  32. for i, r := range term {
  33. if i < prefix {
  34. prefixTerm += string(r)
  35. } else {
  36. break
  37. }
  38. }
  39. candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
  40. field, prefixTerm)
  41. if err != nil {
  42. return nil, err
  43. }
  44. return NewMultiTermSearcher(indexReader, candidateTerms, field,
  45. boost, options, true)
  46. }
  47. func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
  48. fuzziness int, field, prefixTerm string) (rv []string, err error) {
  49. rv = make([]string, 0)
  50. // in case of advanced reader implementations directly call
  51. // the levenshtein automaton based iterator to collect the
  52. // candidate terms
  53. if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
  54. fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm)
  55. if err != nil {
  56. return nil, err
  57. }
  58. defer func() {
  59. if cerr := fieldDict.Close(); cerr != nil && err == nil {
  60. err = cerr
  61. }
  62. }()
  63. tfd, err := fieldDict.Next()
  64. for err == nil && tfd != nil {
  65. rv = append(rv, tfd.Term)
  66. if tooManyClauses(len(rv)) {
  67. return nil, tooManyClausesErr(field, len(rv))
  68. }
  69. tfd, err = fieldDict.Next()
  70. }
  71. return rv, err
  72. }
  73. var fieldDict index.FieldDict
  74. if len(prefixTerm) > 0 {
  75. fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
  76. } else {
  77. fieldDict, err = indexReader.FieldDict(field)
  78. }
  79. if err != nil {
  80. return nil, err
  81. }
  82. defer func() {
  83. if cerr := fieldDict.Close(); cerr != nil && err == nil {
  84. err = cerr
  85. }
  86. }()
  87. // enumerate terms and check levenshtein distance
  88. var reuse []int
  89. tfd, err := fieldDict.Next()
  90. for err == nil && tfd != nil {
  91. var ld int
  92. var exceeded bool
  93. ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
  94. if !exceeded && ld <= fuzziness {
  95. rv = append(rv, tfd.Term)
  96. if tooManyClauses(len(rv)) {
  97. return nil, tooManyClausesErr(field, len(rv))
  98. }
  99. }
  100. tfd, err = fieldDict.Next()
  101. }
  102. return rv, err
  103. }