You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

53 lines
1.5 KiB

  1. // Copyright (c) 2018 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package unique
  15. import (
  16. "github.com/blevesearch/bleve/analysis"
  17. "github.com/blevesearch/bleve/registry"
  18. )
// Name is the identifier under which the unique token filter is registered.
const Name = "unique"
// UniqueTermFilter retains only the tokens which mark the first occurrence of
// a term. Tokens whose term appears in a preceding token are dropped.
type UniqueTermFilter struct{}
  23. func NewUniqueTermFilter() *UniqueTermFilter {
  24. return &UniqueTermFilter{}
  25. }
  26. func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
  27. encounteredTerms := make(map[string]struct{}, len(input)/4)
  28. j := 0
  29. for _, token := range input {
  30. term := string(token.Term)
  31. if _, ok := encounteredTerms[term]; ok {
  32. continue
  33. }
  34. encounteredTerms[term] = struct{}{}
  35. input[j] = token
  36. j++
  37. }
  38. return input[:j]
  39. }
  40. func UniqueTermFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
  41. return NewUniqueTermFilter(), nil
  42. }
// init registers UniqueTermFilterConstructor with the registry under Name
// when the package is initialized.
func init() {
	registry.RegisterTokenFilter(Name, UniqueTermFilterConstructor)
}