You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

111 lines
3.0 KiB

  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package analysis
  // TokenLocation describes a single occurrence of a term inside one field
  // value. Start, End and Position carry the same meaning as the corresponding
  // members of analysis.Token, while Field and ArrayPositions together identify
  // which field value of the source document the occurrence belongs to. See
  // document.Field for details.
  type TokenLocation struct {
  	// Field is the name of the field the occurrence was found in.
  	Field string
  	// ArrayPositions locates the field value within the source document.
  	ArrayPositions []uint64
  	// Start and End have the same meaning as in analysis.Token.
  	Start, End int
  	// Position has the same meaning as in analysis.Token.
  	Position int
  }
  26. // TokenFreq represents all the occurrences of a term in all fields of a
  27. // document.
  28. type TokenFreq struct {
  29. Term []byte
  30. Locations []*TokenLocation
  31. frequency int
  32. }
  33. func (tf *TokenFreq) Frequency() int {
  34. return tf.frequency
  35. }
  36. // TokenFrequencies maps document terms to their combined frequencies from all
  37. // fields.
  38. type TokenFrequencies map[string]*TokenFreq
  39. func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
  40. // walk the new token frequencies
  41. for tfk, tf := range other {
  42. // set the remoteField value in incoming token freqs
  43. for _, l := range tf.Locations {
  44. l.Field = remoteField
  45. }
  46. existingTf, exists := tfs[tfk]
  47. if exists {
  48. existingTf.Locations = append(existingTf.Locations, tf.Locations...)
  49. existingTf.frequency = existingTf.frequency + tf.frequency
  50. } else {
  51. tfs[tfk] = &TokenFreq{
  52. Term: tf.Term,
  53. frequency: tf.frequency,
  54. Locations: make([]*TokenLocation, len(tf.Locations)),
  55. }
  56. copy(tfs[tfk].Locations, tf.Locations)
  57. }
  58. }
  59. }
  60. func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
  61. rv := make(map[string]*TokenFreq, len(tokens))
  62. if includeTermVectors {
  63. tls := make([]TokenLocation, len(tokens))
  64. tlNext := 0
  65. for _, token := range tokens {
  66. tls[tlNext] = TokenLocation{
  67. ArrayPositions: arrayPositions,
  68. Start: token.Start,
  69. End: token.End,
  70. Position: token.Position,
  71. }
  72. curr, ok := rv[string(token.Term)]
  73. if ok {
  74. curr.Locations = append(curr.Locations, &tls[tlNext])
  75. curr.frequency++
  76. } else {
  77. rv[string(token.Term)] = &TokenFreq{
  78. Term: token.Term,
  79. Locations: []*TokenLocation{&tls[tlNext]},
  80. frequency: 1,
  81. }
  82. }
  83. tlNext++
  84. }
  85. } else {
  86. for _, token := range tokens {
  87. curr, exists := rv[string(token.Term)]
  88. if exists {
  89. curr.frequency++
  90. } else {
  91. rv[string(token.Term)] = &TokenFreq{
  92. Term: token.Term,
  93. frequency: 1,
  94. }
  95. }
  96. }
  97. }
  98. return rv
  99. }