|
|
- // Copyright (c) 2016 Couchbase, Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
-
- package character
-
- import (
- "unicode/utf8"
-
- "github.com/blevesearch/bleve/analysis"
- )
-
type (
	// IsTokenRune is a predicate over a single rune. It reports whether
	// the rune should be treated as part of a token.
	IsTokenRune func(r rune) bool

	// CharacterTokenizer holds the rune predicate consulted while
	// tokenizing input.
	CharacterTokenizer struct {
		// isTokenRun is the predicate supplied at construction time.
		isTokenRun IsTokenRune
	}
)
-
- func NewCharacterTokenizer(f IsTokenRune) *CharacterTokenizer {
- return &CharacterTokenizer{
- isTokenRun: f,
- }
- }
-
- func (c *CharacterTokenizer) Tokenize(input []byte) analysis.TokenStream {
-
- rv := make(analysis.TokenStream, 0, 1024)
-
- offset := 0
- start := 0
- end := 0
- count := 0
- for currRune, size := utf8.DecodeRune(input[offset:]); currRune != utf8.RuneError; currRune, size = utf8.DecodeRune(input[offset:]) {
- isToken := c.isTokenRun(currRune)
- if isToken {
- end = offset + size
- } else {
- if end-start > 0 {
- // build token
- rv = append(rv, &analysis.Token{
- Term: input[start:end],
- Start: start,
- End: end,
- Position: count + 1,
- Type: analysis.AlphaNumeric,
- })
- count++
- }
- start = offset + size
- end = start
- }
- offset += size
- }
- // if we ended in the middle of a token, finish it
- if end-start > 0 {
- // build token
- rv = append(rv, &analysis.Token{
- Term: input[start:end],
- Start: start,
- End: end,
- Position: count + 1,
- Type: analysis.AlphaNumeric,
- })
- }
- return rv
- }
|