@ -0,0 +1,53 @@ | |||
// Copyright (c) 2018 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package unique | |||
import ( | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/registry" | |||
) | |||
const Name = "unique" | |||
// UniqueTermFilter retains only the tokens which mark the first occurence of | |||
// a term. Tokens whose term appears in a preceding token are dropped. | |||
type UniqueTermFilter struct{} | |||
func NewUniqueTermFilter() *UniqueTermFilter { | |||
return &UniqueTermFilter{} | |||
} | |||
func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream { | |||
encounteredTerms := make(map[string]struct{}, len(input)/4) | |||
j := 0 | |||
for _, token := range input { | |||
term := string(token.Term) | |||
if _, ok := encounteredTerms[term]; ok { | |||
continue | |||
} | |||
encounteredTerms[term] = struct{}{} | |||
input[j] = token | |||
j++ | |||
} | |||
return input[:j] | |||
} | |||
func UniqueTermFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { | |||
return NewUniqueTermFilter(), nil | |||
} | |||
func init() { | |||
registry.RegisterTokenFilter(Name, UniqueTermFilterConstructor) | |||
} |
@ -0,0 +1,173 @@ | |||
# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. | |||
[[projects]] | |||
name = "github.com/RoaringBitmap/roaring" | |||
packages = ["."] | |||
revision = "84551f0e309d6f9bafa428ef39b31ab7f16ff7b8" | |||
version = "v0.4.1" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/Smerity/govarint" | |||
packages = ["."] | |||
revision = "7265e41f48f15fd61751e16da866af3c704bb3ab" | |||
[[projects]] | |||
name = "github.com/blevesearch/bleve" | |||
packages = [ | |||
".", | |||
"analysis", | |||
"analysis/analyzer/standard", | |||
"analysis/datetime/flexible", | |||
"analysis/datetime/optional", | |||
"analysis/lang/en", | |||
"analysis/token/lowercase", | |||
"analysis/token/porter", | |||
"analysis/token/stop", | |||
"analysis/tokenizer/unicode", | |||
"document", | |||
"geo", | |||
"index", | |||
"index/scorch", | |||
"index/scorch/mergeplan", | |||
"index/scorch/segment", | |||
"index/scorch/segment/mem", | |||
"index/scorch/segment/zap", | |||
"index/store", | |||
"index/store/boltdb", | |||
"index/store/gtreap", | |||
"index/upsidedown", | |||
"mapping", | |||
"numeric", | |||
"registry", | |||
"search", | |||
"search/collector", | |||
"search/facet", | |||
"search/highlight", | |||
"search/highlight/format/html", | |||
"search/highlight/fragmenter/simple", | |||
"search/highlight/highlighter/html", | |||
"search/highlight/highlighter/simple", | |||
"search/query", | |||
"search/scorer", | |||
"search/searcher" | |||
] | |||
revision = "a3b125508b4443344b596888ca58467b6c9310b9" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/blevesearch/go-porterstemmer" | |||
packages = ["."] | |||
revision = "23a2c8e5cf1f380f27722c6d2ae8896431dc7d0e" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/blevesearch/segment" | |||
packages = ["."] | |||
revision = "762005e7a34fd909a84586299f1dd457371d36ee" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/boltdb/bolt" | |||
packages = ["."] | |||
revision = "9da31745363232bc1e27dbab3569e77383a51585" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/couchbase/vellum" | |||
packages = [ | |||
".", | |||
"regexp", | |||
"utf8" | |||
] | |||
revision = "ed84a675e24ed0a0bf6859b1ddec7e7c858354bd" | |||
[[projects]] | |||
name = "github.com/davecgh/go-spew" | |||
packages = ["spew"] | |||
revision = "346938d642f2ec3594ed81d874461961cd0faa76" | |||
version = "v1.1.0" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/edsrzf/mmap-go" | |||
packages = ["."] | |||
revision = "0bce6a6887123b67a60366d2c9fe2dfb74289d2e" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/glycerine/go-unsnap-stream" | |||
packages = ["."] | |||
revision = "62a9a9eb44fd8932157b1a8ace2149eff5971af6" | |||
[[projects]] | |||
name = "github.com/golang/protobuf" | |||
packages = ["proto"] | |||
revision = "925541529c1fa6821df4e44ce2723319eb2be768" | |||
version = "v1.0.0" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/golang/snappy" | |||
packages = ["."] | |||
revision = "553a641470496b2327abcac10b36396bd98e45c9" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/mschoch/smat" | |||
packages = ["."] | |||
revision = "90eadee771aeab36e8bf796039b8c261bebebe4f" | |||
[[projects]] | |||
name = "github.com/philhofer/fwd" | |||
packages = ["."] | |||
revision = "bb6d471dc95d4fe11e432687f8b70ff496cf3136" | |||
version = "v1.0.0" | |||
[[projects]] | |||
name = "github.com/pmezard/go-difflib" | |||
packages = ["difflib"] | |||
revision = "792786c7400a136282c1664665ae0a8db921c6c2" | |||
version = "v1.0.0" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/steveyen/gtreap" | |||
packages = ["."] | |||
revision = "0abe01ef9be25c4aedc174758ec2d917314d6d70" | |||
[[projects]] | |||
name = "github.com/stretchr/testify" | |||
packages = ["assert"] | |||
revision = "12b6f73e6084dad08a7c6e575284b177ecafbc71" | |||
version = "v1.2.1" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/tinylib/msgp" | |||
packages = ["msgp"] | |||
revision = "03a79185462ad029a6e7e05b2f3f3e0498d0a6c0" | |||
[[projects]] | |||
branch = "master" | |||
name = "github.com/willf/bitset" | |||
packages = ["."] | |||
revision = "1a37ad96e8c1a11b20900a232874843b5174221f" | |||
[[projects]] | |||
name = "golang.org/x/net" | |||
packages = ["context"] | |||
revision = "309822c5b9b9f80db67f016069a12628d94fad34" | |||
[[projects]] | |||
name = "golang.org/x/sys" | |||
packages = ["unix"] | |||
revision = "3dbebcf8efb6a5011a60c2b4591c1022a759af8a" | |||
[solve-meta] | |||
analyzer-name = "dep" | |||
analyzer-version = 1 | |||
inputs-digest = "61c759f0c1136cadf86ae8a30bb78edf33fc844cdcb2316469b4ae14a8d051b0" | |||
solver-name = "gps-cdcl" | |||
solver-version = 1 |
@ -0,0 +1,34 @@ | |||
# Gopkg.toml example | |||
# | |||
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md | |||
# for detailed Gopkg.toml documentation. | |||
# | |||
# required = ["github.com/user/thing/cmd/thing"] | |||
# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] | |||
# | |||
# [[constraint]] | |||
# name = "github.com/user/project" | |||
# version = "1.0.0" | |||
# | |||
# [[constraint]] | |||
# name = "github.com/user/project2" | |||
# branch = "dev" | |||
# source = "github.com/myfork/project2" | |||
# | |||
# [[override]] | |||
# name = "github.com/x/y" | |||
# version = "2.4.0" | |||
# | |||
# [prune] | |||
# non-go = false | |||
# go-tests = true | |||
# unused-packages = true | |||
[[constraint]] | |||
name = "github.com/stretchr/testify" | |||
version = "1.2.1" | |||
[prune] | |||
go-tests = true | |||
unused-packages = true |
@ -0,0 +1,21 @@ | |||
MIT License | |||
Copyright (c) 2018 Ethan Koenig | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in all | |||
copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. |
@ -0,0 +1,13 @@ | |||
# rupture | |||
[![Build Status](https://travis-ci.org/ethantkoenig/rupture.svg?branch=master)](https://travis-ci.org/ethantkoenig/rupture) [![GoDoc](https://godoc.org/github.com/ethantkoenig/rupture?status.svg)](https://godoc.org/github.com/ethantkoenig/rupture) [![Go Report Card](https://goreportcard.com/badge/blevesearch/bleve)](https://goreportcard.com/report/blevesearch/bleve) | |||
An explosive companion to the [bleve indexing library](https://www.github.com/blevesearch/bleve) | |||
## Features | |||
`rupture` includes the following additions to `bleve`: | |||
- __Flushing batches__: Batches of operation which automatically flush to the underlying bleve index. | |||
- __Sharded indices__: An index-like abstraction built on top of several underlying indices. Sharded indices provide lower write latencies for indices with large amounts of data. | |||
- __Index metadata__: Track index version for easily managing migrations and schema changes. |
@ -0,0 +1,67 @@ | |||
package rupture | |||
import ( | |||
"github.com/blevesearch/bleve" | |||
) | |||
// FlushingBatch is a batch of operations that automatically flushes to the | |||
// underlying index once it reaches a certain size. | |||
type FlushingBatch interface { | |||
// Index adds the specified index operation batch, possibly triggering a | |||
// flush. | |||
Index(id string, data interface{}) error | |||
// Remove adds the specified delete operation to the batch, possibly | |||
// triggering a flush. | |||
Delete(id string) error | |||
// Flush flushes the batch's contents. | |||
Flush() error | |||
} | |||
type singleIndexFlushingBatch struct { | |||
maxBatchSize int | |||
batch *bleve.Batch | |||
index bleve.Index | |||
} | |||
func newFlushingBatch(index bleve.Index, maxBatchSize int) *singleIndexFlushingBatch { | |||
return &singleIndexFlushingBatch{ | |||
maxBatchSize: maxBatchSize, | |||
batch: index.NewBatch(), | |||
index: index, | |||
} | |||
} | |||
// NewFlushingBatch creates a new flushing batch for the specified index. Once | |||
// the number of operations in the batch reaches the specified limit, the batch | |||
// automatically flushes its operations to the index. | |||
func NewFlushingBatch(index bleve.Index, maxBatchSize int) FlushingBatch { | |||
return newFlushingBatch(index, maxBatchSize) | |||
} | |||
func (b *singleIndexFlushingBatch) Index(id string, data interface{}) error { | |||
if err := b.batch.Index(id, data); err != nil { | |||
return err | |||
} | |||
return b.flushIfFull() | |||
} | |||
func (b *singleIndexFlushingBatch) Delete(id string) error { | |||
b.batch.Delete(id) | |||
return b.flushIfFull() | |||
} | |||
func (b *singleIndexFlushingBatch) flushIfFull() error { | |||
if b.batch.Size() < b.maxBatchSize { | |||
return nil | |||
} | |||
return b.Flush() | |||
} | |||
func (b *singleIndexFlushingBatch) Flush() error { | |||
err := b.index.Batch(b.batch) | |||
if err != nil { | |||
return err | |||
} | |||
b.batch.Reset() | |||
return nil | |||
} |
@ -0,0 +1,68 @@ | |||
package rupture | |||
import ( | |||
"encoding/json" | |||
"io/ioutil" | |||
"os" | |||
"path/filepath" | |||
) | |||
const metaFilename = "rupture_meta.json" | |||
func indexMetadataPath(dir string) string { | |||
return filepath.Join(dir, metaFilename) | |||
} | |||
// IndexMetadata contains metadata about a bleve index. | |||
type IndexMetadata struct { | |||
// The version of the data in the index. This can be useful for tracking | |||
// schema changes or data migrations. | |||
Version int `json:"version"` | |||
} | |||
// in addition to the user-exposed metadata, we keep additional, internal-only | |||
// metadata for sharded indices. | |||
const shardedMetadataFilename = "rupture_sharded_meta.json" | |||
func shardedIndexMetadataPath(dir string) string { | |||
return filepath.Join(dir, shardedMetadataFilename) | |||
} | |||
type shardedIndexMetadata struct { | |||
NumShards int `json:"num_shards"` | |||
} | |||
func readJSON(path string, meta interface{}) error { | |||
metaBytes, err := ioutil.ReadFile(path) | |||
if err != nil { | |||
return err | |||
} | |||
return json.Unmarshal(metaBytes, meta) | |||
} | |||
func writeJSON(path string, meta interface{}) error { | |||
metaBytes, err := json.Marshal(meta) | |||
if err != nil { | |||
return err | |||
} | |||
return ioutil.WriteFile(path, metaBytes, 0666) | |||
} | |||
// ReadIndexMetadata returns the metadata for the index at the specified path. | |||
// If no such index metadata exists, an empty metadata and a nil error are | |||
// returned. | |||
func ReadIndexMetadata(path string) (*IndexMetadata, error) { | |||
meta := &IndexMetadata{} | |||
metaPath := indexMetadataPath(path) | |||
if _, err := os.Stat(metaPath); os.IsNotExist(err) { | |||
return meta, nil | |||
} else if err != nil { | |||
return nil, err | |||
} | |||
return meta, readJSON(metaPath, meta) | |||
} | |||
// WriteIndexMetadata writes metadata for the index at the specified path. | |||
func WriteIndexMetadata(path string, meta *IndexMetadata) error { | |||
return writeJSON(indexMetadataPath(path), meta) | |||
} |
@ -0,0 +1,146 @@ | |||
package rupture | |||
import ( | |||
"fmt" | |||
"hash/fnv" | |||
"path/filepath" | |||
"strconv" | |||
"github.com/blevesearch/bleve" | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/mapping" | |||
) | |||
// ShardedIndex an index that is built onto of multiple underlying bleve | |||
// indices (i.e. shards). Similar to bleve's index aliases, some methods may | |||
// not be supported. | |||
type ShardedIndex interface { | |||
bleve.Index | |||
shards() []bleve.Index | |||
} | |||
// a type alias for bleve.Index, so that the anonymous field of | |||
// shardedIndex does not conflict with the Index(..) method. | |||
type bleveIndex bleve.Index | |||
type shardedIndex struct { | |||
bleveIndex | |||
indices []bleve.Index | |||
} | |||
func hash(id string, n int) uint64 { | |||
fnvHash := fnv.New64() | |||
fnvHash.Write([]byte(id)) | |||
return fnvHash.Sum64() % uint64(n) | |||
} | |||
func childIndexerPath(rootPath string, i int) string { | |||
return filepath.Join(rootPath, strconv.Itoa(i)) | |||
} | |||
// NewShardedIndex creates a sharded index at the specified path, with the | |||
// specified mapping and number of shards. | |||
func NewShardedIndex(path string, mapping mapping.IndexMapping, numShards int) (ShardedIndex, error) { | |||
if numShards <= 0 { | |||
return nil, fmt.Errorf("Invalid number of shards: %d", numShards) | |||
} | |||
err := writeJSON(shardedIndexMetadataPath(path), &shardedIndexMetadata{NumShards: numShards}) | |||
if err != nil { | |||
return nil, err | |||
} | |||
s := &shardedIndex{ | |||
indices: make([]bleve.Index, numShards), | |||
} | |||
for i := 0; i < numShards; i++ { | |||
s.indices[i], err = bleve.New(childIndexerPath(path, i), mapping) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
s.bleveIndex = bleve.NewIndexAlias(s.indices...) | |||
return s, nil | |||
} | |||
// OpenShardedIndex opens a sharded index at the specified path. | |||
func OpenShardedIndex(path string) (ShardedIndex, error) { | |||
var meta shardedIndexMetadata | |||
var err error | |||
if err = readJSON(shardedIndexMetadataPath(path), &meta); err != nil { | |||
return nil, err | |||
} | |||
s := &shardedIndex{ | |||
indices: make([]bleve.Index, meta.NumShards), | |||
} | |||
for i := 0; i < meta.NumShards; i++ { | |||
s.indices[i], err = bleve.Open(childIndexerPath(path, i)) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
s.bleveIndex = bleve.NewIndexAlias(s.indices...) | |||
return s, nil | |||
} | |||
func (s *shardedIndex) Index(id string, data interface{}) error { | |||
return s.indices[hash(id, len(s.indices))].Index(id, data) | |||
} | |||
func (s *shardedIndex) Delete(id string) error { | |||
return s.indices[hash(id, len(s.indices))].Delete(id) | |||
} | |||
func (s *shardedIndex) Document(id string) (*document.Document, error) { | |||
return s.indices[hash(id, len(s.indices))].Document(id) | |||
} | |||
func (s *shardedIndex) Close() error { | |||
if err := s.bleveIndex.Close(); err != nil { | |||
return err | |||
} | |||
for _, index := range s.indices { | |||
if err := index.Close(); err != nil { | |||
return err | |||
} | |||
} | |||
return nil | |||
} | |||
func (s *shardedIndex) shards() []bleve.Index { | |||
return s.indices | |||
} | |||
type shardedIndexFlushingBatch struct { | |||
batches []*singleIndexFlushingBatch | |||
} | |||
// NewShardedFlushingBatch creates a flushing batch with the specified batch | |||
// size for the specified sharded index. | |||
func NewShardedFlushingBatch(index ShardedIndex, maxBatchSize int) FlushingBatch { | |||
indices := index.shards() | |||
b := &shardedIndexFlushingBatch{ | |||
batches: make([]*singleIndexFlushingBatch, len(indices)), | |||
} | |||
for i, index := range indices { | |||
b.batches[i] = newFlushingBatch(index, maxBatchSize) | |||
} | |||
return b | |||
} | |||
func (b *shardedIndexFlushingBatch) Index(id string, data interface{}) error { | |||
return b.batches[hash(id, len(b.batches))].Index(id, data) | |||
} | |||
func (b *shardedIndexFlushingBatch) Delete(id string) error { | |||
return b.batches[hash(id, len(b.batches))].Delete(id) | |||
} | |||
func (b *shardedIndexFlushingBatch) Flush() error { | |||
for _, batch := range b.batches { | |||
if err := batch.Flush(); err != nil { | |||
return err | |||
} | |||
} | |||
return nil | |||
} |