You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

334 lines
8.3 KiB

  1. // Copyright (c) 2019 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package scorch
  15. import (
  16. "fmt"
  17. "io/ioutil"
  18. "os"
  19. "sync"
  20. "github.com/RoaringBitmap/roaring"
  21. "github.com/blevesearch/bleve/document"
  22. "github.com/blevesearch/bleve/index"
  23. "github.com/blevesearch/bleve/index/scorch/segment"
  24. bolt "go.etcd.io/bbolt"
  25. )
  26. const DefaultBuilderBatchSize = 1000
  27. const DefaultBuilderMergeMax = 10
  28. type Builder struct {
  29. m sync.Mutex
  30. segCount uint64
  31. path string
  32. buildPath string
  33. segPaths []string
  34. batchSize int
  35. mergeMax int
  36. batch *index.Batch
  37. internal map[string][]byte
  38. segPlugin segment.Plugin
  39. }
  40. func NewBuilder(config map[string]interface{}) (*Builder, error) {
  41. path, ok := config["path"].(string)
  42. if !ok {
  43. return nil, fmt.Errorf("must specify path")
  44. }
  45. buildPathPrefix, _ := config["buildPathPrefix"].(string)
  46. buildPath, err := ioutil.TempDir(buildPathPrefix, "scorch-offline-build")
  47. if err != nil {
  48. return nil, err
  49. }
  50. rv := &Builder{
  51. path: path,
  52. buildPath: buildPath,
  53. mergeMax: DefaultBuilderMergeMax,
  54. batchSize: DefaultBuilderBatchSize,
  55. batch: index.NewBatch(),
  56. segPlugin: defaultSegmentPlugin,
  57. }
  58. err = rv.parseConfig(config)
  59. if err != nil {
  60. return nil, fmt.Errorf("error parsing builder config: %v", err)
  61. }
  62. return rv, nil
  63. }
  64. func (o *Builder) parseConfig(config map[string]interface{}) (err error) {
  65. if v, ok := config["mergeMax"]; ok {
  66. var t int
  67. if t, err = parseToInteger(v); err != nil {
  68. return fmt.Errorf("mergeMax parse err: %v", err)
  69. }
  70. if t > 0 {
  71. o.mergeMax = t
  72. }
  73. }
  74. if v, ok := config["batchSize"]; ok {
  75. var t int
  76. if t, err = parseToInteger(v); err != nil {
  77. return fmt.Errorf("batchSize parse err: %v", err)
  78. }
  79. if t > 0 {
  80. o.batchSize = t
  81. }
  82. }
  83. if v, ok := config["internal"]; ok {
  84. if vinternal, ok := v.(map[string][]byte); ok {
  85. o.internal = vinternal
  86. }
  87. }
  88. forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config)
  89. if err != nil {
  90. return err
  91. }
  92. if forcedSegmentType != "" && forcedSegmentVersion != 0 {
  93. segPlugin, err := chooseSegmentPlugin(forcedSegmentType,
  94. uint32(forcedSegmentVersion))
  95. if err != nil {
  96. return err
  97. }
  98. o.segPlugin = segPlugin
  99. }
  100. return nil
  101. }
  102. // Index will place the document into the index.
  103. // It is invalid to index the same document multiple times.
  104. func (o *Builder) Index(doc *document.Document) error {
  105. o.m.Lock()
  106. defer o.m.Unlock()
  107. o.batch.Update(doc)
  108. return o.maybeFlushBatchLOCKED(o.batchSize)
  109. }
  110. func (o *Builder) maybeFlushBatchLOCKED(moreThan int) error {
  111. if len(o.batch.IndexOps) >= moreThan {
  112. defer o.batch.Reset()
  113. return o.executeBatchLOCKED(o.batch)
  114. }
  115. return nil
  116. }
  117. func (o *Builder) executeBatchLOCKED(batch *index.Batch) (err error) {
  118. analysisResults := make([]*index.AnalysisResult, 0, len(batch.IndexOps))
  119. for _, doc := range batch.IndexOps {
  120. if doc != nil {
  121. // insert _id field
  122. doc.AddField(document.NewTextFieldCustom("_id", nil, []byte(doc.ID), document.IndexField|document.StoreField, nil))
  123. // perform analysis directly
  124. analysisResult := analyze(doc)
  125. analysisResults = append(analysisResults, analysisResult)
  126. }
  127. }
  128. seg, _, err := o.segPlugin.New(analysisResults)
  129. if err != nil {
  130. return fmt.Errorf("error building segment base: %v", err)
  131. }
  132. filename := zapFileName(o.segCount)
  133. o.segCount++
  134. path := o.buildPath + string(os.PathSeparator) + filename
  135. if segUnpersisted, ok := seg.(segment.UnpersistedSegment); ok {
  136. err = segUnpersisted.Persist(path)
  137. if err != nil {
  138. return fmt.Errorf("error persisting segment base to %s: %v", path, err)
  139. }
  140. o.segPaths = append(o.segPaths, path)
  141. return nil
  142. }
  143. return fmt.Errorf("new segment does not implement unpersisted: %T", seg)
  144. }
  145. func (o *Builder) doMerge() error {
  146. // as long as we have more than 1 segment, keep merging
  147. for len(o.segPaths) > 1 {
  148. // merge the next <mergeMax> number of segments into one new one
  149. // or, if there are fewer than <mergeMax> remaining, merge them all
  150. mergeCount := o.mergeMax
  151. if mergeCount > len(o.segPaths) {
  152. mergeCount = len(o.segPaths)
  153. }
  154. mergePaths := o.segPaths[0:mergeCount]
  155. o.segPaths = o.segPaths[mergeCount:]
  156. // open each of the segments to be merged
  157. mergeSegs := make([]segment.Segment, 0, mergeCount)
  158. // closeOpenedSegs attempts to close all opened
  159. // segments even if an error occurs, in which case
  160. // the first error is returned
  161. closeOpenedSegs := func() error {
  162. var err error
  163. for _, seg := range mergeSegs {
  164. clErr := seg.Close()
  165. if clErr != nil && err == nil {
  166. err = clErr
  167. }
  168. }
  169. return err
  170. }
  171. for _, mergePath := range mergePaths {
  172. seg, err := o.segPlugin.Open(mergePath)
  173. if err != nil {
  174. _ = closeOpenedSegs()
  175. return fmt.Errorf("error opening segment (%s) for merge: %v", mergePath, err)
  176. }
  177. mergeSegs = append(mergeSegs, seg)
  178. }
  179. // do the merge
  180. mergedSegPath := o.buildPath + string(os.PathSeparator) + zapFileName(o.segCount)
  181. drops := make([]*roaring.Bitmap, mergeCount)
  182. _, _, err := o.segPlugin.Merge(mergeSegs, drops, mergedSegPath, nil, nil)
  183. if err != nil {
  184. _ = closeOpenedSegs()
  185. return fmt.Errorf("error merging segments (%v): %v", mergePaths, err)
  186. }
  187. o.segCount++
  188. o.segPaths = append(o.segPaths, mergedSegPath)
  189. // close segments opened for merge
  190. err = closeOpenedSegs()
  191. if err != nil {
  192. return fmt.Errorf("error closing opened segments: %v", err)
  193. }
  194. // remove merged segments
  195. for _, mergePath := range mergePaths {
  196. err = os.RemoveAll(mergePath)
  197. if err != nil {
  198. return fmt.Errorf("error removing segment %s after merge: %v", mergePath, err)
  199. }
  200. }
  201. }
  202. return nil
  203. }
  204. func (o *Builder) Close() error {
  205. o.m.Lock()
  206. defer o.m.Unlock()
  207. // see if there is a partial batch
  208. err := o.maybeFlushBatchLOCKED(1)
  209. if err != nil {
  210. return fmt.Errorf("error flushing batch before close: %v", err)
  211. }
  212. // perform all the merging
  213. err = o.doMerge()
  214. if err != nil {
  215. return fmt.Errorf("error while merging: %v", err)
  216. }
  217. // ensure the store path exists
  218. err = os.MkdirAll(o.path, 0700)
  219. if err != nil {
  220. return err
  221. }
  222. // move final segment into place
  223. // segment id 2 is chosen to match the behavior of a scorch
  224. // index which indexes a single batch of data
  225. finalSegPath := o.path + string(os.PathSeparator) + zapFileName(2)
  226. err = os.Rename(o.segPaths[0], finalSegPath)
  227. if err != nil {
  228. return fmt.Errorf("error moving final segment into place: %v", err)
  229. }
  230. // remove the buildPath, as it is no longer needed
  231. err = os.RemoveAll(o.buildPath)
  232. if err != nil {
  233. return fmt.Errorf("error removing build path: %v", err)
  234. }
  235. // prepare wrapping
  236. seg, err := o.segPlugin.Open(finalSegPath)
  237. if err != nil {
  238. return fmt.Errorf("error opening final segment")
  239. }
  240. // create a segment snapshot for this segment
  241. ss := &SegmentSnapshot{
  242. segment: seg,
  243. }
  244. is := &IndexSnapshot{
  245. epoch: 3, // chosen to match scorch behavior when indexing a single batch
  246. segment: []*SegmentSnapshot{ss},
  247. creator: "scorch-builder",
  248. internal: o.internal,
  249. }
  250. // create the root bolt
  251. rootBoltPath := o.path + string(os.PathSeparator) + "root.bolt"
  252. rootBolt, err := bolt.Open(rootBoltPath, 0600, nil)
  253. if err != nil {
  254. return err
  255. }
  256. // start a write transaction
  257. tx, err := rootBolt.Begin(true)
  258. if err != nil {
  259. return err
  260. }
  261. // fill the root bolt with this fake index snapshot
  262. _, _, err = prepareBoltSnapshot(is, tx, o.path, o.segPlugin)
  263. if err != nil {
  264. _ = tx.Rollback()
  265. _ = rootBolt.Close()
  266. return fmt.Errorf("error preparing bolt snapshot in root.bolt: %v", err)
  267. }
  268. // commit bolt data
  269. err = tx.Commit()
  270. if err != nil {
  271. _ = rootBolt.Close()
  272. return fmt.Errorf("error committing bolt tx in root.bolt: %v", err)
  273. }
  274. // close bolt
  275. err = rootBolt.Close()
  276. if err != nil {
  277. return fmt.Errorf("error closing root.bolt: %v", err)
  278. }
  279. // close final segment
  280. err = seg.Close()
  281. if err != nil {
  282. return fmt.Errorf("error closing final segment: %v", err)
  283. }
  284. return nil
  285. }