You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

452 lines
10 KiB

  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package vellum
  15. import (
  16. "bytes"
  17. "io"
  18. )
  19. var defaultBuilderOpts = &BuilderOpts{
  20. Encoder: 1,
  21. RegistryTableSize: 10000,
  22. RegistryMRUSize: 2,
  23. }
  24. // A Builder is used to build a new FST. When possible data is
  25. // streamed out to the underlying Writer as soon as possible.
  26. type Builder struct {
  27. unfinished *unfinishedNodes
  28. registry *registry
  29. last []byte
  30. len int
  31. lastAddr int
  32. encoder encoder
  33. opts *BuilderOpts
  34. builderNodePool *builderNodePool
  35. }
  36. const noneAddr = 1
  37. const emptyAddr = 0
  38. // NewBuilder returns a new Builder which will stream out the
  39. // underlying representation to the provided Writer as the set is built.
  40. func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) {
  41. if opts == nil {
  42. opts = defaultBuilderOpts
  43. }
  44. builderNodePool := &builderNodePool{}
  45. rv := &Builder{
  46. unfinished: newUnfinishedNodes(builderNodePool),
  47. registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize),
  48. builderNodePool: builderNodePool,
  49. opts: opts,
  50. lastAddr: noneAddr,
  51. }
  52. var err error
  53. rv.encoder, err = loadEncoder(opts.Encoder, w)
  54. if err != nil {
  55. return nil, err
  56. }
  57. err = rv.encoder.start()
  58. if err != nil {
  59. return nil, err
  60. }
  61. return rv, nil
  62. }
  63. func (b *Builder) Reset(w io.Writer) error {
  64. b.unfinished.Reset()
  65. b.registry.Reset()
  66. b.lastAddr = noneAddr
  67. b.encoder.reset(w)
  68. b.last = nil
  69. b.len = 0
  70. err := b.encoder.start()
  71. if err != nil {
  72. return err
  73. }
  74. return nil
  75. }
  76. // Insert the provided value to the set being built.
  77. // NOTE: values must be inserted in lexicographical order.
  78. func (b *Builder) Insert(key []byte, val uint64) error {
  79. // ensure items are added in lexicographic order
  80. if bytes.Compare(key, b.last) < 0 {
  81. return ErrOutOfOrder
  82. }
  83. if len(key) == 0 {
  84. b.len = 1
  85. b.unfinished.setRootOutput(val)
  86. return nil
  87. }
  88. prefixLen, out := b.unfinished.findCommonPrefixAndSetOutput(key, val)
  89. b.len++
  90. err := b.compileFrom(prefixLen)
  91. if err != nil {
  92. return err
  93. }
  94. b.copyLastKey(key)
  95. b.unfinished.addSuffix(key[prefixLen:], out)
  96. return nil
  97. }
  98. func (b *Builder) copyLastKey(key []byte) {
  99. if b.last == nil {
  100. b.last = make([]byte, 0, 64)
  101. } else {
  102. b.last = b.last[:0]
  103. }
  104. b.last = append(b.last, key...)
  105. }
  106. // Close MUST be called after inserting all values.
  107. func (b *Builder) Close() error {
  108. err := b.compileFrom(0)
  109. if err != nil {
  110. return err
  111. }
  112. root := b.unfinished.popRoot()
  113. rootAddr, err := b.compile(root)
  114. if err != nil {
  115. return err
  116. }
  117. return b.encoder.finish(b.len, rootAddr)
  118. }
  119. func (b *Builder) compileFrom(iState int) error {
  120. addr := noneAddr
  121. for iState+1 < len(b.unfinished.stack) {
  122. var node *builderNode
  123. if addr == noneAddr {
  124. node = b.unfinished.popEmpty()
  125. } else {
  126. node = b.unfinished.popFreeze(addr)
  127. }
  128. var err error
  129. addr, err = b.compile(node)
  130. if err != nil {
  131. return nil
  132. }
  133. }
  134. b.unfinished.topLastFreeze(addr)
  135. return nil
  136. }
  137. func (b *Builder) compile(node *builderNode) (int, error) {
  138. if node.final && len(node.trans) == 0 &&
  139. node.finalOutput == 0 {
  140. return 0, nil
  141. }
  142. found, addr, entry := b.registry.entry(node)
  143. if found {
  144. return addr, nil
  145. }
  146. addr, err := b.encoder.encodeState(node, b.lastAddr)
  147. if err != nil {
  148. return 0, err
  149. }
  150. b.lastAddr = addr
  151. entry.addr = addr
  152. return addr, nil
  153. }
  154. type unfinishedNodes struct {
  155. stack []*builderNodeUnfinished
  156. // cache allocates a reasonable number of builderNodeUnfinished
  157. // objects up front and tries to keep reusing them
  158. // because the main data structure is a stack, we assume the
  159. // same access pattern, and don't track items separately
  160. // this means calls get() and pushXYZ() must be paired,
  161. // as well as calls put() and popXYZ()
  162. cache []builderNodeUnfinished
  163. builderNodePool *builderNodePool
  164. }
  165. func (u *unfinishedNodes) Reset() {
  166. u.stack = u.stack[:0]
  167. for i := 0; i < len(u.cache); i++ {
  168. u.cache[i] = builderNodeUnfinished{}
  169. }
  170. u.pushEmpty(false)
  171. }
  172. func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes {
  173. rv := &unfinishedNodes{
  174. stack: make([]*builderNodeUnfinished, 0, 64),
  175. cache: make([]builderNodeUnfinished, 64),
  176. builderNodePool: p,
  177. }
  178. rv.pushEmpty(false)
  179. return rv
  180. }
  181. // get new builderNodeUnfinished, reusing cache if possible
  182. func (u *unfinishedNodes) get() *builderNodeUnfinished {
  183. if len(u.stack) < len(u.cache) {
  184. return &u.cache[len(u.stack)]
  185. }
  186. // full now allocate a new one
  187. return &builderNodeUnfinished{}
  188. }
  189. // return builderNodeUnfinished, clearing it for reuse
  190. func (u *unfinishedNodes) put() {
  191. if len(u.stack) >= len(u.cache) {
  192. return
  193. // do nothing, not part of cache
  194. }
  195. u.cache[len(u.stack)] = builderNodeUnfinished{}
  196. }
  197. func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte,
  198. out uint64) (int, uint64) {
  199. var i int
  200. for i < len(key) {
  201. if i >= len(u.stack) {
  202. break
  203. }
  204. var addPrefix uint64
  205. if !u.stack[i].hasLastT {
  206. break
  207. }
  208. if u.stack[i].lastIn == key[i] {
  209. commonPre := outputPrefix(u.stack[i].lastOut, out)
  210. addPrefix = outputSub(u.stack[i].lastOut, commonPre)
  211. out = outputSub(out, commonPre)
  212. u.stack[i].lastOut = commonPre
  213. i++
  214. } else {
  215. break
  216. }
  217. if addPrefix != 0 {
  218. u.stack[i].addOutputPrefix(addPrefix)
  219. }
  220. }
  221. return i, out
  222. }
  223. func (u *unfinishedNodes) pushEmpty(final bool) {
  224. next := u.get()
  225. next.node = u.builderNodePool.Get()
  226. next.node.final = final
  227. u.stack = append(u.stack, next)
  228. }
  229. func (u *unfinishedNodes) popRoot() *builderNode {
  230. l := len(u.stack)
  231. var unfinished *builderNodeUnfinished
  232. u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
  233. rv := unfinished.node
  234. u.put()
  235. return rv
  236. }
  237. func (u *unfinishedNodes) popFreeze(addr int) *builderNode {
  238. l := len(u.stack)
  239. var unfinished *builderNodeUnfinished
  240. u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
  241. unfinished.lastCompiled(addr)
  242. rv := unfinished.node
  243. u.put()
  244. return rv
  245. }
  246. func (u *unfinishedNodes) popEmpty() *builderNode {
  247. l := len(u.stack)
  248. var unfinished *builderNodeUnfinished
  249. u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
  250. rv := unfinished.node
  251. u.put()
  252. return rv
  253. }
  254. func (u *unfinishedNodes) setRootOutput(out uint64) {
  255. u.stack[0].node.final = true
  256. u.stack[0].node.finalOutput = out
  257. }
  258. func (u *unfinishedNodes) topLastFreeze(addr int) {
  259. last := len(u.stack) - 1
  260. u.stack[last].lastCompiled(addr)
  261. }
  262. func (u *unfinishedNodes) addSuffix(bs []byte, out uint64) {
  263. if len(bs) == 0 {
  264. return
  265. }
  266. last := len(u.stack) - 1
  267. u.stack[last].hasLastT = true
  268. u.stack[last].lastIn = bs[0]
  269. u.stack[last].lastOut = out
  270. for _, b := range bs[1:] {
  271. next := u.get()
  272. next.node = u.builderNodePool.Get()
  273. next.hasLastT = true
  274. next.lastIn = b
  275. next.lastOut = 0
  276. u.stack = append(u.stack, next)
  277. }
  278. u.pushEmpty(true)
  279. }
  280. type builderNodeUnfinished struct {
  281. node *builderNode
  282. lastOut uint64
  283. lastIn byte
  284. hasLastT bool
  285. }
  286. func (b *builderNodeUnfinished) lastCompiled(addr int) {
  287. if b.hasLastT {
  288. transIn := b.lastIn
  289. transOut := b.lastOut
  290. b.hasLastT = false
  291. b.lastOut = 0
  292. b.node.trans = append(b.node.trans, transition{
  293. in: transIn,
  294. out: transOut,
  295. addr: addr,
  296. })
  297. }
  298. }
  299. func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) {
  300. if b.node.final {
  301. b.node.finalOutput = outputCat(prefix, b.node.finalOutput)
  302. }
  303. for i := range b.node.trans {
  304. b.node.trans[i].out = outputCat(prefix, b.node.trans[i].out)
  305. }
  306. if b.hasLastT {
  307. b.lastOut = outputCat(prefix, b.lastOut)
  308. }
  309. }
  310. type builderNode struct {
  311. finalOutput uint64
  312. trans []transition
  313. final bool
  314. // intrusive linked list
  315. next *builderNode
  316. }
  317. // reset resets the receiver builderNode to a re-usable state.
  318. func (n *builderNode) reset() {
  319. n.final = false
  320. n.finalOutput = 0
  321. for i := range n.trans {
  322. n.trans[i] = emptyTransition
  323. }
  324. n.trans = n.trans[:0]
  325. n.next = nil
  326. }
  327. func (n *builderNode) equiv(o *builderNode) bool {
  328. if n.final != o.final {
  329. return false
  330. }
  331. if n.finalOutput != o.finalOutput {
  332. return false
  333. }
  334. if len(n.trans) != len(o.trans) {
  335. return false
  336. }
  337. for i, ntrans := range n.trans {
  338. otrans := o.trans[i]
  339. if ntrans.in != otrans.in {
  340. return false
  341. }
  342. if ntrans.addr != otrans.addr {
  343. return false
  344. }
  345. if ntrans.out != otrans.out {
  346. return false
  347. }
  348. }
  349. return true
  350. }
  351. var emptyTransition = transition{}
  352. type transition struct {
  353. out uint64
  354. addr int
  355. in byte
  356. }
  357. func outputPrefix(l, r uint64) uint64 {
  358. if l < r {
  359. return l
  360. }
  361. return r
  362. }
  363. func outputSub(l, r uint64) uint64 {
  364. return l - r
  365. }
  366. func outputCat(l, r uint64) uint64 {
  367. return l + r
  368. }
  369. // builderNodePool pools builderNodes using a singly linked list.
  370. //
  371. // NB: builderNode lifecylce is described by the following interactions -
  372. // +------------------------+ +----------------------+
  373. // | Unfinished Nodes | Transfer once | Registry |
  374. // |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) |
  375. // +------------------------+ marked frozen +----------------------+
  376. // ^ |
  377. // | |
  378. // | Put()
  379. // | Get() on +-------------------+ when
  380. // +-new char--------| builderNode Pool |<-----------evicted
  381. // +-------------------+
  382. type builderNodePool struct {
  383. head *builderNode
  384. }
  385. func (p *builderNodePool) Get() *builderNode {
  386. if p.head == nil {
  387. return &builderNode{}
  388. }
  389. head := p.head
  390. p.head = p.head.next
  391. return head
  392. }
  393. func (p *builderNodePool) Put(v *builderNode) {
  394. if v == nil {
  395. return
  396. }
  397. v.reset()
  398. v.next = p.head
  399. p.head = v
  400. }