You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

284 lines
9.8 KiB

  1. // Copyright (c) 2015 Couchbase, Inc.
  2. // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
  3. // except in compliance with the License. You may obtain a copy of the License at
  4. // http://www.apache.org/licenses/LICENSE-2.0
  5. // Unless required by applicable law or agreed to in writing, software distributed under the
  6. // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
  7. // either express or implied. See the License for the specific language governing permissions
  8. // and limitations under the License.
  9. package segment
  10. import (
  11. "errors"
  12. "io"
  13. )
  14. // Autogenerate the following:
  15. // 1. Ragel rules from subset of Unicode script properties
  16. // 2. Ragel rules from Unicode word segmentation properties
  17. // 3. Ragel machine for word segmentation
  18. // 4. Test tables from Unicode
  19. //
  20. // Requires:
  21. // 1. Ruby (to generate ragel rules from unicode spec)
  22. // 2. Ragel (only v6.9 tested)
  23. // 3. sed (to rewrite build tags)
  24. //
  25. //go:generate ragel/unicode2ragel.rb -u http://www.unicode.org/Public/8.0.0/ucd/Scripts.txt -m SCRIPTS -p Hangul,Han,Hiragana -o ragel/uscript.rl
  26. //go:generate ragel/unicode2ragel.rb -u http://www.unicode.org/Public/8.0.0/ucd/auxiliary/WordBreakProperty.txt -m WB -p Double_Quote,Single_Quote,Hebrew_Letter,CR,LF,Newline,Extend,Format,Katakana,ALetter,MidLetter,MidNum,MidNumLet,Numeric,ExtendNumLet,Regional_Indicator -o ragel/uwb.rl
  27. //go:generate ragel -T1 -Z segment_words.rl -o segment_words.go
  28. //go:generate sed -i "" -e "s/BUILDTAGS/!prod/" segment_words.go
  29. //go:generate sed -i "" -e "s/RAGELFLAGS/-T1/" segment_words.go
  30. //go:generate ragel -G2 -Z segment_words.rl -o segment_words_prod.go
  31. //go:generate sed -i "" -e "s/BUILDTAGS/prod/" segment_words_prod.go
  32. //go:generate sed -i "" -e "s/RAGELFLAGS/-G2/" segment_words_prod.go
  33. //go:generate go run maketesttables.go -output tables_test.go
  34. // NewWordSegmenter returns a new Segmenter to read from r.
  35. func NewWordSegmenter(r io.Reader) *Segmenter {
  36. return NewSegmenter(r)
  37. }
  38. // NewWordSegmenterDirect returns a new Segmenter to work directly with buf.
  39. func NewWordSegmenterDirect(buf []byte) *Segmenter {
  40. return NewSegmenterDirect(buf)
  41. }
  42. func SplitWords(data []byte, atEOF bool) (int, []byte, error) {
  43. advance, token, _, err := SegmentWords(data, atEOF)
  44. return advance, token, err
  45. }
  46. func SegmentWords(data []byte, atEOF bool) (int, []byte, int, error) {
  47. vals := make([][]byte, 0, 1)
  48. types := make([]int, 0, 1)
  49. tokens, types, advance, err := segmentWords(data, 1, atEOF, vals, types)
  50. if len(tokens) > 0 {
  51. return advance, tokens[0], types[0], err
  52. }
  53. return advance, nil, 0, err
  54. }
  55. func SegmentWordsDirect(data []byte, val [][]byte, types []int) ([][]byte, []int, int, error) {
  56. return segmentWords(data, -1, true, val, types)
  57. }
  58. // *** Core Segmenter
  59. const maxConsecutiveEmptyReads = 100
  60. // NewSegmenter returns a new Segmenter to read from r.
  61. // Defaults to segment using SegmentWords
  62. func NewSegmenter(r io.Reader) *Segmenter {
  63. return &Segmenter{
  64. r: r,
  65. segment: SegmentWords,
  66. maxTokenSize: MaxScanTokenSize,
  67. buf: make([]byte, 4096), // Plausible starting size; needn't be large.
  68. }
  69. }
  70. // NewSegmenterDirect returns a new Segmenter to work directly with buf.
  71. // Defaults to segment using SegmentWords
  72. func NewSegmenterDirect(buf []byte) *Segmenter {
  73. return &Segmenter{
  74. segment: SegmentWords,
  75. maxTokenSize: MaxScanTokenSize,
  76. buf: buf,
  77. start: 0,
  78. end: len(buf),
  79. err: io.EOF,
  80. }
  81. }
  82. // Segmenter provides a convenient interface for reading data such as
  83. // a file of newline-delimited lines of text. Successive calls to
  84. // the Segment method will step through the 'tokens' of a file, skipping
  85. // the bytes between the tokens. The specification of a token is
  86. // defined by a split function of type SplitFunc; the default split
  87. // function breaks the input into lines with line termination stripped. Split
  88. // functions are defined in this package for scanning a file into
  89. // lines, bytes, UTF-8-encoded runes, and space-delimited words. The
  90. // client may instead provide a custom split function.
  91. //
  92. // Segmenting stops unrecoverably at EOF, the first I/O error, or a token too
  93. // large to fit in the buffer. When a scan stops, the reader may have
  94. // advanced arbitrarily far past the last token. Programs that need more
  95. // control over error handling or large tokens, or must run sequential scans
  96. // on a reader, should use bufio.Reader instead.
  97. //
  98. type Segmenter struct {
  99. r io.Reader // The reader provided by the client.
  100. segment SegmentFunc // The function to split the tokens.
  101. maxTokenSize int // Maximum size of a token; modified by tests.
  102. token []byte // Last token returned by split.
  103. buf []byte // Buffer used as argument to split.
  104. start int // First non-processed byte in buf.
  105. end int // End of data in buf.
  106. typ int // The token type
  107. err error // Sticky error.
  108. }
  109. // SegmentFunc is the signature of the segmenting function used to tokenize the
  110. // input. The arguments are an initial substring of the remaining unprocessed
  111. // data and a flag, atEOF, that reports whether the Reader has no more data
  112. // to give. The return values are the number of bytes to advance the input
  113. // and the next token to return to the user, plus an error, if any. If the
  114. // data does not yet hold a complete token, for instance if it has no newline
  115. // while scanning lines, SegmentFunc can return (0, nil, nil) to signal the
  116. // Segmenter to read more data into the slice and try again with a longer slice
  117. // starting at the same point in the input.
  118. //
  119. // If the returned error is non-nil, segmenting stops and the error
  120. // is returned to the client.
  121. //
  122. // The function is never called with an empty data slice unless atEOF
  123. // is true. If atEOF is true, however, data may be non-empty and,
  124. // as always, holds unprocessed text.
  125. type SegmentFunc func(data []byte, atEOF bool) (advance int, token []byte, segmentType int, err error)
  126. // Errors returned by Segmenter.
  127. var (
  128. ErrTooLong = errors.New("bufio.Segmenter: token too long")
  129. ErrNegativeAdvance = errors.New("bufio.Segmenter: SplitFunc returns negative advance count")
  130. ErrAdvanceTooFar = errors.New("bufio.Segmenter: SplitFunc returns advance count beyond input")
  131. )
  132. const (
  133. // Maximum size used to buffer a token. The actual maximum token size
  134. // may be smaller as the buffer may need to include, for instance, a newline.
  135. MaxScanTokenSize = 64 * 1024
  136. )
  137. // Err returns the first non-EOF error that was encountered by the Segmenter.
  138. func (s *Segmenter) Err() error {
  139. if s.err == io.EOF {
  140. return nil
  141. }
  142. return s.err
  143. }
  144. func (s *Segmenter) Type() int {
  145. return s.typ
  146. }
  147. // Bytes returns the most recent token generated by a call to Segment.
  148. // The underlying array may point to data that will be overwritten
  149. // by a subsequent call to Segment. It does no allocation.
  150. func (s *Segmenter) Bytes() []byte {
  151. return s.token
  152. }
  153. // Text returns the most recent token generated by a call to Segment
  154. // as a newly allocated string holding its bytes.
  155. func (s *Segmenter) Text() string {
  156. return string(s.token)
  157. }
  158. // Segment advances the Segmenter to the next token, which will then be
  159. // available through the Bytes or Text method. It returns false when the
  160. // scan stops, either by reaching the end of the input or an error.
  161. // After Segment returns false, the Err method will return any error that
  162. // occurred during scanning, except that if it was io.EOF, Err
  163. // will return nil.
  164. func (s *Segmenter) Segment() bool {
  165. // Loop until we have a token.
  166. for {
  167. // See if we can get a token with what we already have.
  168. if s.end > s.start {
  169. advance, token, typ, err := s.segment(s.buf[s.start:s.end], s.err != nil)
  170. if err != nil {
  171. s.setErr(err)
  172. return false
  173. }
  174. s.typ = typ
  175. if !s.advance(advance) {
  176. return false
  177. }
  178. s.token = token
  179. if token != nil {
  180. return true
  181. }
  182. }
  183. // We cannot generate a token with what we are holding.
  184. // If we've already hit EOF or an I/O error, we are done.
  185. if s.err != nil {
  186. // Shut it down.
  187. s.start = 0
  188. s.end = 0
  189. return false
  190. }
  191. // Must read more data.
  192. // First, shift data to beginning of buffer if there's lots of empty space
  193. // or space is needed.
  194. if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
  195. copy(s.buf, s.buf[s.start:s.end])
  196. s.end -= s.start
  197. s.start = 0
  198. }
  199. // Is the buffer full? If so, resize.
  200. if s.end == len(s.buf) {
  201. if len(s.buf) >= s.maxTokenSize {
  202. s.setErr(ErrTooLong)
  203. return false
  204. }
  205. newSize := len(s.buf) * 2
  206. if newSize > s.maxTokenSize {
  207. newSize = s.maxTokenSize
  208. }
  209. newBuf := make([]byte, newSize)
  210. copy(newBuf, s.buf[s.start:s.end])
  211. s.buf = newBuf
  212. s.end -= s.start
  213. s.start = 0
  214. continue
  215. }
  216. // Finally we can read some input. Make sure we don't get stuck with
  217. // a misbehaving Reader. Officially we don't need to do this, but let's
  218. // be extra careful: Segmenter is for safe, simple jobs.
  219. for loop := 0; ; {
  220. n, err := s.r.Read(s.buf[s.end:len(s.buf)])
  221. s.end += n
  222. if err != nil {
  223. s.setErr(err)
  224. break
  225. }
  226. if n > 0 {
  227. break
  228. }
  229. loop++
  230. if loop > maxConsecutiveEmptyReads {
  231. s.setErr(io.ErrNoProgress)
  232. break
  233. }
  234. }
  235. }
  236. }
  237. // advance consumes n bytes of the buffer. It reports whether the advance was legal.
  238. func (s *Segmenter) advance(n int) bool {
  239. if n < 0 {
  240. s.setErr(ErrNegativeAdvance)
  241. return false
  242. }
  243. if n > s.end-s.start {
  244. s.setErr(ErrAdvanceTooFar)
  245. return false
  246. }
  247. s.start += n
  248. return true
  249. }
  250. // setErr records the first error encountered.
  251. func (s *Segmenter) setErr(err error) {
  252. if s.err == nil || s.err == io.EOF {
  253. s.err = err
  254. }
  255. }
  256. // SetSegmenter sets the segment function for the Segmenter. If called, it must be
  257. // called before Segment.
  258. func (s *Segmenter) SetSegmenter(segmenter SegmentFunc) {
  259. s.segment = segmenter
  260. }