You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

246 lines
5.3 KiB

  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package utf8
  15. import (
  16. "fmt"
  17. "unicode/utf8"
  18. )
  19. // Sequences is a collection of Sequence
  20. type Sequences []Sequence
  21. // NewSequences constructs a collection of Sequence which describe the
  22. // byte ranges covered between the start and end runes.
  23. func NewSequences(start, end rune) (Sequences, error) {
  24. var rv Sequences
  25. var rangeStack rangeStack
  26. rangeStack = rangeStack.Push(&scalarRange{start, end})
  27. rangeStack, r := rangeStack.Pop()
  28. TOP:
  29. for r != nil {
  30. INNER:
  31. for {
  32. r1, r2 := r.split()
  33. if r1 != nil {
  34. rangeStack = rangeStack.Push(&scalarRange{r2.start, r2.end})
  35. r.start = r1.start
  36. r.end = r1.end
  37. continue INNER
  38. }
  39. if !r.valid() {
  40. rangeStack, r = rangeStack.Pop()
  41. continue TOP
  42. }
  43. for i := 1; i < utf8.UTFMax; i++ {
  44. max := maxScalarValue(i)
  45. if r.start <= max && max < r.end {
  46. rangeStack = rangeStack.Push(&scalarRange{max + 1, r.end})
  47. r.end = max
  48. continue INNER
  49. }
  50. }
  51. asciiRange := r.ascii()
  52. if asciiRange != nil {
  53. rv = append(rv, Sequence{
  54. asciiRange,
  55. })
  56. rangeStack, r = rangeStack.Pop()
  57. continue TOP
  58. }
  59. for i := uint(1); i < utf8.UTFMax; i++ {
  60. m := rune((1 << (6 * i)) - 1)
  61. if (r.start & ^m) != (r.end & ^m) {
  62. if (r.start & m) != 0 {
  63. rangeStack = rangeStack.Push(&scalarRange{(r.start | m) + 1, r.end})
  64. r.end = r.start | m
  65. continue INNER
  66. }
  67. if (r.end & m) != m {
  68. rangeStack = rangeStack.Push(&scalarRange{r.end & ^m, r.end})
  69. r.end = (r.end & ^m) - 1
  70. continue INNER
  71. }
  72. }
  73. }
  74. start := make([]byte, utf8.UTFMax)
  75. end := make([]byte, utf8.UTFMax)
  76. n, m := r.encode(start, end)
  77. seq, err := SequenceFromEncodedRange(start[0:n], end[0:m])
  78. if err != nil {
  79. return nil, err
  80. }
  81. rv = append(rv, seq)
  82. rangeStack, r = rangeStack.Pop()
  83. continue TOP
  84. }
  85. }
  86. return rv, nil
  87. }
  88. // Sequence is a collection of *Range
  89. type Sequence []*Range
  90. // SequenceFromEncodedRange creates sequence from the encoded bytes
  91. func SequenceFromEncodedRange(start, end []byte) (Sequence, error) {
  92. if len(start) != len(end) {
  93. return nil, fmt.Errorf("byte slices must be the same length")
  94. }
  95. switch len(start) {
  96. case 2:
  97. return Sequence{
  98. &Range{start[0], end[0]},
  99. &Range{start[1], end[1]},
  100. }, nil
  101. case 3:
  102. return Sequence{
  103. &Range{start[0], end[0]},
  104. &Range{start[1], end[1]},
  105. &Range{start[2], end[2]},
  106. }, nil
  107. case 4:
  108. return Sequence{
  109. &Range{start[0], end[0]},
  110. &Range{start[1], end[1]},
  111. &Range{start[2], end[2]},
  112. &Range{start[3], end[3]},
  113. }, nil
  114. }
  115. return nil, fmt.Errorf("invalid encoded byte length")
  116. }
  117. // Matches checks to see if the provided byte slice matches the Sequence
  118. func (u Sequence) Matches(bytes []byte) bool {
  119. if len(bytes) < len(u) {
  120. return false
  121. }
  122. for i := 0; i < len(u); i++ {
  123. if !u[i].matches(bytes[i]) {
  124. return false
  125. }
  126. }
  127. return true
  128. }
  129. func (u Sequence) String() string {
  130. switch len(u) {
  131. case 1:
  132. return fmt.Sprintf("%v", u[0])
  133. case 2:
  134. return fmt.Sprintf("%v%v", u[0], u[1])
  135. case 3:
  136. return fmt.Sprintf("%v%v%v", u[0], u[1], u[2])
  137. case 4:
  138. return fmt.Sprintf("%v%v%v%v", u[0], u[1], u[2], u[3])
  139. default:
  140. return fmt.Sprintf("invalid utf8 sequence")
  141. }
  142. }
  143. // Range describes a single range of byte values
  144. type Range struct {
  145. Start byte
  146. End byte
  147. }
  148. func (u Range) matches(b byte) bool {
  149. if u.Start <= b && b <= u.End {
  150. return true
  151. }
  152. return false
  153. }
  154. func (u Range) String() string {
  155. if u.Start == u.End {
  156. return fmt.Sprintf("[%X]", u.Start)
  157. }
  158. return fmt.Sprintf("[%X-%X]", u.Start, u.End)
  159. }
  160. type scalarRange struct {
  161. start rune
  162. end rune
  163. }
  164. func (s *scalarRange) String() string {
  165. return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end)
  166. }
  167. // split this scalar range if it overlaps with a surrogate codepoint
  168. func (s *scalarRange) split() (*scalarRange, *scalarRange) {
  169. if s.start < 0xe000 && s.end > 0xd7ff {
  170. return &scalarRange{
  171. start: s.start,
  172. end: 0xd7ff,
  173. },
  174. &scalarRange{
  175. start: 0xe000,
  176. end: s.end,
  177. }
  178. }
  179. return nil, nil
  180. }
  181. func (s *scalarRange) valid() bool {
  182. return s.start <= s.end
  183. }
  184. func (s *scalarRange) ascii() *Range {
  185. if s.valid() && s.end <= 0x7f {
  186. return &Range{
  187. Start: byte(s.start),
  188. End: byte(s.end),
  189. }
  190. }
  191. return nil
  192. }
  193. // start and end MUST have capacity for utf8.UTFMax bytes
  194. func (s *scalarRange) encode(start, end []byte) (int, int) {
  195. n := utf8.EncodeRune(start, s.start)
  196. m := utf8.EncodeRune(end, s.end)
  197. return n, m
  198. }
  199. type rangeStack []*scalarRange
  200. func (s rangeStack) Push(v *scalarRange) rangeStack {
  201. return append(s, v)
  202. }
  203. func (s rangeStack) Pop() (rangeStack, *scalarRange) {
  204. l := len(s)
  205. if l < 1 {
  206. return s, nil
  207. }
  208. return s[:l-1], s[l-1]
  209. }
  210. func maxScalarValue(nbytes int) rune {
  211. switch nbytes {
  212. case 1:
  213. return 0x007f
  214. case 2:
  215. return 0x07FF
  216. case 3:
  217. return 0xFFFF
  218. default:
  219. return 0x10FFFF
  220. }
  221. }