You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

412 lines
13 KiB

  1. package yaml
  2. import (
  3. "io"
  4. )
  5. // Set the reader error and return 0.
  6. func yaml_parser_set_reader_error(parser *yaml_parser_t, problem string, offset int, value int) bool {
  7. parser.error = yaml_READER_ERROR
  8. parser.problem = problem
  9. parser.problem_offset = offset
  10. parser.problem_value = value
  11. return false
  12. }
  // Byte order marks that may prefix the input stream.
  const (
  	// bom_UTF8 is the three-byte UTF-8 signature.
  	bom_UTF8 = "\xef\xbb\xbf"
  	// bom_UTF16LE is the two-byte little-endian UTF-16 mark.
  	bom_UTF16LE = "\xff\xfe"
  	// bom_UTF16BE is the two-byte big-endian UTF-16 mark.
  	bom_UTF16BE = "\xfe\xff"
  )
  19. // Determine the input stream encoding by checking the BOM symbol. If no BOM is
  20. // found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
  21. func yaml_parser_determine_encoding(parser *yaml_parser_t) bool {
  22. // Ensure that we had enough bytes in the raw buffer.
  23. for !parser.eof && len(parser.raw_buffer)-parser.raw_buffer_pos < 3 {
  24. if !yaml_parser_update_raw_buffer(parser) {
  25. return false
  26. }
  27. }
  28. // Determine the encoding.
  29. buf := parser.raw_buffer
  30. pos := parser.raw_buffer_pos
  31. avail := len(buf) - pos
  32. if avail >= 2 && buf[pos] == bom_UTF16LE[0] && buf[pos+1] == bom_UTF16LE[1] {
  33. parser.encoding = yaml_UTF16LE_ENCODING
  34. parser.raw_buffer_pos += 2
  35. parser.offset += 2
  36. } else if avail >= 2 && buf[pos] == bom_UTF16BE[0] && buf[pos+1] == bom_UTF16BE[1] {
  37. parser.encoding = yaml_UTF16BE_ENCODING
  38. parser.raw_buffer_pos += 2
  39. parser.offset += 2
  40. } else if avail >= 3 && buf[pos] == bom_UTF8[0] && buf[pos+1] == bom_UTF8[1] && buf[pos+2] == bom_UTF8[2] {
  41. parser.encoding = yaml_UTF8_ENCODING
  42. parser.raw_buffer_pos += 3
  43. parser.offset += 3
  44. } else {
  45. parser.encoding = yaml_UTF8_ENCODING
  46. }
  47. return true
  48. }
  49. // Update the raw buffer.
  50. func yaml_parser_update_raw_buffer(parser *yaml_parser_t) bool {
  51. size_read := 0
  52. // Return if the raw buffer is full.
  53. if parser.raw_buffer_pos == 0 && len(parser.raw_buffer) == cap(parser.raw_buffer) {
  54. return true
  55. }
  56. // Return on EOF.
  57. if parser.eof {
  58. return true
  59. }
  60. // Move the remaining bytes in the raw buffer to the beginning.
  61. if parser.raw_buffer_pos > 0 && parser.raw_buffer_pos < len(parser.raw_buffer) {
  62. copy(parser.raw_buffer, parser.raw_buffer[parser.raw_buffer_pos:])
  63. }
  64. parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)-parser.raw_buffer_pos]
  65. parser.raw_buffer_pos = 0
  66. // Call the read handler to fill the buffer.
  67. size_read, err := parser.read_handler(parser, parser.raw_buffer[len(parser.raw_buffer):cap(parser.raw_buffer)])
  68. parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)+size_read]
  69. if err == io.EOF {
  70. parser.eof = true
  71. } else if err != nil {
  72. return yaml_parser_set_reader_error(parser, "input error: "+err.Error(), parser.offset, -1)
  73. }
  74. return true
  75. }
  76. // Ensure that the buffer contains at least `length` characters.
  77. // Return true on success, false on failure.
  78. //
  79. // The length is supposed to be significantly less that the buffer size.
  80. func yaml_parser_update_buffer(parser *yaml_parser_t, length int) bool {
  81. if parser.read_handler == nil {
  82. panic("read handler must be set")
  83. }
  84. // [Go] This function was changed to guarantee the requested length size at EOF.
  85. // The fact we need to do this is pretty awful, but the description above implies
  86. // for that to be the case, and there are tests
  87. // If the EOF flag is set and the raw buffer is empty, do nothing.
  88. if parser.eof && parser.raw_buffer_pos == len(parser.raw_buffer) {
  89. // [Go] ACTUALLY! Read the documentation of this function above.
  90. // This is just broken. To return true, we need to have the
  91. // given length in the buffer. Not doing that means every single
  92. // check that calls this function to make sure the buffer has a
  93. // given length is Go) panicking; or C) accessing invalid memory.
  94. //return true
  95. }
  96. // Return if the buffer contains enough characters.
  97. if parser.unread >= length {
  98. return true
  99. }
  100. // Determine the input encoding if it is not known yet.
  101. if parser.encoding == yaml_ANY_ENCODING {
  102. if !yaml_parser_determine_encoding(parser) {
  103. return false
  104. }
  105. }
  106. // Move the unread characters to the beginning of the buffer.
  107. buffer_len := len(parser.buffer)
  108. if parser.buffer_pos > 0 && parser.buffer_pos < buffer_len {
  109. copy(parser.buffer, parser.buffer[parser.buffer_pos:])
  110. buffer_len -= parser.buffer_pos
  111. parser.buffer_pos = 0
  112. } else if parser.buffer_pos == buffer_len {
  113. buffer_len = 0
  114. parser.buffer_pos = 0
  115. }
  116. // Open the whole buffer for writing, and cut it before returning.
  117. parser.buffer = parser.buffer[:cap(parser.buffer)]
  118. // Fill the buffer until it has enough characters.
  119. first := true
  120. for parser.unread < length {
  121. // Fill the raw buffer if necessary.
  122. if !first || parser.raw_buffer_pos == len(parser.raw_buffer) {
  123. if !yaml_parser_update_raw_buffer(parser) {
  124. parser.buffer = parser.buffer[:buffer_len]
  125. return false
  126. }
  127. }
  128. first = false
  129. // Decode the raw buffer.
  130. inner:
  131. for parser.raw_buffer_pos != len(parser.raw_buffer) {
  132. var value rune
  133. var width int
  134. raw_unread := len(parser.raw_buffer) - parser.raw_buffer_pos
  135. // Decode the next character.
  136. switch parser.encoding {
  137. case yaml_UTF8_ENCODING:
  138. // Decode a UTF-8 character. Check RFC 3629
  139. // (http://www.ietf.org/rfc/rfc3629.txt) for more details.
  140. //
  141. // The following table (taken from the RFC) is used for
  142. // decoding.
  143. //
  144. // Char. number range | UTF-8 octet sequence
  145. // (hexadecimal) | (binary)
  146. // --------------------+------------------------------------
  147. // 0000 0000-0000 007F | 0xxxxxxx
  148. // 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  149. // 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  150. // 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  151. //
  152. // Additionally, the characters in the range 0xD800-0xDFFF
  153. // are prohibited as they are reserved for use with UTF-16
  154. // surrogate pairs.
  155. // Determine the length of the UTF-8 sequence.
  156. octet := parser.raw_buffer[parser.raw_buffer_pos]
  157. switch {
  158. case octet&0x80 == 0x00:
  159. width = 1
  160. case octet&0xE0 == 0xC0:
  161. width = 2
  162. case octet&0xF0 == 0xE0:
  163. width = 3
  164. case octet&0xF8 == 0xF0:
  165. width = 4
  166. default:
  167. // The leading octet is invalid.
  168. return yaml_parser_set_reader_error(parser,
  169. "invalid leading UTF-8 octet",
  170. parser.offset, int(octet))
  171. }
  172. // Check if the raw buffer contains an incomplete character.
  173. if width > raw_unread {
  174. if parser.eof {
  175. return yaml_parser_set_reader_error(parser,
  176. "incomplete UTF-8 octet sequence",
  177. parser.offset, -1)
  178. }
  179. break inner
  180. }
  181. // Decode the leading octet.
  182. switch {
  183. case octet&0x80 == 0x00:
  184. value = rune(octet & 0x7F)
  185. case octet&0xE0 == 0xC0:
  186. value = rune(octet & 0x1F)
  187. case octet&0xF0 == 0xE0:
  188. value = rune(octet & 0x0F)
  189. case octet&0xF8 == 0xF0:
  190. value = rune(octet & 0x07)
  191. default:
  192. value = 0
  193. }
  194. // Check and decode the trailing octets.
  195. for k := 1; k < width; k++ {
  196. octet = parser.raw_buffer[parser.raw_buffer_pos+k]
  197. // Check if the octet is valid.
  198. if (octet & 0xC0) != 0x80 {
  199. return yaml_parser_set_reader_error(parser,
  200. "invalid trailing UTF-8 octet",
  201. parser.offset+k, int(octet))
  202. }
  203. // Decode the octet.
  204. value = (value << 6) + rune(octet&0x3F)
  205. }
  206. // Check the length of the sequence against the value.
  207. switch {
  208. case width == 1:
  209. case width == 2 && value >= 0x80:
  210. case width == 3 && value >= 0x800:
  211. case width == 4 && value >= 0x10000:
  212. default:
  213. return yaml_parser_set_reader_error(parser,
  214. "invalid length of a UTF-8 sequence",
  215. parser.offset, -1)
  216. }
  217. // Check the range of the value.
  218. if value >= 0xD800 && value <= 0xDFFF || value > 0x10FFFF {
  219. return yaml_parser_set_reader_error(parser,
  220. "invalid Unicode character",
  221. parser.offset, int(value))
  222. }
  223. case yaml_UTF16LE_ENCODING, yaml_UTF16BE_ENCODING:
  224. var low, high int
  225. if parser.encoding == yaml_UTF16LE_ENCODING {
  226. low, high = 0, 1
  227. } else {
  228. low, high = 1, 0
  229. }
  230. // The UTF-16 encoding is not as simple as one might
  231. // naively think. Check RFC 2781
  232. // (http://www.ietf.org/rfc/rfc2781.txt).
  233. //
  234. // Normally, two subsequent bytes describe a Unicode
  235. // character. However a special technique (called a
  236. // surrogate pair) is used for specifying character
  237. // values larger than 0xFFFF.
  238. //
  239. // A surrogate pair consists of two pseudo-characters:
  240. // high surrogate area (0xD800-0xDBFF)
  241. // low surrogate area (0xDC00-0xDFFF)
  242. //
  243. // The following formulas are used for decoding
  244. // and encoding characters using surrogate pairs:
  245. //
  246. // U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
  247. // U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
  248. // W1 = 110110yyyyyyyyyy
  249. // W2 = 110111xxxxxxxxxx
  250. //
  251. // where U is the character value, W1 is the high surrogate
  252. // area, W2 is the low surrogate area.
  253. // Check for incomplete UTF-16 character.
  254. if raw_unread < 2 {
  255. if parser.eof {
  256. return yaml_parser_set_reader_error(parser,
  257. "incomplete UTF-16 character",
  258. parser.offset, -1)
  259. }
  260. break inner
  261. }
  262. // Get the character.
  263. value = rune(parser.raw_buffer[parser.raw_buffer_pos+low]) +
  264. (rune(parser.raw_buffer[parser.raw_buffer_pos+high]) << 8)
  265. // Check for unexpected low surrogate area.
  266. if value&0xFC00 == 0xDC00 {
  267. return yaml_parser_set_reader_error(parser,
  268. "unexpected low surrogate area",
  269. parser.offset, int(value))
  270. }
  271. // Check for a high surrogate area.
  272. if value&0xFC00 == 0xD800 {
  273. width = 4
  274. // Check for incomplete surrogate pair.
  275. if raw_unread < 4 {
  276. if parser.eof {
  277. return yaml_parser_set_reader_error(parser,
  278. "incomplete UTF-16 surrogate pair",
  279. parser.offset, -1)
  280. }
  281. break inner
  282. }
  283. // Get the next character.
  284. value2 := rune(parser.raw_buffer[parser.raw_buffer_pos+low+2]) +
  285. (rune(parser.raw_buffer[parser.raw_buffer_pos+high+2]) << 8)
  286. // Check for a low surrogate area.
  287. if value2&0xFC00 != 0xDC00 {
  288. return yaml_parser_set_reader_error(parser,
  289. "expected low surrogate area",
  290. parser.offset+2, int(value2))
  291. }
  292. // Generate the value of the surrogate pair.
  293. value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF)
  294. } else {
  295. width = 2
  296. }
  297. default:
  298. panic("impossible")
  299. }
  300. // Check if the character is in the allowed range:
  301. // #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
  302. // | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
  303. // | [#x10000-#x10FFFF] (32 bit)
  304. switch {
  305. case value == 0x09:
  306. case value == 0x0A:
  307. case value == 0x0D:
  308. case value >= 0x20 && value <= 0x7E:
  309. case value == 0x85:
  310. case value >= 0xA0 && value <= 0xD7FF:
  311. case value >= 0xE000 && value <= 0xFFFD:
  312. case value >= 0x10000 && value <= 0x10FFFF:
  313. default:
  314. return yaml_parser_set_reader_error(parser,
  315. "control characters are not allowed",
  316. parser.offset, int(value))
  317. }
  318. // Move the raw pointers.
  319. parser.raw_buffer_pos += width
  320. parser.offset += width
  321. // Finally put the character into the buffer.
  322. if value <= 0x7F {
  323. // 0000 0000-0000 007F . 0xxxxxxx
  324. parser.buffer[buffer_len+0] = byte(value)
  325. buffer_len += 1
  326. } else if value <= 0x7FF {
  327. // 0000 0080-0000 07FF . 110xxxxx 10xxxxxx
  328. parser.buffer[buffer_len+0] = byte(0xC0 + (value >> 6))
  329. parser.buffer[buffer_len+1] = byte(0x80 + (value & 0x3F))
  330. buffer_len += 2
  331. } else if value <= 0xFFFF {
  332. // 0000 0800-0000 FFFF . 1110xxxx 10xxxxxx 10xxxxxx
  333. parser.buffer[buffer_len+0] = byte(0xE0 + (value >> 12))
  334. parser.buffer[buffer_len+1] = byte(0x80 + ((value >> 6) & 0x3F))
  335. parser.buffer[buffer_len+2] = byte(0x80 + (value & 0x3F))
  336. buffer_len += 3
  337. } else {
  338. // 0001 0000-0010 FFFF . 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  339. parser.buffer[buffer_len+0] = byte(0xF0 + (value >> 18))
  340. parser.buffer[buffer_len+1] = byte(0x80 + ((value >> 12) & 0x3F))
  341. parser.buffer[buffer_len+2] = byte(0x80 + ((value >> 6) & 0x3F))
  342. parser.buffer[buffer_len+3] = byte(0x80 + (value & 0x3F))
  343. buffer_len += 4
  344. }
  345. parser.unread++
  346. }
  347. // On EOF, put NUL into the buffer and return.
  348. if parser.eof {
  349. parser.buffer[buffer_len] = 0
  350. buffer_len++
  351. parser.unread++
  352. break
  353. }
  354. }
  355. // [Go] Read the documentation of this function above. To return true,
  356. // we need to have the given length in the buffer. Not doing that means
  357. // every single check that calls this function to make sure the buffer
  358. // has a given length is Go) panicking; or C) accessing invalid memory.
  359. // This happens here due to the EOF above breaking early.
  360. for buffer_len < length {
  361. parser.buffer[buffer_len] = 0
  362. buffer_len++
  363. }
  364. parser.buffer = parser.buffer[:buffer_len]
  365. return true
  366. }