You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

244 lines
6.8 KiB

  1. # frozen_string_literal: true
  2. require 'singleton'
  3. # See also `app/javascript/features/account/util/bio_metadata.js`.
  # Recognises a small YAML-like "frontmatter" block inside an account bio,
  # removes it from the text and returns its key/value pairs as metadata.
  #
  # The grammar is expressed as a set of regular expressions which are
  # composed from smaller pieces using the `unirex`/`rexstr` helpers below.
  class FrontmatterHandler
    include Singleton

    #  CONVENIENCE FUNCTIONS  #

    # Compiles +str+ into a Regexp in which `.` also matches line breaks.
    #
    # Only two arguments are passed to `Regexp.new`: the legacy third
    # positional argument was deprecated in Ruby 3.2 and is no longer
    # accepted by Ruby 3.3+, and it only ever had an effect for 'n'/'N'.
    #
    # @param str [String] regular expression source
    # @return [Regexp]
    def self.unirex(str)
      Regexp.new(str, Regexp::MULTILINE)
    end

    # Returns the source of +exp+ wrapped in a non-capturing group, so it can
    # be concatenated with other fragments without changing precedence.
    #
    # @param exp [Regexp]
    # @return [String]
    def self.rexstr(exp)
      "(?:#{exp.source})"
    end

    #  CHARACTER CLASSES  #

    # NOTE(review): `^` and `$` are per-line anchors in Ruby, so frontmatter
    # can be recognised at the start of any line of the text. If only the
    # very start/end of the text is intended, `\A`/`\z` would be needed --
    # confirm before changing, as it alters which bios are recognised.
    DOCUMENT_START    = /^/
    DOCUMENT_END      = /$/
    # `c-printable` in the YAML 1.2 spec.
    ALLOWED_CHAR      = /[\t\n\r\u{20}-\u{7e}\u{85}\u{a0}-\u{d7ff}\u{e000}-\u{fffd}\u{10000}-\u{10ffff}]/u
    WHITE_SPACE       = /[ \t]/
    INDENTATION       = / */
    # Real line breaks as well as HTML `<br>` tags count as line breaks.
    LINE_BREAK        = /\r?\n|\r|<br\s*\/?>/
    # Characters which may follow a backslash in a double-quoted string.
    ESCAPE_CHAR       = /[0abt\tnvfre "\/\\N_LP]/
    HEXADECIMAL_CHARS = /[0-9a-fA-F]/
    INDICATOR         = /[-?:,\[\]{}&#*!|>'"%@`]/
    FLOW_CHAR         = /[,\[\]{}]/

    #  NEGATED CHARACTER CLASSES  #

    # Each of these matches any single character at which the corresponding
    # pattern above does NOT match.
    NOT_WHITE_SPACE   = unirex '(?!' + rexstr(WHITE_SPACE) + ').'
    NOT_LINE_BREAK    = unirex '(?!' + rexstr(LINE_BREAK) + ').'
    NOT_INDICATOR     = unirex '(?!' + rexstr(INDICATOR) + ').'
    NOT_FLOW_CHAR     = unirex '(?!' + rexstr(FLOW_CHAR) + ').'
    NOT_ALLOWED_CHAR  = unirex '(?!' + rexstr(ALLOWED_CHAR) + ').'

    #  BASIC CONSTRUCTS  #

    ANY_WHITE_SPACE   = unirex rexstr(WHITE_SPACE) + '*'
    ANY_ALLOWED_CHARS = unirex rexstr(ALLOWED_CHAR) + '*'
    # Optional trailing white space followed by one line break.
    NEW_LINE = unirex(
      rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK)
    )
    # One or more NEW_LINE-like sequences.
    SOME_NEW_LINES = unirex(
      '(?:' + rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK) + ')+'
    )
    # DOCUMENT_START, optionally followed by an opening `<p ...>` tag.
    POSSIBLE_STARTS = unirex(
      rexstr(DOCUMENT_START) + rexstr(/<p[^<>]*>/) + '?'
    )
    # What may follow the closing frontmatter marker.
    POSSIBLE_ENDS = unirex(
      rexstr(SOME_NEW_LINES) + '|' +
      rexstr(DOCUMENT_END) + '|' +
      rexstr(/<\/p>/)
    )
    # A backslash escape as allowed inside double-quoted strings.
    CHARACTER_ESCAPE = unirex(
      rexstr(/\\/) +
      '(?:' +
        rexstr(ESCAPE_CHAR) + '|' +
        rexstr(/x/) + rexstr(HEXADECIMAL_CHARS) + '{2}' + '|' +
        rexstr(/u/) + rexstr(HEXADECIMAL_CHARS) + '{4}' + '|' +
        rexstr(/U/) + rexstr(HEXADECIMAL_CHARS) + '{8}' +
      ')'
    )
    # One character of a double-quoted string: either a plain character
    # (not `"`, `\` or a line break) or a backslash escape.
    ESCAPED_CHAR = unirex(
      rexstr(/(?!["\\])/) + rexstr(NOT_LINE_BREAK) + '|' +
      rexstr(CHARACTER_ESCAPE)
    )
    ANY_ESCAPED_CHARS = unirex(
      rexstr(ESCAPED_CHAR) + '*'
    )
    # One character of a single-quoted string; `''` stands for an apostrophe.
    ESCAPED_APOS = unirex(
      '(?=' + rexstr(NOT_LINE_BREAK) + ')' + rexstr(/[^']|''/)
    )
    ANY_ESCAPED_APOS = unirex(
      rexstr(ESCAPED_APOS) + '*'
    )
    FIRST_KEY_CHAR = unirex(
      '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
      '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
      rexstr(NOT_INDICATOR) + '|' +
      rexstr(/[?:-]/) +
      '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
      '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
      '(?=' + rexstr(NOT_FLOW_CHAR) + ')'
    )
    # Same as FIRST_KEY_CHAR, except that flow indicators are allowed in
    # values (there is no NOT_FLOW_CHAR lookahead).
    FIRST_VALUE_CHAR = unirex(
      '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
      '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
      rexstr(NOT_INDICATOR) + '|' +
      rexstr(/[?:-]/) +
      '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
      '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
    )
    LATER_KEY_CHAR = unirex(
      rexstr(WHITE_SPACE) + '|' +
      '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
      '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
      '(?=' + rexstr(NOT_FLOW_CHAR) + ')' +
      rexstr(/[^:#]#?/) + '|' +
      rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
    )
    # Same as LATER_KEY_CHAR, except that flow indicators are allowed in
    # values (there is no NOT_FLOW_CHAR lookahead).
    LATER_VALUE_CHAR = unirex(
      rexstr(WHITE_SPACE) + '|' +
      '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
      '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
      rexstr(/[^:#]#?/) + '|' +
      rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
    )

    #  YAML CONSTRUCTS  #

    # Opening marker: optional white space followed by `---`.
    YAML_START = unirex(
      rexstr(ANY_WHITE_SPACE) + rexstr(/---/)
    )
    # Closing marker: optional white space followed by `---` or `...`.
    YAML_END = unirex(
      rexstr(ANY_WHITE_SPACE) + rexstr(/(?:---|\.\.\.)/)
    )
    # Zero-width check that an opening marker is eventually followed by a
    # closing marker on its own line.
    YAML_LOOKAHEAD = unirex(
      '(?=' +
        rexstr(YAML_START) +
        rexstr(ANY_ALLOWED_CHARS) + rexstr(NEW_LINE) +
        rexstr(YAML_END) + rexstr(POSSIBLE_ENDS) +
      ')'
    )
    YAML_DOUBLE_QUOTE = unirex(
      rexstr(/"/) + rexstr(ANY_ESCAPED_CHARS) + rexstr(/"/)
    )
    YAML_SINGLE_QUOTE = unirex(
      rexstr(/'/) + rexstr(ANY_ESCAPED_APOS) + rexstr(/'/)
    )
    YAML_SIMPLE_KEY = unirex(
      rexstr(FIRST_KEY_CHAR) + rexstr(LATER_KEY_CHAR) + '*'
    )
    YAML_SIMPLE_VALUE = unirex(
      rexstr(FIRST_VALUE_CHAR) + rexstr(LATER_VALUE_CHAR) + '*'
    )
    YAML_KEY = unirex(
      rexstr(YAML_DOUBLE_QUOTE) + '|' +
      rexstr(YAML_SINGLE_QUOTE) + '|' +
      rexstr(YAML_SIMPLE_KEY)
    )
    YAML_VALUE = unirex(
      rexstr(YAML_DOUBLE_QUOTE) + '|' +
      rexstr(YAML_SINGLE_QUOTE) + '|' +
      rexstr(YAML_SIMPLE_VALUE)
    )
    # A colon followed by at least one white space character.
    YAML_SEPARATOR = unirex(
      rexstr(ANY_WHITE_SPACE) +
      ':' + rexstr(WHITE_SPACE) +
      rexstr(ANY_WHITE_SPACE)
    )
    # `key: value`; capture 1 is the raw key, capture 2 the raw value.
    YAML_LINE = unirex(
      '(' + rexstr(YAML_KEY) + ')' +
      rexstr(YAML_SEPARATOR) +
      '(' + rexstr(YAML_VALUE) + ')'
    )

    #  FRONTMATTER REGEX  #

    # A complete frontmatter block: opening marker, at most five key/value
    # lines which all share the indentation of the first one (capture 1,
    # re-used through the `\1` backreference), and a closing marker.
    YAML_FRONTMATTER = unirex(
      rexstr(POSSIBLE_STARTS) +
      rexstr(YAML_LOOKAHEAD) +
      rexstr(YAML_START) + rexstr(SOME_NEW_LINES) +
      '(?:' +
        '(' + rexstr(INDENTATION) + ')' +
        rexstr(YAML_LINE) + rexstr(SOME_NEW_LINES) +
        '(?:' +
          '\\1' + rexstr(YAML_LINE) + rexstr(SOME_NEW_LINES) +
        '){0,4}' +
      ')?' +
      rexstr(YAML_END) + rexstr(POSSIBLE_ENDS)
    )

    #  SEARCHES  #

    # Finds one key/value line inside an already matched frontmatter block.
    FIND_YAML_LINES = unirex(
      rexstr(NEW_LINE) + rexstr(INDENTATION) + rexstr(YAML_LINE)
    )

    #  STRING PROCESSING  #

    # Replacement text for every single-character escape in ESCAPE_CHAR.
    ESCAPE_REPLACEMENTS = {
      '0'  => "\u{00}",
      'a'  => "\u{07}",
      'b'  => "\u{08}",
      't'  => "\u{09}",
      "\t" => "\u{09}",
      'n'  => "\u{0a}",
      'v'  => "\u{0b}",
      'f'  => "\u{0c}",
      'r'  => "\u{0d}",
      'e'  => "\u{1b}",
      ' '  => "\u{20}",
      '"'  => "\u{22}",
      '/'  => "\u{2f}",
      '\\' => "\u{5c}",
      'N'  => "\u{85}",
      '_'  => "\u{a0}",
      'L'  => "\u{2028}",
      'P'  => "\u{2029}",
    }.freeze

    # Same grammar as CHARACTER_ESCAPE, with named groups so that a single
    # pass can tell simple escapes from hexadecimal code point escapes.
    ESCAPE_SEQUENCE = /\\(?:(?<simple>[0abt\tnvfre "\/\\N_LP])|(?<coded>x\h{2}|u\h{4}|U\h{8}))/

    # Converts a raw key or value, as captured by YAML_LINE, to its text.
    #
    # * Double-quoted: the quotes are dropped and backslash escapes decoded.
    # * Single-quoted: the quotes are dropped and `''` becomes `'`.
    # * Anything else is returned unchanged.
    #
    # @param str [String] raw scalar, including its quotes if any
    # @return [String]
    def process_string(str)
      case str[0]
      when '"'
        unescape_double_quoted(str[1..-2])
      when "'"
        str[1..-2].gsub(/''/, "'")
      else
        str
      end
    end

    #  BIO PROCESSING  #

    # Splits a bio into its remaining text and its frontmatter metadata.
    #
    # `&quot;` and `&apos;` are turned back into quote characters first. If
    # a frontmatter block is found, it is cut out of the text (an opening
    # `<p ...>` tag matched in front of it is kept) and every `key: value`
    # line in it is returned as a two-element array.
    #
    # @param content [String] the bio
    # @return [Hash] `{ text: String, metadata: Array<Array(String, String)> }`
    def process_bio(content)
      text = content.gsub(/&quot;/, '"').gsub(/&apos;/, "'")
      result = { text: text, metadata: [] }

      frontmatter = YAML_FRONTMATTER.match(text)
      return result unless frontmatter

      yaml = frontmatter[0]

      # Use the offsets of the actual match. Searching the text again for
      # the first `---` would remove the wrong span whenever an earlier
      # `---` precedes the frontmatter that was matched.
      kept_prefix = POSSIBLE_STARTS.match(yaml)[0].length
      text[(frontmatter.begin(0) + kept_prefix)...frontmatter.end(0)] = ''

      result[:metadata] = yaml.scan(FIND_YAML_LINES).map do |key, value|
        [process_string(key), process_string(value)]
      end

      result
    end

    private

    # Decodes every backslash escape of a double-quoted string body in one
    # left-to-right pass, so that the output of one escape (for example the
    # backslash produced by `\\`) is never re-read as the start of another.
    def unescape_double_quoted(body)
      body.gsub(ESCAPE_SEQUENCE) do |escape|
        found = Regexp.last_match
        if found[:simple]
          ESCAPE_REPLACEMENTS.fetch(found[:simple])
        else
          # Drop the leading x/u/U; leave the escape as written when it
          # does not denote an encodable code point.
          decode_code_point(found[:coded][1..-1]) || escape
        end
      end
    end

    # Returns the UTF-8 character for a hexadecimal code point, or nil if
    # the value is a surrogate or lies beyond U+10FFFF.
    def decode_code_point(hex_digits)
      code_point = hex_digits.hex
      return nil if code_point > 0x10FFFF || (0xD800..0xDFFF).cover?(code_point)

      code_point.chr(Encoding::UTF_8)
    end
  end