You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

410 lines
14 KiB

  1. /*
  2. `util/bio_metadata`
  3. ===================
  4. > For more information on the contents of this file, please contact:
  5. >
  6. > - kibigo! [@kibi@glitch.social]
  7. This file provides two functions for dealing with bio metadata. The
  8. functions are:
  9. - __`processBio(content)` :__
  10. Processes `content` to extract any frontmatter. The returned
  11. object has two properties: `text`, which contains the text of
  12. `content` sans-frontmatter, and `metadata`, which is an array
  13. of key-value pairs (in two-element array format). If no
  14. frontmatter was provided in `content`, then `metadata` will be
  15. an empty array.
  16. - __`createBio(note, data)` :__
  17. Reverses the process in `processBio()`; takes a `note` and an
  18. array of two-element arrays (which should give keys and values)
  19. and outputs a string containing a well-formed bio with
  20. frontmatter.
  21. */
  22. // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  23. /*********************************************************************\
  24. To my lovely code maintainers,
  25. The syntax recognized by the Mastodon frontend for its bio metadata
  26. feature is a subset of that provided by the YAML 1.2 specification.
  27. In particular, Mastodon recognizes metadata which is provided as an
  28. implicit YAML map, where each key-value pair takes up only a single
  29. line (no multi-line values are permitted). To simplify the level of
  30. processing required, Mastodon metadata frontmatter has been limited
  31. to only allow those characters in the `c-printable` set, as defined
  32. by the YAML 1.2 specification, instead of permitting those from the
  33. `nb-json` characters inside double-quoted strings like YAML proper.
  34. It is important to note that Mastodon only borrows the *syntax*
  35. of YAML, not its semantics. This is to say, Mastodon won't make any
  36. attempt to interpret the data it receives. `true` will not become a
  37. boolean; `56` will not be interpreted as a number. Rather, each key
  38. and every value will be read as a string, and as a string they will
  39. remain. The order of the pairs is unchanged, and any duplicate keys
  40. are preserved. However, YAML escape sequences will be replaced with
  41. the proper interpretations according to the YAML 1.2 specification.
  42. The implementation provided below interprets `<br>` as `\n` and
  43. allows for an open <p> tag at the beginning of the bio. It replaces
  44. the escaped character entities `&apos;` and `&quot;` with single or
  45. double quotes, respectively, prior to processing. However, no other
  46. escaped characters are replaced, not even those which might have an
  47. impact on the syntax otherwise. These minor allowances are provided
  48. because the Mastodon backend will insert these things automatically
  49. into a bio before sending it through the API, so it is important we
  50. account for them. Aside from this, the YAML frontmatter must be the
  51. very first thing in the bio, leading with three consecutive hyphen-
  52. minuses (`---`), and ending with the same or, alternatively, instead
  53. with three periods (`...`). No limits have been set with respect to
  54. the number of characters permitted in the frontmatter, although one
  55. should note that only limited space is provided for them in the UI.
  56. The regular expression used to check the existence of, and then
  57. process, the YAML frontmatter has been split into a number of small
  58. components in the code below, in the vain hope that it will be much
  59. easier to read and to maintain. I leave it to the future readers of
  60. this code to determine the extent of my successes in this endeavor.
  61. Sending love + warmth eternal,
  62. - kibigo [@kibi@glitch.social]
  63. \*********************************************************************/
  64. /* CONVENIENCE FUNCTIONS */
  65. const unirex = str => new RegExp(str, 'u');
  66. const rexstr = exp => '(?:' + exp.source + ')';
  67. /* CHARACTER CLASSES */
  68. const DOCUMENT_START = /^/;
  69. const DOCUMENT_END = /$/;
  70. const ALLOWED_CHAR = // `c-printable` in the YAML 1.2 spec.
  71. /[\t\n\r\x20-\x7e\x85\xa0-\ud7ff\ue000-\ufffd\u{10000}-\u{10FFFF}]/u;
  72. const WHITE_SPACE = /[ \t]/;
  73. const INDENTATION = / */; // Indentation must be only spaces.
  74. const LINE_BREAK = /\r?\n|\r|<br\s*\/?>/;
  75. const ESCAPE_CHAR = /[0abt\tnvfre "\/\\N_LP]/;
  76. const HEXADECIMAL_CHARS = /[0-9a-fA-F]/;
  77. const INDICATOR = /[-?:,[\]{}&#*!|>'"%@`]/;
  78. const FLOW_CHAR = /[,[\]{}]/;
  79. /* NEGATED CHARACTER CLASSES */
  80. const NOT_WHITE_SPACE = unirex('(?!' + rexstr(WHITE_SPACE) + ')[^]');
  81. const NOT_LINE_BREAK = unirex('(?!' + rexstr(LINE_BREAK) + ')[^]');
  82. const NOT_INDICATOR = unirex('(?!' + rexstr(INDICATOR) + ')[^]');
  83. const NOT_FLOW_CHAR = unirex('(?!' + rexstr(FLOW_CHAR) + ')[^]');
  84. const NOT_ALLOWED_CHAR = unirex(
  85. '(?!' + rexstr(ALLOWED_CHAR) + ')[^]'
  86. );
  87. /* BASIC CONSTRUCTS */
  88. const ANY_WHITE_SPACE = unirex(rexstr(WHITE_SPACE) + '*');
  89. const ANY_ALLOWED_CHARS = unirex(rexstr(ALLOWED_CHAR) + '*');
  90. const NEW_LINE = unirex(
  91. rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK)
  92. );
  93. const SOME_NEW_LINES = unirex(
  94. '(?:' + rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK) + ')+'
  95. );
  96. const POSSIBLE_STARTS = unirex(
  97. rexstr(DOCUMENT_START) + rexstr(/<p[^<>]*>/) + '?'
  98. );
  99. const POSSIBLE_ENDS = unirex(
  100. rexstr(SOME_NEW_LINES) + '|' +
  101. rexstr(DOCUMENT_END) + '|' +
  102. rexstr(/<\/p>/)
  103. );
  104. const CHARACTER_ESCAPE = unirex(
  105. rexstr(/\\/) +
  106. '(?:' +
  107. rexstr(ESCAPE_CHAR) + '|' +
  108. rexstr(/x/) + rexstr(HEXADECIMAL_CHARS) + '{2}' + '|' +
  109. rexstr(/u/) + rexstr(HEXADECIMAL_CHARS) + '{4}' + '|' +
  110. rexstr(/U/) + rexstr(HEXADECIMAL_CHARS) + '{8}' +
  111. ')'
  112. );
  113. const ESCAPED_CHAR = unirex(
  114. rexstr(/(?!["\\])/) + rexstr(NOT_LINE_BREAK) + '|' +
  115. rexstr(CHARACTER_ESCAPE)
  116. );
  117. const ANY_ESCAPED_CHARS = unirex(
  118. rexstr(ESCAPED_CHAR) + '*'
  119. );
  120. const ESCAPED_APOS = unirex(
  121. '(?=' + rexstr(NOT_LINE_BREAK) + ')' + rexstr(/[^']|''/)
  122. );
  123. const ANY_ESCAPED_APOS = unirex(
  124. rexstr(ESCAPED_APOS) + '*'
  125. );
  126. const FIRST_KEY_CHAR = unirex(
  127. '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
  128. '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
  129. rexstr(NOT_INDICATOR) + '|' +
  130. rexstr(/[?:-]/) +
  131. '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
  132. '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
  133. '(?=' + rexstr(NOT_FLOW_CHAR) + ')'
  134. );
  135. const FIRST_VALUE_CHAR = unirex(
  136. '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
  137. '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
  138. rexstr(NOT_INDICATOR) + '|' +
  139. rexstr(/[?:-]/) +
  140. '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
  141. '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
  142. // Flow indicators are allowed in values.
  143. );
  144. const LATER_KEY_CHAR = unirex(
  145. rexstr(WHITE_SPACE) + '|' +
  146. '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
  147. '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
  148. '(?=' + rexstr(NOT_FLOW_CHAR) + ')' +
  149. rexstr(/[^:#]#?/) + '|' +
  150. rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
  151. );
  152. const LATER_VALUE_CHAR = unirex(
  153. rexstr(WHITE_SPACE) + '|' +
  154. '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
  155. '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
  156. // Flow indicators are allowed in values.
  157. rexstr(/[^:#]#?/) + '|' +
  158. rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
  159. );
  160. /* YAML CONSTRUCTS */
  161. const YAML_START = unirex(
  162. rexstr(ANY_WHITE_SPACE) + rexstr(/---/)
  163. );
  164. const YAML_END = unirex(
  165. rexstr(ANY_WHITE_SPACE) + rexstr(/(?:---|\.\.\.)/)
  166. );
  167. const YAML_LOOKAHEAD = unirex(
  168. '(?=' +
  169. rexstr(YAML_START) +
  170. rexstr(ANY_ALLOWED_CHARS) + rexstr(NEW_LINE) +
  171. rexstr(YAML_END) + rexstr(POSSIBLE_ENDS) +
  172. ')'
  173. );
  174. const YAML_DOUBLE_QUOTE = unirex(
  175. rexstr(/"/) + rexstr(ANY_ESCAPED_CHARS) + rexstr(/"/)
  176. );
  177. const YAML_SINGLE_QUOTE = unirex(
  178. rexstr(/'/) + rexstr(ANY_ESCAPED_APOS) + rexstr(/'/)
  179. );
  180. const YAML_SIMPLE_KEY = unirex(
  181. rexstr(FIRST_KEY_CHAR) + rexstr(LATER_KEY_CHAR) + '*'
  182. );
  183. const YAML_SIMPLE_VALUE = unirex(
  184. rexstr(FIRST_VALUE_CHAR) + rexstr(LATER_VALUE_CHAR) + '*'
  185. );
  186. const YAML_KEY = unirex(
  187. rexstr(YAML_DOUBLE_QUOTE) + '|' +
  188. rexstr(YAML_SINGLE_QUOTE) + '|' +
  189. rexstr(YAML_SIMPLE_KEY)
  190. );
  191. const YAML_VALUE = unirex(
  192. rexstr(YAML_DOUBLE_QUOTE) + '|' +
  193. rexstr(YAML_SINGLE_QUOTE) + '|' +
  194. rexstr(YAML_SIMPLE_VALUE)
  195. );
  196. const YAML_SEPARATOR = unirex(
  197. rexstr(ANY_WHITE_SPACE) +
  198. ':' + rexstr(WHITE_SPACE) +
  199. rexstr(ANY_WHITE_SPACE)
  200. );
  201. const YAML_LINE = unirex(
  202. '(' + rexstr(YAML_KEY) + ')' +
  203. rexstr(YAML_SEPARATOR) +
  204. '(' + rexstr(YAML_VALUE) + ')'
  205. );
  206. /* FRONTMATTER REGEX */
  207. const YAML_FRONTMATTER = unirex(
  208. rexstr(POSSIBLE_STARTS) +
  209. rexstr(YAML_LOOKAHEAD) +
  210. rexstr(YAML_START) + rexstr(SOME_NEW_LINES) +
  211. '(?:' +
  212. '(' + rexstr(INDENTATION) + ')' +
  213. rexstr(YAML_LINE) + rexstr(SOME_NEW_LINES) +
  214. '(?:' +
  215. '\\1' + rexstr(YAML_LINE) + rexstr(SOME_NEW_LINES) +
  216. '){0,4}' +
  217. ')?' +
  218. rexstr(YAML_END) + rexstr(POSSIBLE_ENDS)
  219. );
  220. /* SEARCHES */
  221. const FIND_YAML_LINES = unirex(
  222. rexstr(NEW_LINE) + rexstr(INDENTATION) + rexstr(YAML_LINE)
  223. );
  224. /* STRING PROCESSING */
  225. function processString(str) {
  226. switch (str.charAt(0)) {
  227. case '"':
  228. return str
  229. .substring(1, str.length - 1)
  230. .replace(/\\0/g, '\x00')
  231. .replace(/\\a/g, '\x07')
  232. .replace(/\\b/g, '\x08')
  233. .replace(/\\t/g, '\x09')
  234. .replace(/\\\x09/g, '\x09')
  235. .replace(/\\n/g, '\x0a')
  236. .replace(/\\v/g, '\x0b')
  237. .replace(/\\f/g, '\x0c')
  238. .replace(/\\r/g, '\x0d')
  239. .replace(/\\e/g, '\x1b')
  240. .replace(/\\ /g, '\x20')
  241. .replace(/\\"/g, '\x22')
  242. .replace(/\\\//g, '\x2f')
  243. .replace(/\\\\/g, '\x5c')
  244. .replace(/\\N/g, '\x85')
  245. .replace(/\\_/g, '\xa0')
  246. .replace(/\\L/g, '\u2028')
  247. .replace(/\\P/g, '\u2029')
  248. .replace(
  249. new RegExp(
  250. unirex(
  251. rexstr(/\\x/) + '(' + rexstr(HEXADECIMAL_CHARS) + '{2})'
  252. ), 'gu'
  253. ), (_, n) => String.fromCodePoint('0x' + n)
  254. )
  255. .replace(
  256. new RegExp(
  257. unirex(
  258. rexstr(/\\u/) + '(' + rexstr(HEXADECIMAL_CHARS) + '{4})'
  259. ), 'gu'
  260. ), (_, n) => String.fromCodePoint('0x' + n)
  261. )
  262. .replace(
  263. new RegExp(
  264. unirex(
  265. rexstr(/\\U/) + '(' + rexstr(HEXADECIMAL_CHARS) + '{8})'
  266. ), 'gu'
  267. ), (_, n) => String.fromCodePoint('0x' + n)
  268. );
  269. case '\'':
  270. return str
  271. .substring(1, str.length - 1)
  272. .replace(/''/g, '\'');
  273. default:
  274. return str;
  275. }
  276. }
  277. /* BIO PROCESSING */
  278. export function processBio(content) {
  279. content = content.replace(/&quot;/g, '"').replace(/&apos;/g, '\'');
  280. let result = {
  281. text: content,
  282. metadata: [],
  283. };
  284. let yaml = content.match(YAML_FRONTMATTER);
  285. if (!yaml) return result;
  286. else yaml = yaml[0];
  287. let start = content.search(YAML_START);
  288. let end = start + yaml.length - yaml.search(YAML_START);
  289. result.text = content.substr(0, start) + content.substr(end);
  290. let metadata = null;
  291. let query = new RegExp(FIND_YAML_LINES, 'g');
  292. while ((metadata = query.exec(yaml))) {
  293. result.metadata.push([
  294. processString(metadata[1]),
  295. processString(metadata[2]),
  296. ]);
  297. }
  298. return result;
  299. }
  300. /* BIO CREATION */
  301. export function createBio(note, data) {
  302. if (!note) note = '';
  303. let frontmatter = '';
  304. if ((data && data.length) || note.match(/^\s*---\s+/)) {
  305. if (!data) frontmatter = '---\n...\n';
  306. else {
  307. frontmatter += '---\n';
  308. for (let i = 0; i < data.length; i++) {
  309. let key = '' + data[i][0];
  310. let val = '' + data[i][1];
  311. // Key processing
  312. if (key === (key.match(YAML_SIMPLE_KEY) || [])[0]) /* do nothing */;
  313. else if (key.indexOf('\'') === -1 && key === (key.match(ANY_ESCAPED_APOS) || [])[0]) key = '\'' + key + '\'';
  314. else {
  315. key = key
  316. .replace(/\x00/g, '\\0')
  317. .replace(/\x07/g, '\\a')
  318. .replace(/\x08/g, '\\b')
  319. .replace(/\x0a/g, '\\n')
  320. .replace(/\x0b/g, '\\v')
  321. .replace(/\x0c/g, '\\f')
  322. .replace(/\x0d/g, '\\r')
  323. .replace(/\x1b/g, '\\e')
  324. .replace(/\x22/g, '\\"')
  325. .replace(/\x5c/g, '\\\\');
  326. let badchars = key.match(
  327. new RegExp(rexstr(NOT_ALLOWED_CHAR), 'gu')
  328. ) || [];
  329. for (let j = 0; j < badchars.length; j++) {
  330. key = key.replace(
  331. badchars[i],
  332. '\\u' + badchars[i].codePointAt(0).toLocaleString('en', {
  333. useGrouping: false,
  334. minimumIntegerDigits: 4,
  335. })
  336. );
  337. }
  338. key = '"' + key + '"';
  339. }
  340. // Value processing
  341. if (val === (val.match(YAML_SIMPLE_VALUE) || [])[0]) /* do nothing */;
  342. else if (val.indexOf('\'') === -1 && val === (val.match(ANY_ESCAPED_APOS) || [])[0]) val = '\'' + val + '\'';
  343. else {
  344. val = val
  345. .replace(/\x00/g, '\\0')
  346. .replace(/\x07/g, '\\a')
  347. .replace(/\x08/g, '\\b')
  348. .replace(/\x0a/g, '\\n')
  349. .replace(/\x0b/g, '\\v')
  350. .replace(/\x0c/g, '\\f')
  351. .replace(/\x0d/g, '\\r')
  352. .replace(/\x1b/g, '\\e')
  353. .replace(/\x22/g, '\\"')
  354. .replace(/\x5c/g, '\\\\');
  355. let badchars = val.match(
  356. new RegExp(rexstr(NOT_ALLOWED_CHAR), 'gu')
  357. ) || [];
  358. for (let j = 0; j < badchars.length; j++) {
  359. val = val.replace(
  360. badchars[i],
  361. '\\u' + badchars[i].codePointAt(0).toLocaleString('en', {
  362. useGrouping: false,
  363. minimumIntegerDigits: 4,
  364. })
  365. );
  366. }
  367. val = '"' + val + '"';
  368. }
  369. frontmatter += key + ': ' + val + '\n';
  370. }
  371. frontmatter += '...\n';
  372. }
  373. }
  374. return frontmatter + note;
  375. }