From 9a468c895be2b9970a9556e72a9162070337b87c Mon Sep 17 00:00:00 2001 From: Claire Date: Thu, 15 Jul 2021 15:56:58 +0200 Subject: [PATCH] Fix inefficiencies in auto-linking code (#16506) The auto-linking code basically rewrote the whole string escaping non-ascii characters in an inefficient way, and building a full character offset map between the unescaped and escaped texts before sending the contents to TwitterText's extractor. Instead of doing that, this commit changes the TwitterText regexps to include valid IRI characters in addition to valid URI characters. --- app/lib/formatter.rb | 31 +--------------------------- config/initializers/twitter_regex.rb | 4 ++++ 2 files changed, 5 insertions(+), 30 deletions(-) diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb index fd6537526..1610f3689 100644 --- a/app/lib/formatter.rb +++ b/app/lib/formatter.rb @@ -214,39 +214,10 @@ class Formatter result.flatten.join end - UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/ - def utf8_friendly_extractor(text, options = {}) - old_to_new_index = [0] - - escaped = text.chars.map do |c| - output = begin - if c.ord.to_s(16).length > 2 && !UNICODE_ESCAPE_BLACKLIST_RE.match?(c) - CGI.escape(c) - else - c - end - end - - old_to_new_index << old_to_new_index.last + output.length - - output - end.join - # Note: I couldn't obtain list_slug with @user/list-name format # for mention so this requires additional check - special = Extractor.extract_urls_with_indices(escaped, options).map do |extract| - new_indices = [ - old_to_new_index.find_index(extract[:indices].first), - old_to_new_index.find_index(extract[:indices].last), - ] - - next extract.merge( - indices: new_indices, - url: text[new_indices.first..new_indices.last - 1] - ) - end - + special = Extractor.extract_urls_with_indices(text, options) standard = Extractor.extract_entities_with_indices(text, options) extra = Extractor.extract_extra_uris_with_indices(text, options) diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb index 3ff2aa9e5..84c09ff35 100644 --- a/config/initializers/twitter_regex.rb +++ b/config/initializers/twitter_regex.rb @@ -24,6 +24,10 @@ module Twitter::TwitterText ) \) /iox + REGEXEN[:valid_iri_ucschar] = /[\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/iou + REGEXEN[:valid_iri_iprivate] = /[\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/iou + REGEXEN[:valid_url_query_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/iou + REGEXEN[:valid_url_query_ending_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9_&=#\/\-]/iou REGEXEN[:valid_url_path] = /(?: (?: #{REGEXEN[:valid_general_url_path_chars]}*