You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

80 lines
1.8 KiB

  1. # frozen_string_literal: true
  # Guesses the language of a piece of text with CLD3, falling back to the
  # posting account's locale when the text is too short or the guess is not
  # reliable.
  #
  # Includes Singleton, so callers go through `LanguageDetector.instance`;
  # the CLD3 identifier, the HTML scrubber and the language list are each
  # built once and kept on that instance.
  class LanguageDetector
    include Singleton

    # Cleaned-up texts shorter than this many characters are not passed to
    # CLD3 at all (see #unreliable_input?).
    CHARACTER_THRESHOLD = 140

    def initialize
      # NOTE(review): the arguments are presumably CLD3's minimum and maximum
      # number of bytes to inspect — confirm against the cld3 gem docs.
      @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048)
    end

    # Detects the language of +text+.
    #
    # @param text [String] raw text, possibly containing HTML
    # @param account [#user_locale] account used for the fallback locale
    # @return [Symbol, nil] nil when nothing is left after clean-up; otherwise
    #   the detected language code or, failing that, the fallback locale
    def detect(text, account)
      input_text = prepare_text(text)
      return if input_text.blank?

      detect_language_code(input_text) || default_locale(account)
    end

    # Language codes CLD3 knows about, normalised through #iso6391 and
    # de-duplicated.
    #
    # The list is computed on first use and then cached on the singleton, so
    # every call returns the same Array object; callers should treat it as
    # read-only.
    #
    # @return [Array<Symbol>]
    def language_names
      @language_names ||=
        CLD3::TaskContextParams::LANGUAGE_NAMES
        .map { |name| iso6391(name.to_s).to_sym }
        .uniq
    end

    private

    # Strips markup and noise from +text+ and trims surrounding whitespace.
    def prepare_text(text)
      simplify_text(text).strip
    end

    # True when +text+ is shorter than CHARACTER_THRESHOLD.
    def unreliable_input?(text)
      text.size < CHARACTER_THRESHOLD
    end

    # Runs CLD3 on +text+.
    #
    # @return [Symbol, nil] the language code, or nil when the text is too
    #   short or CLD3 does not flag its result as reliable
    def detect_language_code(text)
      return if unreliable_input?(text)

      result = @identifier.find_language(text)
      iso6391(result.language.to_s).to_sym if result.reliable?
    end

    # Reduces a BCP 47 style tag (e.g. "zh-Latn") to a two-letter code by
    # taking the part before the first dash and looking it up in ISO_639.
    #
    # @param bcp47 [String]
    # @return [String]
    def iso6391(bcp47)
      iso639 = bcp47.split('-').first

      # CLD3 returns grandfathered language code for Hebrew
      return 'he' if iso639 == 'iw'

      ISO_639.find(iso639).alpha2
    end

    # Removes everything that is not natural-language content: HTML, URLs,
    # mentions, hashtags and custom emoji shortcodes; then collapses runs of
    # whitespace into single spaces.
    def simplify_text(text)
      new_text = remove_html(text)
      # Each gsub! mutates in place; they are deliberately not chained because
      # gsub! returns nil when nothing was replaced.
      new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
      new_text.gsub!(Account::MENTION_RE, '')
      new_text.gsub!(Tag::HASHTAG_RE, '')
      new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '')
      new_text.gsub!(/\s+/, ' ')
      new_text
    end

    # Builds a scrubber whose permitted tag list is just <br> and <p>.
    def new_scrubber
      scrubber = Rails::Html::PermitScrubber.new
      scrubber.tags = %w(br p)
      scrubber
    end

    # Memoised scrubber shared by all calls on this instance.
    def scrubber
      @scrubber ||= new_scrubber
    end

    # Scrubs +text+ with the <br>/<p>-only scrubber, then rewrites those two
    # tags as newlines so that no markup is left in the string.
    def remove_html(text)
      text = Loofah.fragment(text).scrub!(scrubber).to_s
      text.gsub!('<br>', "\n")
      text.gsub!('</p><p>', "\n\n")
      # ^ and $ match at every line boundary, not only at the string's ends.
      text.gsub!(%r{(^<p>|</p>$)}, '')
      text
    end

    # Locale used when detection yields nothing: the account's own locale if
    # it has one, otherwise the application default.
    def default_locale(account)
      account.user_locale&.to_sym || I18n.default_locale
    end
  end