You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

52 lines
1.2 KiB

  1. # frozen_string_literal: true
  # Process-wide (Singleton) helper that guesses the language of a piece of
  # text with CLD3 and falls back to the account's locale when the guess is
  # not reliable.
  class LanguageDetector
    include Singleton

    def initialize
      # NOTE(review): the two arguments are presumably the minimum and maximum
      # number of bytes CLD3 looks at -- confirm against the cld3 gem docs.
      @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048)
    end

    # Detects the language of +text+.
    #
    # text    - the String to analyse (it is not modified).
    # account - an object responding to +user_locale+, used as the fallback.
    #
    # Returns the detected language code as a Symbol when CLD3 reports a
    # reliable result; otherwise the account's locale as a Symbol, or nil when
    # the account has no locale.
    def detect(text, account)
      detect_language_code(text) || default_locale(account)
    end

    # Lists every language CLD3 knows about, reduced to the code produced by
    # +iso6391+ and de-duplicated.
    #
    # The list is computed once and cached on the singleton; the same Array
    # instance is returned on every call, so callers should treat it as
    # read-only.
    #
    # Returns an Array of Symbols.
    def language_names
      @language_names ||=
        CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }
                                               .uniq
    end

    private

    # Strips noise from +text+ and trims surrounding whitespace before it is
    # handed to the identifier.
    def prepare_text(text)
      simplify_text(text).strip
    end

    # Returns the detected language code as a Symbol, or nil when the
    # identifier does not consider its result reliable.
    def detect_language_code(text)
      result = @identifier.find_language(prepare_text(text))
      iso6391(result.language.to_s).to_sym if result.reliable?
    end

    # Reduces a tag such as "zh-Latn" to its primary language subtag and looks
    # up the +alpha2+ value of the matching ISO_639 entry.
    def iso6391(bcp47)
      iso639 = bcp47.split('-').first

      # CLD3 returns grandfathered language code for Hebrew
      return 'he' if iso639 == 'iw'

      ISO_639.find(iso639).alpha2
    end

    # Returns a copy of +text+ with URLs, mentions and hashtags removed and
    # every run of whitespace collapsed to a single space. Works on a
    # duplicate so the caller's string is left untouched.
    def simplify_text(text)
      text.dup.tap do |new_text|
        new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
        new_text.gsub!(Account::MENTION_RE, '')
        new_text.gsub!(Tag::HASHTAG_RE, '')
        new_text.gsub!(/\s+/, ' ')
      end
    end

    # Fallback language: the account's locale as a Symbol, or nil when the
    # account has none.
    def default_locale(account)
      account.user_locale&.to_sym
    end
  end