闭社主体 forked from https://github.com/tootsuite/mastodon
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

71 lines
1.6 KiB

  # frozen_string_literal: true
  # Detects the language of a piece of text with CLD3, falling back to the
  # locale configured on the author's account when CLD3 does not report a
  # reliable result.
  #
  # The class is a Singleton: obtain the shared instance with
  # LanguageDetector.instance so the CLD3 identifier and the HTML scrubber
  # are built once and reused.
  class LanguageDetector
    include Singleton

    def initialize
      # NOTE(review): the two arguments are presumably CLD3's minimum and
      # maximum number of bytes to inspect -- confirm against the cld3 gem.
      @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048)
    end

    # Returns the language of +text+.
    #
    # @param text [String] content to analyse; may contain HTML
    # @param account [#user_locale] consulted only when detection is not reliable
    # @return [Symbol, nil] the detected language code, otherwise the
    #   account's locale, or nil when the account has no locale either
    def detect(text, account)
      detect_language_code(text) || default_locale(account)
    end

    # Lists every language name CLD3 exposes, normalised through #iso6391
    # and de-duplicated.
    #
    # The list is derived from a constant, so it is computed on first use
    # and memoized; callers should treat the returned Array as read-only.
    #
    # @return [Array<Symbol>]
    def language_names
      @language_names ||=
        CLD3::TaskContextParams::LANGUAGE_NAMES
        .map { |name| iso6391(name.to_s).to_sym }
        .uniq
    end

    private

    # Removes markup and noise from +text+ and trims surrounding whitespace.
    def prepare_text(text)
      simplify_text(text).strip
    end

    # Runs CLD3 over the cleaned-up text.
    #
    # @return [Symbol, nil] nil unless CLD3 flags its result as reliable
    def detect_language_code(text)
      result = @identifier.find_language(prepare_text(text))
      iso6391(result.language.to_s).to_sym if result.reliable?
    end

    # Reduces a BCP 47 tag to its primary subtag and returns the two-letter
    # code ISO_639 reports for it.
    #
    # @param bcp47 [String] e.g. "zh-Latn"
    # @return [String]
    def iso6391(bcp47)
      iso639 = bcp47.split('-').first

      # CLD3 returns grandfathered language code for Hebrew
      return 'he' if iso639 == 'iw'

      # NOTE(review): presumably ISO_639.find returns nil for an unknown
      # code, which would raise NoMethodError here -- verify.
      ISO_639.find(iso639).alpha2
    end

    # Strips everything that would skew detection: HTML, URLs, mentions,
    # hashtags and custom emoji shortcodes, then collapses whitespace runs
    # into single spaces.
    def simplify_text(text)
      new_text = remove_html(text)
      # gsub! returns nil when nothing matched, so the calls are kept as
      # separate statements rather than chained.
      new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
      new_text.gsub!(Account::MENTION_RE, '')
      new_text.gsub!(Tag::HASHTAG_RE, '')
      new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '')
      new_text.gsub!(/\s+/, ' ')
      new_text
    end

    # Builds a scrubber that only permits <br> and <p> tags.
    def new_scrubber
      scrubber = Rails::Html::PermitScrubber.new
      scrubber.tags = %w(br p)
      scrubber
    end

    # Memoized scrubber shared by every #remove_html call on this instance.
    def scrubber
      @scrubber ||= new_scrubber
    end

    # Scrubs +text+ with the permit-list scrubber, then turns the remaining
    # <br> / <p> markup into newlines.
    def remove_html(text)
      text = Loofah.fragment(text).scrub!(scrubber).to_s
      text.gsub!('<br>', "\n")
      text.gsub!('</p><p>', "\n\n")
      # ^ and $ match at every line boundary, not only at the ends of the string.
      text.gsub!(/(^<p>|<\/p>$)/, '')
      text
    end

    # @return [Symbol, nil] the account's user locale, or nil when it has none
    def default_locale(account)
      account.user_locale&.to_sym
    end
  end