You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

153 lines
5.7 KiB

  1. # frozen_string_literal: true
  2. require_relative '../../config/boot'
  3. require_relative '../../config/environment'
  4. require_relative 'cli_helper'
  5. module Mastodon
  6. class SearchCLI < Thor
  7. include CLIHelper
  8. # Indices are sorted by amount of data to be expected in each, so that
  9. # smaller indices can go online sooner
  10. INDICES = [
  11. AccountsIndex,
  12. TagsIndex,
  13. StatusesIndex,
  14. ].freeze
  15. option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
  16. option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
  17. desc 'deploy', 'Create or upgrade ElasticSearch indices and populate them'
  18. long_desc <<~LONG_DESC
  19. If ElasticSearch is empty, this command will create the necessary indices
  20. and then import data from the database into those indices.
  21. This command will also upgrade indices if the underlying schema has been
  22. changed since the last run.
  23. Even if creating or upgrading indices is not necessary, data from the
  24. database will be imported into the indices.
  25. LONG_DESC
  26. def deploy
  27. if options[:concurrency] < 1
  28. say('Cannot run with this concurrency setting, must be at least 1', :red)
  29. exit(1)
  30. end
  31. indices = begin
  32. if options[:only]
  33. options[:only].map { |str| "#{str.camelize}Index".constantize }
  34. else
  35. INDICES
  36. end
  37. end
  38. progress = ProgressBar.create(total: nil, format: '%t%c/%u |%b%i| %e (%r docs/s)', autofinish: false)
  39. # First, ensure all indices are created and have the correct
  40. # structure, so that live data can already be written
  41. indices.select { |index| index.specification.changed? }.each do |index|
  42. progress.title = "Upgrading #{index} "
  43. index.purge
  44. index.specification.lock!
  45. end
  46. db_config = ActiveRecord::Base.configurations[Rails.env].dup
  47. db_config['pool'] = options[:concurrency] + 1
  48. ActiveRecord::Base.establish_connection(db_config)
  49. pool = Concurrent::FixedThreadPool.new(options[:concurrency])
  50. added = Concurrent::AtomicFixnum.new(0)
  51. removed = Concurrent::AtomicFixnum.new(0)
  52. progress.title = 'Estimating workload '
  53. # Estimate the amount of data that has to be imported first
  54. indices.each do |index|
  55. index.types.each do |type|
  56. progress.total = (progress.total || 0) + type.adapter.default_scope.count
  57. end
  58. end
  59. # Now import all the actual data. Mind that unlike chewy:sync, we don't
  60. # fetch and compare all record IDs from the database and the index to
  61. # find out which to add and which to remove from the index. Because with
  62. # potentially millions of rows, the memory footprint of such a calculation
  63. # is uneconomical. So we only ever add.
  64. indices.each do |index|
  65. progress.title = "Importing #{index} "
  66. batch_size = 1_000
  67. slice_size = (batch_size / options[:concurrency]).ceil
  68. index.types.each do |type|
  69. type.adapter.default_scope.reorder(nil).find_in_batches(batch_size: batch_size) do |batch|
  70. futures = []
  71. batch.each_slice(slice_size) do |records|
  72. futures << Concurrent::Future.execute(executor: pool) do
  73. begin
  74. if !progress.total.nil? && progress.progress + records.size > progress.total
  75. # The number of items has changed between start and now,
  76. # since there is no good way to predict the final count from
  77. # here, just change the progress bar to an indeterminate one
  78. progress.total = nil
  79. end
  80. grouped_records = nil
  81. bulk_body = nil
  82. index_count = 0
  83. delete_count = 0
  84. ActiveRecord::Base.connection_pool.with_connection do
  85. grouped_records = type.adapter.send(:grouped_objects, records)
  86. bulk_body = Chewy::Type::Import::BulkBuilder.new(type, **grouped_records).bulk_body
  87. end
  88. index_count = grouped_records[:index].size if grouped_records.key?(:index)
  89. delete_count = grouped_records[:delete].size if grouped_records.key?(:delete)
  90. # The following is an optimization for statuses specifically, since
  91. # we want to de-index statuses that cannot be searched by anybody,
  92. # but can't use Chewy's delete_if logic because it doesn't use
  93. # crutches and our searchable_by logic depends on them
  94. if type == StatusesIndex::Status
  95. bulk_body.map! do |entry|
  96. if entry[:index] && entry.dig(:index, :data, 'searchable_by').blank?
  97. index_count -= 1
  98. delete_count += 1
  99. { delete: entry[:index].except(:data) }
  100. else
  101. entry
  102. end
  103. end
  104. end
  105. Chewy::Type::Import::BulkRequest.new(type).perform(bulk_body)
  106. progress.progress += records.size
  107. added.increment(index_count)
  108. removed.increment(delete_count)
  109. sleep 1
  110. rescue => e
  111. progress.log pastel.red("Error importing #{index}: #{e}")
  112. end
  113. end
  114. end
  115. futures.map(&:value)
  116. end
  117. end
  118. end
  119. progress.title = ''
  120. progress.stop
  121. say("Indexed #{added.value} records, de-indexed #{removed.value}", :green, true)
  122. end
  123. end
  124. end