@@ -16,19 +16,21 @@ module Mastodon
       StatusesIndex,
     ].freeze
 
-    option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
-    option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
+    option :concurrency, type: :numeric, default: 5, aliases: [:c], desc: 'Workload will be split between this number of threads'
+    option :batch_size, type: :numeric, default: 100, aliases: [:b], desc: 'Number of records in each batch'
     option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
+    option :import, type: :boolean, default: true, desc: 'Import data from the database to the index'
+    option :clean, type: :boolean, default: true, desc: 'Remove outdated documents from the index'
     desc 'deploy', 'Create or upgrade Elasticsearch indices and populate them'
     long_desc <<~LONG_DESC
       If Elasticsearch is empty, this command will create the necessary indices
       and then import data from the database into those indices.
 
       This command will also upgrade indices if the underlying schema has been
-      changed since the last run.
+      changed since the last run. Index upgrades erase index data.
 
       Even if creating or upgrading indices is not necessary, data from the
-      database will be imported into the indices.
+      database will be imported into the indices, unless overridden with --no-import.
     LONG_DESC
     def deploy
       if options[:concurrency] < 1
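The two new boolean options make the import and clean-up phases independently skippable: Thor derives `--no-import` and `--no-clean` negations from them, which is what the updated LONG_DESC refers to. A few invocations this hunk makes possible (illustrative only; `--only` values come from the enum above):

```sh
# Full deploy with the new defaults (5 threads, batches of 100)
tootctl search deploy

# Rebuild only the statuses index with more threads
tootctl search deploy --only statuses --concurrency 10

# Clean-up only: remove outdated documents without re-importing
tootctl search deploy --no-import
```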
@@ -49,7 +51,9 @@ module Mastodon
         end
       end
 
-      progress = ProgressBar.create(total: nil, format: '%t%c/%u |%b%i| %e (%r docs/s)', autofinish: false)
+      pool      = Concurrent::FixedThreadPool.new(options[:concurrency], max_queue: options[:concurrency] * 10)
+      importers = indices.index_with { |index| "Importer::#{index.name}Importer".constantize.new(batch_size: options[:batch_size], executor: pool) }
+      progress  = ProgressBar.create(total: nil, format: '%t%c/%u |%b%i| %e (%r docs/s)', autofinish: false)
 
       # First, ensure all indices are created and have the correct
       # structure, so that live data can already be written
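The key move here is that per-index import logic leaves the CLI entirely: `"Importer::#{index.name}Importer".constantize` resolves, by naming convention, to one class per index (e.g. `Importer::StatusesIndexImporter` for `StatusesIndex`), each sharing the bounded thread pool. The method names the CLI calls later in this diff (`estimate!`, `optimize_for_import!`, `on_progress`, `on_failure`, `import!`, `clean_up!`, `optimize_for_search!`) imply a common interface roughly like the sketch below; only the method names are taken from the diff, the bodies are assumptions, not the actual importer implementation:

```ruby
# Sketch of the interface the CLI expects from each importer.
module Importer
  class BaseImporter
    def initialize(batch_size:, executor:)
      @batch_size = batch_size
      @executor   = executor # shared Concurrent::FixedThreadPool
    end

    # Callbacks the CLI registers to drive the progress bar and error log
    def on_progress(&block)
      @on_progress = block
    end

    def on_failure(&block)
      @on_failure = block
    end

    # Expected number of documents, used to size the progress bar
    def estimate!
      raise NotImplementedError
    end

    # Bulk-index records from the database in batches of @batch_size
    def import!
      raise NotImplementedError
    end

    # Remove documents that no longer correspond to database records
    def clean_up!
      raise NotImplementedError
    end

    # Hooks to relax/restore index settings around a bulk load
    def optimize_for_import!; end
    def optimize_for_search!; end
  end
end
```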
@@ -59,99 +63,46 @@ module Mastodon
         index.specification.lock!
       end
 
+      progress.title = 'Estimating workload '
+      progress.total = indices.sum { |index| importers[index].estimate! }
+
       reset_connection_pools!
 
-      pool    = Concurrent::FixedThreadPool.new(options[:concurrency])
-      added   = Concurrent::AtomicFixnum.new(0)
-      removed = Concurrent::AtomicFixnum.new(0)
-
-      progress.title = 'Estimating workload '
-
-      # Estimate the amount of data that has to be imported first
-      progress.total = indices.sum { |index| index.adapter.default_scope.count }
-
-      # Now import all the actual data. Mind that unlike chewy:sync, we don't
-      # fetch and compare all record IDs from the database and the index to
-      # find out which to add and which to remove from the index. Because with
-      # potentially millions of rows, the memory footprint of such a calculation
-      # is uneconomical. So we only ever add.
+      added   = 0
+      removed = 0
+
       indices.each do |index|
-        progress.title = "Importing #{index} "
-        batch_size     = options[:batch_size]
-        slice_size     = (batch_size / options[:concurrency]).ceil
+        importer = importers[index]
+        importer.optimize_for_import!
 
-        index.adapter.default_scope.reorder(nil).find_in_batches(batch_size: batch_size) do |batch|
-          futures = []
+        importer.on_progress do |(indexed, deleted)|
+          progress.total = nil if progress.progress + indexed + deleted > progress.total
+          progress.progress += indexed + deleted
+          added   += indexed
+          removed += deleted
+        end
 
-          batch.each_slice(slice_size) do |records|
-            futures << Concurrent::Future.execute(executor: pool) do
-              begin
-                if !progress.total.nil? && progress.progress + records.size > progress.total
-                  # The number of items has changed between start and now,
-                  # since there is no good way to predict the final count from
-                  # here, just change the progress bar to an indeterminate one
-                  progress.total = nil
-                end
+        importer.on_failure do |reason|
+          progress.log(pastel.red("Error while importing #{index}: #{reason}"))
+        end
 
-                grouped_records = nil
-                bulk_body       = nil
-                index_count     = 0
-                delete_count    = 0
-
-                ActiveRecord::Base.connection_pool.with_connection do
-                  grouped_records = records.to_a.group_by do |record|
-                    index.adapter.send(:delete_from_index?, record) ? :delete : :to_index
-                  end
-
-                  bulk_body = Chewy::Index::Import::BulkBuilder.new(index, **grouped_records).bulk_body
-                end
-
-                index_count  = grouped_records[:to_index].size if grouped_records.key?(:to_index)
-                delete_count = grouped_records[:delete].size   if grouped_records.key?(:delete)
-
-                # The following is an optimization for statuses specifically, since
-                # we want to de-index statuses that cannot be searched by anybody,
-                # but can't use Chewy's delete_if logic because it doesn't use
-                # crutches and our searchable_by logic depends on them
-                if index == StatusesIndex
-                  bulk_body.map! do |entry|
-                    if entry[:to_index] && entry.dig(:to_index, :data, 'searchable_by').blank?
-                      index_count  -= 1
-                      delete_count += 1
-
-                      { delete: entry[:to_index].except(:data) }
-                    else
-                      entry
-                    end
-                  end
-                end
-
-                Chewy::Index::Import::BulkRequest.new(index).perform(bulk_body)
-
-                progress.progress += records.size
-
-                added.increment(index_count)
-                removed.increment(delete_count)
-
-                sleep 1
-              rescue => e
-                progress.log pastel.red("Error importing #{index}: #{e}")
-              ensure
-                RedisConfiguration.pool.checkin if Thread.current[:redis]
-                Thread.current[:redis] = nil
-              end
-            end
-          end
-
-          futures.map(&:value)
-        end
+        if options[:import]
+          progress.title = "Importing #{index} "
+          importer.import!
+        end
+
+        if options[:clean]
+          progress.title = "Cleaning #{index} "
+          importer.clean_up!
+        end
+      ensure
+        importer.optimize_for_search!
       end
 
-      progress.title = ''
-      progress.stop
+      progress.title = 'Done! '
+      progress.finish
 
-      say("Indexed #{added.value} records, de-indexed #{removed.value}", :green, true)
+      say("Indexed #{added} records, de-indexed #{removed}", :green, true)
     end
   end
 end
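One Ruby detail worth noting in the new loop: the `ensure` is attached directly to the `do ... end` block passed to `indices.each`, which Ruby allows since 2.6 without a wrapping `begin`. That guarantees `optimize_for_search!` runs for an index even if its `import!` or `clean_up!` raises, though the exception still propagates and aborts the remaining indices. A minimal standalone illustration, with a hypothetical `import!` standing in for the importer call:

```ruby
# Block-level ensure (Ruby 2.6+): cleanup runs even when the block body raises.
def import!(name)
  raise "bulk request failed for #{name}" if name == 'tags'
  puts "imported #{name}"
end

%w(accounts tags statuses).each do |name|
  import!(name)
ensure
  # Runs for 'tags' too, just before the exception aborts the loop
  puts "restored search settings for #{name}"
end
```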