You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

153 lines
5.7 KiB

  1. # frozen_string_literal: true
  2. require_relative '../../config/boot'
  3. require_relative '../../config/environment'
  4. require_relative 'cli_helper'
  5. module Mastodon
  6. class SearchCLI < Thor
  7. include CLIHelper
  8. # Indices are sorted by amount of data to be expected in each, so that
  9. # smaller indices can go online sooner
  10. INDICES = [
  11. AccountsIndex,
  12. TagsIndex,
  13. StatusesIndex,
  14. ].freeze
  15. option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
  16. option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
  17. desc 'deploy', 'Create or upgrade ElasticSearch indices and populate them'
  18. long_desc <<~LONG_DESC
  19. If ElasticSearch is empty, this command will create the necessary indices
  20. and then import data from the database into those indices.
  21. This command will also upgrade indices if the underlying schema has been
  22. changed since the last run.
  23. Even if creating or upgrading indices is not necessary, data from the
  24. database will be imported into the indices.
  25. LONG_DESC
  26. def deploy
  27. if options[:concurrency] < 1
  28. say('Cannot run with this concurrency setting, must be at least 1', :red)
  29. exit(1)
  30. end
  31. indices = begin
  32. if options[:only]
  33. options[:only].map { |str| "#{str.camelize}Index".constantize }
  34. else
  35. INDICES
  36. end
  37. end
  38. progress = ProgressBar.create(total: nil, format: '%t%c/%u |%b%i| %e (%r docs/s)', autofinish: false)
  39. # First, ensure all indices are created and have the correct
  40. # structure, so that live data can already be written
  41. indices.select { |index| index.specification.changed? }.each do |index|
  42. progress.title = "Upgrading #{index} "
  43. index.purge
  44. index.specification.lock!
  45. end
  46. db_config = ActiveRecord::Base.configurations[Rails.env].dup
  47. db_config['pool'] = options[:concurrency] + 1
  48. ActiveRecord::Base.establish_connection(db_config)
  49. pool = Concurrent::FixedThreadPool.new(options[:concurrency])
  50. added = Concurrent::AtomicFixnum.new(0)
  51. removed = Concurrent::AtomicFixnum.new(0)
  52. progress.title = 'Estimating workload '
  53. # Estimate the amount of data that has to be imported first
  54. indices.each do |index|
  55. index.types.each do |type|
  56. progress.total = (progress.total || 0) + type.adapter.default_scope.count
  57. end
  58. end
  59. # Now import all the actual data. Mind that unlike chewy:sync, we don't
  60. # fetch and compare all record IDs from the database and the index to
  61. # find out which to add and which to remove from the index. Because with
  62. # potentially millions of rows, the memory footprint of such a calculation
  63. # is uneconomical. So we only ever add.
  64. indices.each do |index|
  65. progress.title = "Importing #{index} "
  66. batch_size = 1_000
  67. slice_size = (batch_size / options[:concurrency]).ceil
  68. index.types.each do |type|
  69. type.adapter.default_scope.reorder(nil).find_in_batches(batch_size: batch_size) do |batch|
  70. futures = []
  71. batch.each_slice(slice_size) do |records|
  72. futures << Concurrent::Future.execute(executor: pool) do
  73. begin
  74. if !progress.total.nil? && progress.progress + records.size > progress.total
  75. # The number of items has changed between start and now,
  76. # since there is no good way to predict the final count from
  77. # here, just change the progress bar to an indeterminate one
  78. progress.total = nil
  79. end
  80. grouped_records = nil
  81. bulk_body = nil
  82. index_count = 0
  83. delete_count = 0
  84. ActiveRecord::Base.connection_pool.with_connection do
  85. grouped_records = type.adapter.send(:grouped_objects, records)
  86. bulk_body = Chewy::Type::Import::BulkBuilder.new(type, **grouped_records).bulk_body
  87. end
  88. index_count = grouped_records[:index].size if grouped_records.key?(:index)
  89. delete_count = grouped_records[:delete].size if grouped_records.key?(:delete)
  90. # The following is an optimization for statuses specifically, since
  91. # we want to de-index statuses that cannot be searched by anybody,
  92. # but can't use Chewy's delete_if logic because it doesn't use
  93. # crutches and our searchable_by logic depends on them
  94. if type == StatusesIndex::Status
  95. bulk_body.map! do |entry|
  96. if entry[:index] && entry.dig(:index, :data, 'searchable_by').blank?
  97. index_count -= 1
  98. delete_count += 1
  99. { delete: entry[:index].except(:data) }
  100. else
  101. entry
  102. end
  103. end
  104. end
  105. Chewy::Type::Import::BulkRequest.new(type).perform(bulk_body)
  106. progress.progress += records.size
  107. added.increment(index_count)
  108. removed.increment(delete_count)
  109. sleep 1
  110. rescue => e
  111. progress.log pastel.red("Error importing #{index}: #{e}")
  112. end
  113. end
  114. end
  115. futures.map(&:value)
  116. end
  117. end
  118. end
  119. progress.title = ''
  120. progress.stop
  121. say("Indexed #{added.value} records, de-indexed #{removed.value}", :green, true)
  122. end
  123. end
  124. end