Change importers to avoid a few inefficiencies (#26721)

This commit is contained in:
Claire 2023-08-31 19:04:27 +02:00 committed by GitHub
parent ecd76fa413
commit 9bb2fb6b14
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 34 additions and 33 deletions

View file

@@ -4,10 +4,10 @@ class Importer::AccountsIndexImporter < Importer::BaseImporter
def import! def import!
scope.includes(:account_stat).find_in_batches(batch_size: @batch_size) do |tmp| scope.includes(:account_stat).find_in_batches(batch_size: @batch_size) do |tmp|
in_work_unit(tmp) do |accounts| in_work_unit(tmp) do |accounts|
bulk = Chewy::Index::Import::BulkBuilder.new(index, to_index: accounts).bulk_body bulk = build_bulk_body(accounts)
indexed = bulk.count { |entry| entry[:index] } indexed = bulk.size
deleted = bulk.count { |entry| entry[:delete] } deleted = 0
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)

View file

@@ -68,6 +68,14 @@ class Importer::BaseImporter
protected protected
def build_bulk_body(to_import)
# Specialize `Chewy::Index::Import::BulkBuilder#bulk_body` to avoid a few
# inefficiencies, as none of our indexes use scripted fields or join fields, and we do not need
# `BulkBuilder`'s versatility.
crutches = Chewy::Index::Crutch::Crutches.new index, to_import
to_import.map { |object| { index: { _id: object.id, data: index.compose(object, crutches, fields: []) } } }
end
def in_work_unit(...) def in_work_unit(...)
work_unit = Concurrent::Promises.future_on(@executor, ...) work_unit = Concurrent::Promises.future_on(@executor, ...)

View file

@@ -4,10 +4,10 @@ class Importer::InstancesIndexImporter < Importer::BaseImporter
def import! def import!
index.adapter.default_scope.find_in_batches(batch_size: @batch_size) do |tmp| index.adapter.default_scope.find_in_batches(batch_size: @batch_size) do |tmp|
in_work_unit(tmp) do |instances| in_work_unit(tmp) do |instances|
bulk = Chewy::Index::Import::BulkBuilder.new(index, to_index: instances).bulk_body bulk = build_bulk_body(instances)
indexed = bulk.count { |entry| entry[:index] } indexed = bulk.size
deleted = bulk.count { |entry| entry[:delete] } deleted = 0
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)

View file

@@ -5,11 +5,11 @@ class Importer::PublicStatusesIndexImporter < Importer::BaseImporter
scope.select(:id).find_in_batches(batch_size: @batch_size) do |batch| scope.select(:id).find_in_batches(batch_size: @batch_size) do |batch|
in_work_unit(batch.pluck(:id)) do |status_ids| in_work_unit(batch.pluck(:id)) do |status_ids|
bulk = ActiveRecord::Base.connection_pool.with_connection do bulk = ActiveRecord::Base.connection_pool.with_connection do
Chewy::Index::Import::BulkBuilder.new(index, to_index: Status.includes(:media_attachments, :preloadable_poll, :preview_cards).where(id: status_ids)).bulk_body build_bulk_body(index.adapter.default_scope.where(id: status_ids))
end end
indexed = bulk.count { |entry| entry[:index] } indexed = bulk.size
deleted = bulk.count { |entry| entry[:delete] } deleted = 0
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)

View file

@@ -13,32 +13,25 @@ class Importer::StatusesIndexImporter < Importer::BaseImporter
scope.find_in_batches(batch_size: @batch_size) do |tmp| scope.find_in_batches(batch_size: @batch_size) do |tmp|
in_work_unit(tmp.map(&:status_id)) do |status_ids| in_work_unit(tmp.map(&:status_id)) do |status_ids|
bulk = ActiveRecord::Base.connection_pool.with_connection do
Chewy::Index::Import::BulkBuilder.new(index, to_index: index.adapter.default_scope.where(id: status_ids)).bulk_body
end
indexed = 0
deleted = 0 deleted = 0
# We can't use the delete_if proc to do the filtering because delete_if bulk = ActiveRecord::Base.connection_pool.with_connection do
# is called before rendering the data and we need to filter based to_index = index.adapter.default_scope.where(id: status_ids)
# on the results of the filter, so this filtering happens here instead crutches = Chewy::Index::Crutch::Crutches.new index, to_index
bulk.map! do |entry| to_index.map do |object|
new_entry = if entry[:index] && entry.dig(:index, :data, 'searchable_by').blank? # This is unlikely to happen, but the post may have been
{ delete: entry[:index].except(:data) } # un-interacted with since it was queued for indexing
else if object.searchable_by.empty?
entry deleted += 1
end { delete: { _id: object.id } }
else
if new_entry[:index] { index: { _id: object.id, data: index.compose(object, crutches, fields: []) } }
indexed += 1 end
else
deleted += 1
end end
new_entry
end end
indexed = bulk.size - deleted
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)
[indexed, deleted] [indexed, deleted]

View file

@@ -4,10 +4,10 @@ class Importer::TagsIndexImporter < Importer::BaseImporter
def import! def import!
index.adapter.default_scope.find_in_batches(batch_size: @batch_size) do |tmp| index.adapter.default_scope.find_in_batches(batch_size: @batch_size) do |tmp|
in_work_unit(tmp) do |tags| in_work_unit(tmp) do |tags|
bulk = Chewy::Index::Import::BulkBuilder.new(index, to_index: tags).bulk_body bulk = build_bulk_body(tags)
indexed = bulk.count { |entry| entry[:index] } indexed = bulk.size
deleted = bulk.count { |entry| entry[:delete] } deleted = 0
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)