Change importers to avoid a few inefficiencies (#26721)

This commit is contained in:
Claire 2023-08-31 19:04:27 +02:00 committed by GitHub
parent ecd76fa413
commit 9bb2fb6b14
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 34 additions and 33 deletions

View file

@@ -4,10 +4,10 @@ class Importer::AccountsIndexImporter < Importer::BaseImporter
def import! def import!
scope.includes(:account_stat).find_in_batches(batch_size: @batch_size) do |tmp| scope.includes(:account_stat).find_in_batches(batch_size: @batch_size) do |tmp|
in_work_unit(tmp) do |accounts| in_work_unit(tmp) do |accounts|
bulk = Chewy::Index::Import::BulkBuilder.new(index, to_index: accounts).bulk_body bulk = build_bulk_body(accounts)
indexed = bulk.count { |entry| entry[:index] } indexed = bulk.size
deleted = bulk.count { |entry| entry[:delete] } deleted = 0
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)

View file

@@ -68,6 +68,14 @@ class Importer::BaseImporter
protected protected
def build_bulk_body(to_import)
# Specialize `Chewy::Index::Import::BulkBuilder#bulk_body` to avoid a few
# inefficiencies, as none of our indexes use scripted fields or join fields, and we do not need
# `BulkBuilder`'s versatility.
crutches = Chewy::Index::Crutch::Crutches.new index, to_import
to_import.map { |object| { index: { _id: object.id, data: index.compose(object, crutches, fields: []) } } }
end
def in_work_unit(...) def in_work_unit(...)
work_unit = Concurrent::Promises.future_on(@executor, ...) work_unit = Concurrent::Promises.future_on(@executor, ...)

View file

@@ -4,10 +4,10 @@ class Importer::InstancesIndexImporter < Importer::BaseImporter
def import! def import!
index.adapter.default_scope.find_in_batches(batch_size: @batch_size) do |tmp| index.adapter.default_scope.find_in_batches(batch_size: @batch_size) do |tmp|
in_work_unit(tmp) do |instances| in_work_unit(tmp) do |instances|
bulk = Chewy::Index::Import::BulkBuilder.new(index, to_index: instances).bulk_body bulk = build_bulk_body(instances)
indexed = bulk.count { |entry| entry[:index] } indexed = bulk.size
deleted = bulk.count { |entry| entry[:delete] } deleted = 0
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)

View file

@@ -5,11 +5,11 @@ class Importer::PublicStatusesIndexImporter < Importer::BaseImporter
scope.select(:id).find_in_batches(batch_size: @batch_size) do |batch| scope.select(:id).find_in_batches(batch_size: @batch_size) do |batch|
in_work_unit(batch.pluck(:id)) do |status_ids| in_work_unit(batch.pluck(:id)) do |status_ids|
bulk = ActiveRecord::Base.connection_pool.with_connection do bulk = ActiveRecord::Base.connection_pool.with_connection do
Chewy::Index::Import::BulkBuilder.new(index, to_index: Status.includes(:media_attachments, :preloadable_poll, :preview_cards).where(id: status_ids)).bulk_body build_bulk_body(index.adapter.default_scope.where(id: status_ids))
end end
indexed = bulk.count { |entry| entry[:index] } indexed = bulk.size
deleted = bulk.count { |entry| entry[:delete] } deleted = 0
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)

View file

@@ -13,32 +13,25 @@ class Importer::StatusesIndexImporter < Importer::BaseImporter
scope.find_in_batches(batch_size: @batch_size) do |tmp| scope.find_in_batches(batch_size: @batch_size) do |tmp|
in_work_unit(tmp.map(&:status_id)) do |status_ids| in_work_unit(tmp.map(&:status_id)) do |status_ids|
bulk = ActiveRecord::Base.connection_pool.with_connection do
Chewy::Index::Import::BulkBuilder.new(index, to_index: index.adapter.default_scope.where(id: status_ids)).bulk_body
end
indexed = 0
deleted = 0 deleted = 0
# We can't use the delete_if proc to do the filtering because delete_if bulk = ActiveRecord::Base.connection_pool.with_connection do
# is called before rendering the data and we need to filter based to_index = index.adapter.default_scope.where(id: status_ids)
# on the results of the filter, so this filtering happens here instead crutches = Chewy::Index::Crutch::Crutches.new index, to_index
bulk.map! do |entry| to_index.map do |object|
new_entry = if entry[:index] && entry.dig(:index, :data, 'searchable_by').blank? # This is unlikely to happen, but the post may have been
{ delete: entry[:index].except(:data) } # un-interacted with since it was queued for indexing
else if object.searchable_by.empty?
entry deleted += 1
end { delete: { _id: object.id } }
else
if new_entry[:index] { index: { _id: object.id, data: index.compose(object, crutches, fields: []) } }
indexed += 1 end
else
deleted += 1
end end
new_entry
end end
indexed = bulk.size - deleted
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)
[indexed, deleted] [indexed, deleted]

View file

@@ -4,10 +4,10 @@ class Importer::TagsIndexImporter < Importer::BaseImporter
def import! def import!
index.adapter.default_scope.find_in_batches(batch_size: @batch_size) do |tmp| index.adapter.default_scope.find_in_batches(batch_size: @batch_size) do |tmp|
in_work_unit(tmp) do |tags| in_work_unit(tmp) do |tags|
bulk = Chewy::Index::Import::BulkBuilder.new(index, to_index: tags).bulk_body bulk = build_bulk_body(tags)
indexed = bulk.count { |entry| entry[:index] } indexed = bulk.size
deleted = bulk.count { |entry| entry[:delete] } deleted = 0
Chewy::Index::Import::BulkRequest.new(index).perform(bulk) Chewy::Index::Import::BulkRequest.new(index).perform(bulk)