Change account search tokenizer and queries (#26378)

This commit is contained in:
Eugen Rochko 2023-08-08 09:09:14 +02:00 committed by GitHub
parent 60fbb0fe91
commit 72423bc8f6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 144 additions and 89 deletions

View file

@ -33,7 +33,7 @@ class AccountsIndex < Chewy::Index
},
verbatim: {
tokenizer: 'whitespace',
tokenizer: 'standard',
filter: %w(lowercase asciifolding cjk_width),
},

View file

@ -8,6 +8,143 @@ class AccountSearchService < BaseService
# Min. number of characters to look for non-exact matches
MIN_QUERY_LENGTH = 5
# Builds the Elasticsearch query that is sent to +AccountsIndex+.
#
# Subclasses provide the text-matching part through a private +core_query+
# method. This base class wraps that clause in a +function_score+ query and,
# when a searching account is given, either restricts results to the accounts
# it follows or boosts them.
class QueryBuilder
  # @param query [String] search terms, read by the subclass' +core_query+
  # @param account [Object, nil] searching account; only its
  #   +active_relationships+ and +id+ are read
  # @param options [Hash] only the +:following+ key is read
  def initialize(query, account, options = {})
    @query = query
    @account = account
    @options = options
  end

  # Assembles the full query and passes it to +AccountsIndex.query+.
  #
  # @return the value returned by +AccountsIndex.query+
  def build
    AccountsIndex.query(
      bool: {
        must: {
          function_score: {
            query: {
              bool: {
                must: must_clauses,
              },
            },
            functions: [
              reputation_score_function,
              followers_score_function,
              time_distance_function,
            ],
          },
        },
        should: should_clauses,
      }
    )
  end

  private

  # Clauses every hit has to satisfy: the subclass' text match, plus the
  # "followed accounts only" filter when an account asked for it.
  def must_clauses
    if @account && @options[:following]
      [core_query, only_following_query]
    else
      [core_query]
    end
  end

  # Optional clauses that only influence ranking. Empty when results are
  # already restricted to followed accounts, or when there is no account.
  def should_clauses
    if @account && !@options[:following]
      [boost_following_query]
    else
      []
    end
  end

  # This function limits results to only the accounts the user is following
  def only_following_query
    {
      terms: {
        id: following_ids,
      },
    }
  end

  # This function promotes accounts the user is following
  def boost_following_query
    {
      terms: {
        id: following_ids,
        boost: 100,
      },
    }
  end

  # This function deranks accounts that follow more people than follow them
  def reputation_score_function
    {
      script_score: {
        script: {
          # `+ 0.0` forces floating-point division inside the script.
          source: "(Math.max(doc['followers_count'].value, 0) + 0.0) / (Math.max(doc['followers_count'].value, 0) + Math.max(doc['following_count'].value, 0) + 1)",
        },
      },
    }
  end

  # This function promotes accounts that have more followers
  def followers_score_function
    {
      script_score: {
        script: {
          # `+ 0.0` forces floating-point division, as in
          # reputation_score_function. With integer operands the script's `/`
          # truncates, so n / (n + 1) would be 0 for every account.
          # NOTE(review): assumes followers_count is indexed as an integer
          # type; confirm against the index mapping.
          source: "(Math.max(doc['followers_count'].value, 0) + 0.0) / (Math.max(doc['followers_count'].value, 0) + 1)",
        },
      },
    }
  end

  # This function deranks accounts that haven't posted in a long time
  def time_distance_function
    {
      gauss: {
        last_status_at: {
          scale: '30d',
          offset: '30d',
          decay: 0.3,
        },
      },
    }
  end

  # IDs of the accounts the searching account follows, plus its own ID.
  # Memoized so the relationship lookup runs at most once per builder.
  def following_ids
    @following_ids ||= @account.active_relationships.pluck(:target_account_id) + [@account.id]
  end
end
# Query builder whose text match is a `bool_prefix` multi_match over the
# username and display name fields.
class AutocompleteQueryBuilder < QueryBuilder
  private

  # Text-matching clause for the query held in @query.
  def core_query
    searched_fields = %w(username username.* display_name display_name.*)

    { multi_match: { query: @query, type: 'bool_prefix', fields: searched_fields } }
  end
end
# Query builder whose text match is a `most_fields` multi_match that also
# covers the `text` fields and requires all terms (`operator: 'and'`).
class FullQueryBuilder < QueryBuilder
  private

  # Text-matching clause for the query held in @query; username and
  # display_name carry a ^2 field boost.
  def core_query
    searched_fields = %w(username^2 display_name^2 text text.*)

    { multi_match: { query: @query, type: 'most_fields', fields: searched_fields, operator: 'and' } }
  end
end
def call(query, account = nil, options = {})
@query = query&.strip&.gsub(/\A@/, '')
@limit = options[:limit].to_i
@ -71,27 +208,15 @@ class AccountSearchService < BaseService
end
def from_elasticsearch
must_clauses = must_clause
should_clauses = should_clause
if account
return [] if options[:following] && following_ids.empty?
if options[:following]
must_clauses << { terms: { id: following_ids } }
elsif following_ids.any?
should_clauses << { terms: { id: following_ids, boost: 100 } }
query_builder = begin
if options[:use_searchable_text]
FullQueryBuilder.new(terms_for_query, account, options.slice(:following))
else
AutocompleteQueryBuilder.new(terms_for_query, account, options.slice(:following))
end
end
query = { bool: { must: must_clauses, should: should_clauses } }
functions = [reputation_score_function, followers_score_function, time_distance_function]
records = AccountsIndex.query(function_score: { query: query, functions: functions })
.limit(limit_for_non_exact_results)
.offset(offset)
.objects
.compact
records = query_builder.build.limit(limit_for_non_exact_results).offset(offset).objects.compact
ActiveRecord::Associations::Preloader.new(records: records, associations: :account_stat)
@ -100,76 +225,6 @@ class AccountSearchService < BaseService
nil
end
# Script-score function computing followers / (followers + following + 1),
# with negative counts clamped to 0 and `+ 0.0` forcing a floating-point result.
def reputation_score_function
  ratio_script = "(Math.max(doc['followers_count'].value, 0) + 0.0) / (Math.max(doc['followers_count'].value, 0) + Math.max(doc['following_count'].value, 0) + 1)"

  { script_score: { script: { source: ratio_script } } }
end
# Script-score function computing log10 of the followers count
# (clamped to a minimum of 0) plus 2.
def followers_score_function
  log_script = "Math.log10(Math.max(doc['followers_count'].value, 0) + 2)"

  { script_score: { script: { source: log_script } } }
end
# Gauss decay function on `last_status_at`, using a 30 day offset,
# a 30 day scale and a decay of 0.3.
def time_distance_function
  decay_settings = { scale: '30d', offset: '30d', decay: 0.3 }

  { gauss: { last_status_at: decay_settings } }
end
# Mandatory text-matching clause, returned as a one-element array.
#
# Fields searched:
# - `options[:start_with_hashtag]` set: only the `text` fields
# - otherwise the username/display name fields, plus the `text` fields when
#   `options[:use_searchable_text]` is set
def must_clause
  name_fields = %w(username username.* display_name display_name.*)
  text_fields = %w(text text.*)

  searched_fields =
    if options[:start_with_hashtag]
      text_fields
    elsif options[:use_searchable_text]
      name_fields + text_fields
    else
      name_fields
    end

  [{ multi_match: { query: terms_for_query, fields: searched_fields, type: 'best_fields', operator: 'or' } }]
end
# Optional clause, returned as a one-element array: a `best_fields`
# multi_match over the username/display name fields that requires all terms
# (`operator: 'and'`) and carries a boost of 10.
def should_clause
  name_fields = %w(username username.* display_name display_name.*)

  name_match = {
    query: terms_for_query,
    fields: name_fields,
    type: 'best_fields',
    operator: 'and',
    boost: 10,
  }

  [{ multi_match: name_match }]
end
# IDs of the accounts that `account` follows, with the account's own ID
# appended. The result is memoized in @following_ids.
def following_ids
  @following_ids ||= begin
    followed_ids = account.active_relationships.pluck(:target_account_id)
    followed_ids + [account.id]
  end
end
def limit_for_non_exact_results
return 0 if @account.nil? && query.size < MIN_QUERY_LENGTH