Add more accurate hashtag search (#11579)

* Add more accurate hashtag search

Use Elasticsearch to index hashtags with edge n-grams and to score
them by usage within the last 7 days since last activity. Only
hashtags that have been reviewed and are listable can appear in
searches, unless they match the query exactly.

* Fix search analyzer dropping non-ascii characters
This commit is contained in:
Eugen Rochko 2019-08-18 03:45:51 +02:00 committed by GitHub
parent 3a77090d01
commit cc0a55cf9a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 149 additions and 13 deletions

37
app/chewy/tags_index.rb Normal file
View file

@ -0,0 +1,37 @@
# frozen_string_literal: true
# Chewy index definition for hashtags.
#
# The `name` field is indexed with the `content` analyzer and carries an
# `edge_ngram` sub-field that is indexed with edge n-grams (2 to 15 characters)
# but searched with `content`. Alongside the name, each document stores whether
# the tag is reviewed, a usage total, and a last-activity date.
class TagsIndex < Chewy::Index
  # Both analyzers share the same token filter chain and differ only in tokenizer.
  analyzers = {
    content: {
      tokenizer: 'keyword',
      filter: %w(lowercase asciifolding cjk_width),
    },

    edge_ngram: {
      tokenizer: 'edge_ngram',
      filter: %w(lowercase asciifolding cjk_width),
    },
  }

  # Custom tokenizer referenced by the `edge_ngram` analyzer above.
  tokenizers = {
    edge_ngram: {
      type: 'edge_ngram',
      min_gram: 2,
      max_gram: 15,
    },
  }

  settings index: { refresh_interval: '15m' },
           analysis: { analyzer: analyzers, tokenizer: tokenizers }

  # Only listable tags are indexed; a tag that is destroyed or is no longer
  # listable is removed from the index instead of being updated.
  define_type ::Tag.listable, delete_if: ->(record) { record.destroyed? || !record.listable? } do
    root date_detection: false do
      field :name, type: 'text', analyzer: 'content' do
        field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
      end

      field :reviewed, type: 'boolean', value: ->(record) { record.reviewed? }

      # Total of the `:accounts` figure over every entry returned by Tag#history.
      field :usage, type: 'long', value: ->(record) { record.history.sum { |entry| entry[:accounts].to_i } }

      # Falls back to the creation time for tags with no recorded last status time.
      field :last_status_at, type: 'date', value: ->(record) { record.last_status_at || record.created_at }
    end
  end
end

View file

@ -13,6 +13,8 @@
# listable :boolean # listable :boolean
# reviewed_at :datetime # reviewed_at :datetime
# requested_review_at :datetime # requested_review_at :datetime
# last_status_at :datetime
# last_trend_at :datetime
# #
class Tag < ApplicationRecord class Tag < ApplicationRecord
@ -33,7 +35,8 @@ class Tag < ApplicationRecord
scope :unreviewed, -> { where(reviewed_at: nil) } scope :unreviewed, -> { where(reviewed_at: nil) }
scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) } scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) }
scope :usable, -> { where(usable: [true, nil]) } scope :usable, -> { where(usable: [true, nil]) }
scope :discoverable, -> { where(listable: [true, nil]).joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) } scope :listable, -> { where(listable: [true, nil]) }
scope :discoverable, -> { listable.joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) } scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) }
delegate :accounts_count, delegate :accounts_count,
@ -44,6 +47,8 @@ class Tag < ApplicationRecord
after_save :save_account_tag_stat after_save :save_account_tag_stat
update_index('tags#tag', :self) if Chewy.enabled?
def account_tag_stat def account_tag_stat
super || build_account_tag_stat super || build_account_tag_stat
end end
@ -121,9 +126,10 @@ class Tag < ApplicationRecord
normalized_term = normalize(term.strip).mb_chars.downcase.to_s normalized_term = normalize(term.strip).mb_chars.downcase.to_s
pattern = sanitize_sql_like(normalized_term) + '%' pattern = sanitize_sql_like(normalized_term) + '%'
Tag.where(arel_table[:name].lower.matches(pattern)) Tag.listable
.where(arel_table[:score].gt(0).or(arel_table[:name].lower.eq(normalized_term))) .where(arel_table[:name].lower.matches(pattern))
.order(Arel.sql('length(name) ASC, score DESC, name ASC')) .where(arel_table[:name].lower.eq(normalized_term).or(arel_table[:reviewed_at].not_eq(nil)))
.order(Arel.sql('length(name) ASC, name ASC'))
.limit(limit) .limit(limit)
.offset(offset) .offset(offset)
end end

View file

@ -17,6 +17,9 @@ class TrendingTags
increment_historical_use!(tag.id, at_time) increment_historical_use!(tag.id, at_time)
increment_unique_use!(tag.id, account.id, at_time) increment_unique_use!(tag.id, account.id, at_time)
increment_vote!(tag, at_time) increment_vote!(tag, at_time)
tag.update(last_status_at: Time.now.utc) if tag.last_status_at.nil? || tag.last_status_at < 12.hours.ago
tag.update(last_trend_at: Time.now.utc) if trending?(tag) && (tag.last_trend_at.nil? || tag.last_trend_at < 12.hours.ago)
end end
def get(limit, filtered: true) def get(limit, filtered: true)

View file

@ -109,7 +109,7 @@ class AccountSearchService < BaseService
field_value_factor: { field_value_factor: {
field: 'followers_count', field: 'followers_count',
modifier: 'log2p', modifier: 'log2p',
missing: 1, missing: 0,
}, },
} }
end end

View file

@ -57,10 +57,10 @@ class SearchService < BaseService
end end
def perform_hashtags_search! def perform_hashtags_search!
Tag.search_for( TagSearchService.new.call(
@query.gsub(/\A#/, ''), @query,
@limit, limit: @limit,
@offset offset: @offset
) )
end end

View file

@ -0,0 +1,82 @@
# frozen_string_literal: true
# Searches hashtags by name.
#
# When Chewy is enabled the search runs against TagsIndex; otherwise it falls
# back to Tag.search_for on the database.
class TagSearchService < BaseService
  # @param query [String] raw search text; surrounding whitespace and a single
  #   leading '#' are removed before searching
  # @param options [Hash] recognises :limit and :offset, each coerced with
  #   #to_i (so a missing key becomes 0)
  # @return the tags found by whichever backend was used
  def call(query, options = {})
    # \A anchors at the start of the string, so at most one '#' can match.
    @query  = query.strip.sub(/\A#/, '')
    @offset = options[:offset].to_i
    @limit  = options[:limit].to_i

    if Chewy.enabled?
      from_elasticsearch
    else
      from_database
    end
  end

  private

  # Runs the scored query with the visibility filter against TagsIndex and
  # loads the matching records, dropping nil entries from the loaded list.
  def from_elasticsearch
    TagsIndex.query(scoring_query).filter(visibility_filter).limit(@limit).offset(@offset).objects.compact
  end

  # Delegates to the database-backed search on the Tag model.
  def from_database
    Tag.search_for(@query, @limit, @offset)
  end

  # Matches the query against both the whole name and its edge n-grams, then
  # multiplies the text score by a usage factor and a decay on last activity.
  def scoring_query
    {
      function_score: {
        query: {
          multi_match: {
            query: @query,
            fields: %w(name.edge_ngram name),
            type: 'most_fields',
            operator: 'and',
          },
        },

        functions: [
          {
            field_value_factor: {
              field: 'usage',
              modifier: 'log2p',
              missing: 0,
            },
          },

          {
            gauss: {
              last_status_at: {
                scale: '7d',
                offset: '14d',
                decay: 0.5,
              },
            },
          },
        ],

        boost_mode: 'multiply',
      },
    }
  end

  # A tag passes the filter when it is reviewed OR its name matches the query.
  def visibility_filter
    {
      bool: {
        should: [
          {
            term: {
              reviewed: {
                value: true,
              },
            },
          },

          {
            # `name` is a text field indexed through the `content` analyzer
            # (lowercase, asciifolding, cjk_width). A `term` query is not
            # analyzed, so a query with uppercase, accented or full-width
            # characters could never equal the indexed token. `match` runs the
            # query through the field's analyzer first, so exact names are
            # found however they are typed.
            match: {
              name: {
                query: @query,
              },
            },
          },
        ],
      },
    }
  end
end

View file

@ -142,7 +142,7 @@ en:
report: Send e-mail when a new report is submitted report: Send e-mail when a new report is submitted
trending_tag: Send e-mail when an unreviewed hashtag is trending trending_tag: Send e-mail when an unreviewed hashtag is trending
tag: tag:
listable: Allow this hashtag to appear on the profile directory listable: Allow this hashtag to appear in searches and on the profile directory
trendable: Allow this hashtag to appear under trends trendable: Allow this hashtag to appear under trends
usable: Allow toots to use this hashtag usable: Allow toots to use this hashtag
'no': 'No' 'no': 'No'

View file

@ -0,0 +1,6 @@
# Adds the `last_status_at` and `last_trend_at` datetime columns to `tags`.
class AddLastStatusAtToTags < ActiveRecord::Migration[5.2]
  def change
    # Both columns share the same type and take no extra options.
    %i(last_status_at last_trend_at).each do |column_name|
      add_column :tags, column_name, :datetime
    end
  end
end

View file

@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2019_08_07_135426) do ActiveRecord::Schema.define(version: 2019_08_15_225426) do
# These are extensions that must be enabled in order to support this database # These are extensions that must be enabled in order to support this database
enable_extension "plpgsql" enable_extension "plpgsql"
@ -667,6 +667,8 @@ ActiveRecord::Schema.define(version: 2019_08_07_135426) do
t.boolean "listable" t.boolean "listable"
t.datetime "reviewed_at" t.datetime "reviewed_at"
t.datetime "requested_review_at" t.datetime "requested_review_at"
t.datetime "last_status_at"
t.datetime "last_trend_at"
t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true
end end

View file

@ -136,8 +136,8 @@ RSpec.describe Tag, type: :model do
end end
it 'finds the exact matching tag as the first item' do it 'finds the exact matching tag as the first item' do
similar_tag = Fabricate(:tag, name: "matchlater", score: 1) similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc)
tag = Fabricate(:tag, name: "match", score: 1) tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc)
results = Tag.search_for("match") results = Tag.search_for("match")