Change spam check to apply to local accounts and add a threshold (#11806)

Instead of detecting spam on first duplicate message, add a
threshold of 5 such messages to reduce false positives
This commit is contained in:
Eugen Rochko 2019-09-18 12:53:13 +02:00 committed by GitHub
parent 577706987d
commit 4f6af87906
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 66 additions and 29 deletions

View file

@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
end end
def check_for_spam def check_for_spam
spam_check = SpamCheck.new(@status) SpamCheck.perform(@status)
return if spam_check.skip?
if spam_check.spam?
spam_check.flag!
else
spam_check.remember!
end
end end
def forward_for_reply def forward_for_reply

View file

@ -4,10 +4,26 @@ class SpamCheck
include Redisable include Redisable
include ActionView::Helpers::TextHelper include ActionView::Helpers::TextHelper
# Threshold over which two Nilsimsa values are considered
# to refer to the same text
NILSIMSA_COMPARE_THRESHOLD = 95 NILSIMSA_COMPARE_THRESHOLD = 95
# Nilsimsa doesn't work well on small inputs, so below
# this size, we check only for exact matches with MD5
NILSIMSA_MIN_SIZE = 10 NILSIMSA_MIN_SIZE = 10
# How long to keep the trail of digests between updates,
# there is no reason to store it forever
EXPIRE_SET_AFTER = 1.week.seconds EXPIRE_SET_AFTER = 1.week.seconds
# How many digests to keep in an account's trail. If it's
# too small, spam could rotate around different message templates
MAX_TRAIL_SIZE = 10
# How many detected duplicates to allow through before
# considering the message as spam
THRESHOLD = 5
def initialize(status) def initialize(status)
@account = status.account @account = status.account
@status = status @status = status
@ -21,9 +37,9 @@ class SpamCheck
if insufficient_data? if insufficient_data?
false false
elsif nilsimsa? elsif nilsimsa?
any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
else else
any_other_digest?('md5') { |_, other_digest| other_digest == digest } digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
end end
end end
@ -38,7 +54,7 @@ class SpamCheck
# get the correct status ID back, we have to save it in the string value # get the correct status ID back, we have to save it in the string value
redis.zadd(redis_key, @status.id, digest_with_algorithm) redis.zadd(redis_key, @status.id, digest_with_algorithm)
redis.zremrangebyrank(redis_key, '0', '-10') redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
redis.expire(redis_key, EXPIRE_SET_AFTER) redis.expire(redis_key, EXPIRE_SET_AFTER)
end end
@ -78,6 +94,20 @@ class SpamCheck
end end
end end
class << self
def perform(status)
spam_check = new(status)
return if spam_check.skip?
if spam_check.spam?
spam_check.flag!
else
spam_check.remember!
end
end
end
private private
def disabled? def disabled?
@ -149,14 +179,14 @@ class SpamCheck
redis.zrange(redis_key, 0, -1) redis.zrange(redis_key, 0, -1)
end end
def any_other_digest?(filter_algorithm) def digests_over_threshold?(filter_algorithm)
other_digests.any? do |record| other_digests.select do |record|
algorithm, other_digest, status_id = record.split(':') algorithm, other_digest, status_id = record.split(':')
next unless algorithm == filter_algorithm next unless algorithm == filter_algorithm
yield algorithm, other_digest, status_id yield algorithm, other_digest, status_id
end end.size >= THRESHOLD
end end
def matching_status_ids def matching_status_ids

View file

@ -33,6 +33,7 @@ class ProcessMentionsService < BaseService
end end
status.save! status.save!
check_for_spam(status)
mentions.each { |mention| create_notification(mention) } mentions.each { |mention| create_notification(mention) }
end end
@ -61,4 +62,8 @@ class ProcessMentionsService < BaseService
def resolve_account_service def resolve_account_service
ResolveAccountService.new ResolveAccountService.new
end end
def check_for_spam(status)
SpamCheck.perform(status)
end
end end

View file

@ -86,23 +86,33 @@ RSpec.describe SpamCheck do
end end
it 'returns true for duplicate statuses to the same recipient' do it 'returns true for duplicate statuses to the same recipient' do
described_class::THRESHOLD.times do
status1 = status_with_html('@alice Hello') status1 = status_with_html('@alice Hello')
described_class.new(status1).remember! described_class.new(status1).remember!
end
status2 = status_with_html('@alice Hello') status2 = status_with_html('@alice Hello')
expect(described_class.new(status2).spam?).to be true expect(described_class.new(status2).spam?).to be true
end end
it 'returns true for duplicate statuses to different recipients' do it 'returns true for duplicate statuses to different recipients' do
described_class::THRESHOLD.times do
status1 = status_with_html('@alice Hello') status1 = status_with_html('@alice Hello')
described_class.new(status1).remember! described_class.new(status1).remember!
end
status2 = status_with_html('@bob Hello') status2 = status_with_html('@bob Hello')
expect(described_class.new(status2).spam?).to be true expect(described_class.new(status2).spam?).to be true
end end
it 'returns true for nearly identical statuses with random numbers' do it 'returns true for nearly identical statuses with random numbers' do
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.' source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
described_class::THRESHOLD.times do
status1 = status_with_html('@alice ' + source_text + ' 1234') status1 = status_with_html('@alice ' + source_text + ' 1234')
described_class.new(status1).remember! described_class.new(status1).remember!
end
status2 = status_with_html('@bob ' + source_text + ' 9568') status2 = status_with_html('@bob ' + source_text + ' 9568')
expect(described_class.new(status2).spam?).to be true expect(described_class.new(status2).spam?).to be true
end end
@ -140,9 +150,9 @@ RSpec.describe SpamCheck do
let(:redis_key) { spam_check.send(:redis_key) } let(:redis_key) { spam_check.send(:redis_key) }
it 'remembers' do it 'remembers' do
expect do expect(Redis.current.exists(redis_key)).to be true
spam_check.remember! spam_check.remember!
end.to change { Redis.current.exists(redis_key) }.from(false).to(true) expect(Redis.current.exists(redis_key)).to be true
end end
end end
@ -156,9 +166,9 @@ RSpec.describe SpamCheck do
end end
it 'resets' do it 'resets' do
expect do expect(Redis.current.exists(redis_key)).to be true
spam_check.reset! spam_check.reset!
end.to change { Redis.current.exists(redis_key) }.from(true).to(false) expect(Redis.current.exists(redis_key)).to be false
end end
end end