Skip to content

Commit

Permalink
switch to redis server side script for failure percent counter
Browse files Browse the repository at this point in the history
This movies from using multiple redis calls in ruby to redis server
side lua scripts for the rolling counter and the math contained in
FailurePercentCounter. This provides atomic operation over all the set
of redis calls and should provides efficiency over multiple separate
call for the data.

closes CNVS-34236
ref CNVS-34031

test plan:
 - Ensure canvas is configured to use redis
 - In the rails console do the following:
    Setting.set('service_qatesting_timeout', 1)
    Setting.set('service_qatesting_timeout_protection_method',
                'percentage')
    Setting.set('service_qatesting_min_samples', 10)
 - The following must be completed in the rails console in under a
   minute:
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { sleep 2 }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { sleep 2 }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.redis.ttl("service:timeouts:qatesting:percent_counter:protection_activated")
 - Note that after each 'sleep' above that an error report of type
   'service_timeout' was generated
 - Note that after 10 samples, we went into timeout protection by
   the log message of "Skipping service call due to error count: qatesting 0.2"
 - Note that the Canvas.redis call returns a number between 0 and 60

Change-Id: Ic04eaab4edb49518e47538feda06dd32a32b49ec
Reviewed-on: https://gerrit.instructure.com/99764
Tested-by: Jenkins
Reviewed-by: Shahbaz Javeed <[email protected]>
Reviewed-by: Spencer Olson <[email protected]>
Reviewed-by: Brian Palmer <[email protected]>
QA-Review: KC Naegle <[email protected]>
Product-Review: Keith T. Garner <[email protected]>
  • Loading branch information
ktgeek committed Jan 19, 2017
1 parent f011a67 commit 080d4e2
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 30 deletions.
31 changes: 10 additions & 21 deletions lib/canvas/failure_percent_counter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ def initialize(redis, redis_key, period = 60.seconds, min_samples = 100)
@min_samples = min_samples
end

def self.lua
@lua ||= ::Redis::Scripting::Module.new(nil,
File.join(File.dirname(__FILE__), "failure_percent_counter"))
end

def increment_count
increment(@count_key)
end
Expand All @@ -35,32 +40,16 @@ def increment_failure

def failure_rate
now = Time.now.utc.to_i
# ideally we'd want to do all the redis calls in a redis.multi
# so they are atomic, but canvas uses an abstraction layer that
# doesn't expose that
count = cleanup_and_get(@count_key, now)
failure = cleanup_and_get(@fail_key, now)

# If our sample size is too small, let's claim total success
return 0.0 if count < @min_samples
failure.fdiv(count)
result = FailurePercentCounter.lua.run(:failure_rate, [@count_key],
[@fail_key, now, @period, @min_samples], @redis)
result.to_f
end

private
def increment(key)
now = Time.now.utc.to_i
# ideally we'd want to do all the redis calls in a redis.multi
# so they are atomic, but canvas uses an abstraction layer that
# doesn't expose that
@redis.zadd(key, now, SecureRandom.uuid)
@redis.expire(key, @period.ceil)
cleanup_and_get(key, now)
end

def cleanup_and_get(key, now)
cleanup_time = now - @period
@redis.zremrangebyscore(key, 0, cleanup_time)
@redis.zcount(key, cleanup_time, now)
FailurePercentCounter.lua.run(:increment_counter, [@count_key],
[key, now, SecureRandom.uuid, @period.ceil], @redis)
end
end
end
26 changes: 26 additions & 0 deletions lib/canvas/failure_percent_counter/failure_rate.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- This assumes the ordered sets are being used as a rolling counter
-- and then does math on the size of the two sets
local count_key = KEYS[1]
-- Because we have a distributed redis ring, we'll have issues if both
-- keys are in the key field, so we'll use count, which should always
-- be bigger as the main key, and pass the fail_key as our first
-- argument.
local fail_key = ARGV[1]
local current_time = tonumber(ARGV[2])
local period = tonumber(ARGV[3])
local min_samples = tonumber(ARGV[4])

local cleanup_time = current_time - period

redis.call('ZREMRANGEBYSCORE', count_key, 0, cleanup_time)
local count = redis.call('ZCOUNT', count_key, cleanup_time, current_time)

if count < min_samples then
return '0.0'
end

redis.call('ZREMRANGEBYSCORE', fail_key, 0, cleanup_time)
local failures = redis.call('ZCOUNT', fail_key, cleanup_time, current_time)

-- redis converts lua numbers to ints, so we have to return a float as a string
return tostring(failures / count)
12 changes: 12 additions & 0 deletions lib/canvas/failure_percent_counter/increment_counter.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- This implements a rolling counter in redis using ordered sets
local cache_key = ARGV[1]
local current_time = tonumber(ARGV[2])
local random_value = ARGV[3]
local period = tonumber(ARGV[4])

local cleanup_time = current_time - period

redis.call('ZADD', cache_key, 'NX', current_time, random_value)
redis.call('EXPIRE', cache_key, period)
redis.call('ZREMRANGEBYSCORE', cache_key, 0, cleanup_time)
return redis.call('ZCOUNT', cache_key, cleanup_time, current_time)
9 changes: 0 additions & 9 deletions spec/lib/canvas/failure_percent_counter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,6 @@
expect(counter.increment_count).to eq(3)
end

it "has a functional rolling counter implementation" do
# This four calls in order is what makes a rolling counter
Canvas.redis.expects(:zadd)
Canvas.redis.expects(:expire)
Canvas.redis.expects(:zremrangebyscore)
Canvas.redis.expects(:zcount).returns(1)
counter.increment_count
end

it "increments the failure counter" do
counter.increment_failure
expect(counter.increment_failure).to eq(2)
Expand Down

0 comments on commit 080d4e2

Please sign in to comment.