Skip to content

Commit 7bf91f4

Browse files
smessierraphael
authored andcommitted
tracing with adaptive sampler (goadesign#1217)
* set xray.Segment.Error to true when recording error unless Fault or Throttle already set * backport adaptive sampling from v2 tracer middleware maintains but deprecates existing Tracer middleware signature separated sampling algorithm as Sampler interface for use outside of middleware * CR change fixedSampler from struct to int
1 parent 2af6d3f commit 7bf91f4

File tree

4 files changed

+411
-51
lines changed

4 files changed

+411
-51
lines changed

middleware/sampler.go

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
package middleware
2+
3+
import (
4+
"math/rand"
5+
"sync"
6+
"sync/atomic"
7+
"time"
8+
)
9+
10+
type (
11+
// Sampler is an interface for computing when a sample falls within a range.
12+
Sampler interface {
13+
// Sample returns true if the caller should sample now.
14+
Sample() bool
15+
}
16+
17+
adaptiveSampler struct {
18+
sync.Mutex
19+
lastRate int64
20+
maxSamplingRate int
21+
sampleSize uint32
22+
start time.Time
23+
counter uint32
24+
}
25+
26+
fixedSampler int
27+
)
28+
29+
const (
30+
// adaptive upper bound has granularity in case caller becomes extremely busy.
31+
adaptiveUpperBoundInt = 10000
32+
adaptiveUpperBoundFloat = float64(adaptiveUpperBoundInt)
33+
)
34+
35+
// NewAdaptiveSampler computes the interval for sampling for tracing middleware.
36+
// it can also be used by non-web go routines to trace internal API calls.
37+
//
38+
// maxSamplingRate is the desired maximum sampling rate in requests per second.
39+
//
40+
// sampleSize sets the number of requests between two adjustments of the
41+
// sampling rate when MaxSamplingRate is set. the sample rate cannot be adjusted
42+
// until the sample size is reached at least once.
43+
func NewAdaptiveSampler(maxSamplingRate, sampleSize int) Sampler {
44+
if maxSamplingRate <= 0 {
45+
panic("maxSamplingRate must be greater than 0")
46+
}
47+
if sampleSize <= 0 {
48+
panic("sample size must be greater than 0")
49+
}
50+
return &adaptiveSampler{
51+
lastRate: adaptiveUpperBoundInt, // samples all until initial count reaches sample size
52+
maxSamplingRate: maxSamplingRate,
53+
sampleSize: uint32(sampleSize),
54+
start: time.Now(),
55+
}
56+
}
57+
58+
// NewFixedSampler sets the tracing sampling rate as a percentage value.
59+
func NewFixedSampler(samplingPercent int) Sampler {
60+
if samplingPercent < 0 || samplingPercent > 100 {
61+
panic("samplingPercent must be between 0 and 100")
62+
}
63+
return fixedSampler(samplingPercent)
64+
}
65+
66+
// Sample implementation for adaptive rate
67+
func (s *adaptiveSampler) Sample() bool {
68+
// adjust sampling rate whenever sample size is reached.
69+
var currentRate int
70+
if atomic.AddUint32(&s.counter, 1) == s.sampleSize { // exact match prevents
71+
atomic.StoreUint32(&s.counter, 0) // race is ok
72+
s.Lock()
73+
{
74+
d := time.Since(s.start).Seconds()
75+
r := float64(s.sampleSize) / d
76+
currentRate = int((float64(s.maxSamplingRate) * adaptiveUpperBoundFloat) / r)
77+
if currentRate > adaptiveUpperBoundInt {
78+
currentRate = adaptiveUpperBoundInt
79+
} else if currentRate < 1 {
80+
currentRate = 1
81+
}
82+
s.start = time.Now()
83+
}
84+
s.Unlock()
85+
atomic.StoreInt64(&s.lastRate, int64(currentRate))
86+
} else {
87+
currentRate = int(atomic.LoadInt64(&s.lastRate))
88+
}
89+
90+
// currentRate is never zero.
91+
return currentRate == adaptiveUpperBoundInt || rand.Intn(adaptiveUpperBoundInt) < currentRate
92+
}
93+
94+
// Sample implementation for fixed percentage
95+
func (s fixedSampler) Sample() bool {
96+
samplingPercent := int(s)
97+
return samplingPercent > 0 && (samplingPercent == 100 || rand.Intn(100) < samplingPercent)
98+
}

middleware/sampler_test.go

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
package middleware
2+
3+
import (
4+
"math/rand"
5+
"testing"
6+
"time"
7+
)
8+
9+
func TestFixedSampler(t *testing.T) {
10+
// 0 %
11+
subject := NewFixedSampler(0)
12+
for i := 0; i < 10; i++ {
13+
if subject.Sample() {
14+
t.Errorf("%d: Sample() returned true for 0%%", i)
15+
}
16+
}
17+
18+
// 100 %
19+
subject = NewFixedSampler(100)
20+
for i := 0; i < 10; i++ {
21+
if !subject.Sample() {
22+
t.Errorf("%d: Sample() returned false for 100%%", i)
23+
}
24+
}
25+
26+
// 50 %
27+
rand.Seed(123) // set seed for reproducibility.
28+
trueCount := 0
29+
subject = NewFixedSampler(33)
30+
for i := 0; i < 100; i++ {
31+
if subject.Sample() {
32+
trueCount++
33+
}
34+
}
35+
if trueCount != 30 {
36+
t.Errorf("Unexpected trueCount: %d", trueCount)
37+
}
38+
39+
// 66 %
40+
trueCount = 0
41+
subject = NewFixedSampler(66)
42+
for i := 0; i < 100; i++ {
43+
if subject.Sample() {
44+
trueCount++
45+
}
46+
}
47+
if trueCount != 67 {
48+
t.Errorf("Unexpected trueCount: %d", trueCount)
49+
}
50+
}
51+
52+
func TestAdaptiveSampler(t *testing.T) {
53+
// initial sampling
54+
subject := NewAdaptiveSampler(1, 100)
55+
for i := 0; i < 99; i++ {
56+
if !subject.Sample() {
57+
t.Errorf("%d: Sample() returned false before reaching sample size", i)
58+
}
59+
}
60+
61+
// change start time to 1s ago for a more predictable result.
62+
trueCount := 0
63+
rand.Seed(123) // set seed for reproducibility.
64+
now := time.Now()
65+
subject.(*adaptiveSampler).start = now.Add(-time.Second)
66+
for i := 99; i < 199; i++ {
67+
if subject.Sample() {
68+
trueCount++
69+
}
70+
}
71+
72+
// sample rate should be 1/s
73+
if trueCount != 1 {
74+
t.Errorf("Unexpected trueCount: %d", trueCount)
75+
}
76+
77+
// start time should be set to now after rate adjustment.
78+
if subject.(*adaptiveSampler).start.Before(now) {
79+
t.Errorf("start time was not updated: %v >= $v", subject.(*adaptiveSampler).start, now)
80+
}
81+
82+
// simulate last 100 requests taking 10s.
83+
trueCount = 0
84+
subject.(*adaptiveSampler).start = time.Now().Add(-time.Second * 10)
85+
for i := 199; i < 299; i++ {
86+
if subject.Sample() {
87+
trueCount++
88+
}
89+
}
90+
91+
// sample rate should be 10/s
92+
if trueCount != 10 {
93+
t.Errorf("Unexpected trueCount: %d", trueCount)
94+
}
95+
96+
// simulate last 100 requests taking 100s.
97+
trueCount = 0
98+
subject.(*adaptiveSampler).start = time.Now().Add(-time.Second * 100)
99+
for i := 299; i < 399; i++ {
100+
if subject.Sample() {
101+
trueCount++
102+
}
103+
}
104+
105+
// sampler should max out and sample all requests.
106+
if trueCount != 100 {
107+
t.Errorf("Unexpected trueCount: %d", trueCount)
108+
}
109+
}

middleware/tracer.go

+112-26
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
package middleware
22

33
import (
4-
rd "math/rand"
5-
"net/http"
6-
74
"context"
5+
"net/http"
86

97
"github.com/goadesign/goa"
108
"github.com/goadesign/goa/client"
@@ -25,56 +23,144 @@ type (
2523
// tracing systems such as Zipkin or AWS X-Ray.
2624
IDFunc func() string
2725

26+
// TracerOption is a constructor option that makes it possible to customize
27+
// the middleware.
28+
TracerOption func(*tracerOptions) *tracerOptions
29+
30+
// tracerOptions is the struct storing all the options.
31+
tracerOptions struct {
32+
traceIDFunc IDFunc
33+
spanIDFunc IDFunc
34+
samplingPercent int
35+
maxSamplingRate int
36+
sampleSize int
37+
}
38+
2839
// tracedDoer is a goa client Doer that inserts the tracing headers for
2940
// each request it makes.
3041
tracedDoer struct {
3142
client.Doer
3243
}
3344
)
3445

35-
// Tracer returns a middleware that initializes the trace information in the
36-
// context. The information can be retrieved using any of the ContextXXX
37-
// functions.
46+
// TraceIDFunc is a constructor option that overrides the function used to
47+
// compute trace IDs.
48+
func TraceIDFunc(f IDFunc) TracerOption {
49+
return func(o *tracerOptions) *tracerOptions {
50+
if f == nil {
51+
panic("trace ID function cannot be nil")
52+
}
53+
o.traceIDFunc = f
54+
return o
55+
}
56+
}
57+
58+
// SpanIDFunc is a constructor option that overrides the function used to
59+
// compute span IDs.
60+
func SpanIDFunc(f IDFunc) TracerOption {
61+
return func(o *tracerOptions) *tracerOptions {
62+
if f == nil {
63+
panic("span ID function cannot be nil")
64+
}
65+
o.spanIDFunc = f
66+
return o
67+
}
68+
}
69+
70+
// SamplingPercent sets the tracing sampling rate as a percentage value.
71+
// It panics if p is less than 0 or more than 100.
72+
// SamplingPercent and MaxSamplingRate are mutually exclusive.
73+
func SamplingPercent(p int) TracerOption {
74+
if p < 0 || p > 100 {
75+
panic("sampling rate must be between 0 and 100")
76+
}
77+
return func(o *tracerOptions) *tracerOptions {
78+
o.samplingPercent = p
79+
return o
80+
}
81+
}
82+
83+
// MaxSamplingRate sets a target sampling rate in requests per second. Setting a
84+
// max sampling rate causes the middleware to adjust the sampling percent
85+
// dynamically.
86+
// SamplingPercent and MaxSamplingRate are mutually exclusive.
87+
func MaxSamplingRate(r int) TracerOption {
88+
if r <= 0 {
89+
panic("max sampling rate must be greater than 0")
90+
}
91+
return func(o *tracerOptions) *tracerOptions {
92+
o.maxSamplingRate = r
93+
return o
94+
}
95+
}
96+
97+
// SampleSize sets the number of requests between two adjustments of the sampling
98+
// rate when MaxSamplingRate is set. Defaults to 1,000.
99+
func SampleSize(s int) TracerOption {
100+
if s <= 0 {
101+
panic("sample size must be greater than 0")
102+
}
103+
return func(o *tracerOptions) *tracerOptions {
104+
o.sampleSize = s
105+
return o
106+
}
107+
}
108+
109+
// NewTracer returns a trace middleware that initializes the trace information
110+
// in the request context. The information can be retrieved using any of the
111+
// ContextXXX functions.
38112
//
39-
// sampleRate must be a value between 0 and 100. It represents the percentage of
40-
// requests that should be traced.
113+
// samplingPercent must be a value between 0 and 100. It represents the percentage
114+
// of requests that should be traced. If the incoming request has a Trace ID
115+
// header then the sampling rate is disregarded and the tracing is enabled.
41116
//
42117
// spanIDFunc and traceIDFunc are the functions used to create Span and Trace
43118
// IDs respectively. This is configurable so that the created IDs are compatible
44119
// with the various backend tracing systems. The xray package provides
45120
// implementations that produce AWS X-Ray compatible IDs.
46-
//
47-
// If the incoming request has a TraceIDHeader header then the sample rate is
48-
// disregarded and the tracing is enabled.
49-
func Tracer(sampleRate int, spanIDFunc, traceIDFunc IDFunc) goa.Middleware {
50-
if sampleRate < 0 || sampleRate > 100 {
51-
panic("tracing: sample rate must be between 0 and 100")
121+
func NewTracer(opts ...TracerOption) goa.Middleware {
122+
o := &tracerOptions{
123+
traceIDFunc: shortID,
124+
spanIDFunc: shortID,
125+
samplingPercent: 100,
126+
sampleSize: 1000, // only applies if maxSamplingRate is set
127+
}
128+
for _, opt := range opts {
129+
o = opt(o)
130+
}
131+
var sampler Sampler
132+
if o.maxSamplingRate > 0 {
133+
sampler = NewAdaptiveSampler(o.maxSamplingRate, o.sampleSize)
134+
} else {
135+
sampler = NewFixedSampler(o.samplingPercent)
52136
}
53137
return func(h goa.Handler) goa.Handler {
54138
return func(ctx context.Context, rw http.ResponseWriter, req *http.Request) error {
55-
// Compute trace info.
56-
var (
57-
traceID = req.Header.Get(TraceIDHeader)
58-
parentID = req.Header.Get(ParentSpanIDHeader)
59-
spanID = spanIDFunc()
60-
)
139+
// insert a new trace ID only if not already being traced.
140+
traceID := req.Header.Get(TraceIDHeader)
61141
if traceID == "" {
62-
// Avoid computing a random value if unnecessary.
63-
if sampleRate == 0 || rd.Intn(100) > sampleRate {
142+
// insert tracing only within sample.
143+
if sampler.Sample() {
144+
traceID = o.traceIDFunc()
145+
} else {
64146
return h(ctx, rw, req)
65147
}
66-
traceID = traceIDFunc()
67148
}
68149

69-
// Setup context.
150+
// insert IDs into context to enable tracing.
151+
spanID := o.spanIDFunc()
152+
parentID := req.Header.Get(ParentSpanIDHeader)
70153
ctx = WithTrace(ctx, traceID, spanID, parentID)
71-
72-
// Call next handler.
73154
return h(ctx, rw, req)
74155
}
75156
}
76157
}
77158

159+
// Tracer is deprecated in favor of NewTracer.
160+
func Tracer(sampleRate int, spanIDFunc, traceIDFunc IDFunc) goa.Middleware {
161+
return NewTracer(SamplingPercent(sampleRate), SpanIDFunc(spanIDFunc), TraceIDFunc(traceIDFunc))
162+
}
163+
78164
// TraceDoer wraps a goa client Doer and sets the trace headers so that the
79165
// downstream service may properly retrieve the parent span ID and trace ID.
80166
func TraceDoer(doer client.Doer) client.Doer {

0 commit comments

Comments
 (0)