forked from influxdata/kapacitor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjoin.go
268 lines (249 loc) · 8.73 KB
/
join.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
package pipeline
import (
"encoding/json"
"fmt"
"strings"
"time"
"github.com/influxdata/influxdb/influxql"
)
// defaultJoinDelimiter is the separator newJoinNode installs as the initial
// JoinNode.Delimiter, i.e. the text placed between a prefix name and a field name.
const defaultJoinDelimiter = "."
// Joins the data from any number of nodes.
// As each data point is received from a parent node it is paired
// with the next data points from the other parent nodes with a
// matching timestamp. Each parent node contributes at most one point
// to each joined point. A tolerance can be supplied to join points
// that do not have perfectly aligned timestamps.
// Any points that fall within the tolerance are joined on the timestamp.
// If multiple points fall within the same tolerance window then they are joined in the order
// they arrive.
//
// Aliases are used to prefix all fields from the respective nodes.
//
// The join can be an inner or outer join, see the JoinNode.Fill property.
//
// Example:
//    var errors = stream
//        |from()
//            .measurement('errors')
//    var requests = stream
//        |from()
//            .measurement('requests')
//    // Join the errors and requests streams
//    errors
//        |join(requests)
//            // Provide prefix names for the fields of the data points.
//            .as('errors', 'requests')
//            // points that are within 1 second are considered the same time.
//            .tolerance(1s)
//            // fill missing values with 0, implies outer join.
//            .fill(0.0)
//            // name the resulting stream
//            .streamName('error_rate')
//            // treat a delete from one side of the join as a delete to all sides
//            .deleteAll(TRUE)
//        // Both the "value" fields from each parent have been prefixed
//        // with the respective names 'errors' and 'requests'.
//        |eval(lambda: "errors.value" / "requests.value")
//            .as('rate')
//        ...
//
// In the above example the `errors` and `requests` streams are joined
// and then transformed to calculate a combined field.
type JoinNode struct {
// Embedded chainnode; the `json:"-"` tag keeps it out of the serialized form.
chainnode `json:"-"`
// The alias names of the two parents.
// Note:
//       Names[1] corresponds to the left  parent
//       Names[0] corresponds to the right parent
// NOTE(review): the index-to-parent mapping above cannot be confirmed from this
// file; validate() only requires one non-empty, unique name per parent.
// tick:ignore
Names []string `tick:"As" json:"as"`
// The dimensions on which to join
// tick:ignore
Dimensions []string `tick:"On" json:"on"`
// The delimiter for the field name prefixes.
// Can be the empty string.
// newJoinNode initializes it to defaultJoinDelimiter (".").
Delimiter string `json:"delimiter"`
// Deletes both sides of the join regardless of which
// side receives the delete message.
DeleteAll bool `json:"deleteAll"`
// The name of this new joined data stream.
// If empty the name of the left parent is used.
StreamName string `json:"streamName"`
// The maximum duration of time that two incoming points
// can be apart and still be considered to be equal in time.
// The joined data point's time will be rounded to the nearest
// multiple of the tolerance duration.
Tolerance time.Duration `json:"tolerance"`
// Fill the data.
// The fill option implies the type of join: inner or full outer
// Options are:
//
// - none - (default) skip rows where a point is missing, inner join.
// - null - fill missing points with null, full outer join.
// - Any numerical value - fill fields with given value, full outer join.
//
// When using a numerical or null fill, the field names are determined by copying
// the field names from another point.
// This doesn't work well when different sources have different field names.
// Use the DefaultNode and DeleteNode to finalize the fill operation if necessary.
//
// Example:
//    var maintlock = stream
//        |from()
//            .measurement('maintlock')
//            .groupBy('service')
//    var requests = stream
//        |from()
//            .measurement('requests')
//            .groupBy('service')
//    // Join the maintlock and requests streams
//    // The intent is to drop any points in maintenance mode.
//    maintlock
//        |join(requests)
//            // Provide prefix names for the fields of the data points.
//            .as('maintlock', 'requests')
//            // points that are within 1 second are considered the same time.
//            .tolerance(1s)
//            // fill missing fields with null, implies outer join.
//            // a better default per field will be set later.
//            .fill('null')
//            // name the resulting stream.
//            .streamName('requests')
//        |default()
//            // default maintenance mode to false, overwriting the null value if present.
//            .field('maintlock.mode', false)
//            // default the requests to 0, again overwriting the null value if present.
//            .field('requests.value', 0.0)
//        // drop any points that are in maintenance mode.
//        |where(lambda: "maintlock.mode")
//        |...
Fill interface{} `json:"fill"`
}
// newJoinNode builds a JoinNode named "join" and registers it as a child of
// every node in parents.
//
// e is handed to newBasicChainNode for both of its edge-type arguments
// (presumably the wanted and provided edge types; confirm against
// newBasicChainNode). The node starts with Delimiter set to
// defaultJoinDelimiter; all other properties keep their zero values.
func newJoinNode(e EdgeType, parents []Node) *JoinNode {
	node := new(JoinNode)
	node.chainnode = newBasicChainNode("join", e, e)
	node.Delimiter = defaultJoinDelimiter

	// Attach the new node beneath each parent, in the order given.
	for i := range parents {
		parents[i].linkChild(node)
	}
	return node
}
// MarshalJSON encodes the JoinNode as JSON.
//
// The output carries the TypeOf header (type "join" and the node's ID), the
// JSON-tagged fields of JoinNode, and "tolerance" rendered as a duration
// string by influxql.FormatDuration instead of the raw time.Duration number.
// tick:ignore
func (n *JoinNode) MarshalJSON() ([]byte, error) {
	// Alias has JoinNode's fields but not its methods, so encoding it does
	// not recurse back into this MarshalJSON.
	type Alias JoinNode

	// The outer Tolerance sits at a shallower depth than Alias.Tolerance and
	// therefore wins the "tolerance" key.
	type envelope struct {
		TypeOf
		*Alias
		Tolerance string `json:"tolerance"`
	}

	out := envelope{
		TypeOf: TypeOf{
			Type: "join",
			ID:   n.ID(),
		},
		Alias:     (*Alias)(n),
		Tolerance: influxql.FormatDuration(n.Tolerance),
	}
	return json.Marshal(&out)
}
// UnmarshalJSON decodes JSON into a JoinNode.
//
// The JSON-tagged fields are decoded straight into n, "tolerance" is read as
// a duration string and parsed with influxql.ParseDuration, and the node ID
// is taken from the TypeOf header. An error is returned when data is not
// valid JSON for this shape, when the header type is not "join", or when the
// tolerance string does not parse. Because decoding writes directly into n,
// n may be partially modified when an error is returned.
// tick:ignore
func (n *JoinNode) UnmarshalJSON(data []byte) error {
	// Alias drops JoinNode's methods so decoding does not re-enter this
	// UnmarshalJSON.
	type Alias JoinNode

	// The outer string Tolerance shadows Alias.Tolerance for the
	// "tolerance" key.
	type envelope struct {
		TypeOf
		*Alias
		Tolerance string `json:"tolerance"`
	}

	in := envelope{Alias: (*Alias)(n)}
	if err := json.Unmarshal(data, &in); err != nil {
		return err
	}
	if in.Type != "join" {
		return fmt.Errorf("error unmarshaling node %d of type %s as JoinNode", in.ID, in.Type)
	}

	var err error
	if n.Tolerance, err = influxql.ParseDuration(in.Tolerance); err != nil {
		return err
	}
	n.setID(in.ID)
	return nil
}
// Prefix names for all fields from the respective nodes.
// Each field from the parent nodes will be prefixed with the provided name
// followed by the node's Delimiter, which is '.' unless it has been changed.
// See the example on JoinNode.
//
// A name must not contain the Delimiter; with the default delimiter that
// means the names cannot have a dot '.' character.
//
// The given names replace any previously set names, and the node itself is
// returned so that property calls can be chained.
//
// tick:property
func (j *JoinNode) As(names ...string) *JoinNode {
	j.Names = names
	return j
}
// Join on a subset of the group by dimensions.
// This is a special case where you want a single point from one parent to join with multiple
// points from a different parent.
//
// For example given two measurements:
//
// 1. building_power (a single value) -- tagged by building, value is the total power consumed by the building.
// 2. floor_power (multiple values) -- tagged by building and floor, values are the total power consumed by each floor.
//
// You want to calculate the percentage of the total building power consumed by each floor.
// Since you only have one point per building you need it to join multiple times with
// the points from each floor. By defining the `on` dimensions as `building` we are saying
// that we want points that only have the building tag to be joined with more specific points that
// have more tags, in this case the `floor` tag. In other words while we have points with tags
// building and floor we only want to join on the building tag.
//
// Example:
//    var building = stream
//        |from()
//            .measurement('building_power')
//            .groupBy('building')
//    var floor = stream
//        |from()
//            .measurement('floor_power')
//            .groupBy('building', 'floor')
//    building
//        |join(floor)
//            .as('building', 'floor')
//            .on('building')
//        |eval(lambda: "floor.value" / "building.value")
//            ... // Values here are grouped by 'building' and 'floor'
//
// The given dimensions replace any previously set ones, and the node itself
// is returned so that property calls can be chained.
//
// tick:property
func (j *JoinNode) On(dims ...string) *JoinNode {
	j.Dimensions = dims
	return j
}
// validate checks that the prefix names supplied through as() can be used
// for this join.
//
// It returns an error, checked in this order, when:
//   - no names were supplied,
//   - the number of names differs from the number of parents,
//   - any name is empty,
//   - any name contains the Delimiter (skipped when the Delimiter is empty),
//   - any name appears more than once.
//
// The first failing check determines the returned error; nil means the
// names are acceptable.
func (j *JoinNode) validate() error {
	if len(j.Names) == 0 {
		return fmt.Errorf("a call to join.as() is required to specify the output stream prefixes")
	}
	if len(j.Names) != len(j.Parents()) {
		return fmt.Errorf("number of prefixes specified by join.as() must match the number of joined streams")
	}

	for _, name := range j.Names {
		if name == "" {
			return fmt.Errorf("must provide a prefix name for the join node, see .as() property method")
		}
		// Every string contains the empty string, so the delimiter check
		// only makes sense when a delimiter is actually configured.
		if j.Delimiter != "" && strings.Contains(name, j.Delimiter) {
			return fmt.Errorf("cannot use name %s as field prefix, it contains the delimiter %q", name, j.Delimiter)
		}
	}

	// Duplicates are detected in a separate pass so that an empty or
	// delimiter-containing name anywhere in the list is reported first.
	seen := make(map[string]struct{}, len(j.Names))
	for _, name := range j.Names {
		if _, dup := seen[name]; dup {
			return fmt.Errorf("cannot use the same prefix name see .as() property method")
		}
		seen[name] = struct{}{}
	}
	return nil
}