forked from influxdata/influxdb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtx.go
546 lines (477 loc) · 16.5 KB
/
tx.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
package influxdb
import (
"fmt"
"math"
"sort"
"time"
"github.com/boltdb/bolt"
"github.com/influxdb/influxdb/influxql"
)
// tx represents a transaction that spans multiple shard data stores.
// This transaction will open and close all data stores atomically.
type tx struct {
	// server is the Server whose databases, measurements and data nodes are
	// consulted while planning a query.
	server *Server

	// now is the transaction's notion of the current time. It is used as the
	// upper time bound of a query that does not specify one.
	now time.Time

	// measurement is the measurement most recently resolved by
	// CreateMapReduceJobs. It is read by FieldIDs, which is only used in a raw
	// query, and a raw query won't let you select from more than one
	// measurement.
	measurement *Measurement

	// decoder decodes raw field data for measurement. It is read by
	// DecodeValues.
	decoder fieldDecoder
}
// newTx returns a new tx bound to server, with its current time initialized
// to the wall-clock time at the moment of the call.
func newTx(server *Server) *tx {
	t := new(tx)
	t.server = server
	t.now = time.Now()
	return t
}
// SetNow overrides the time the transaction treats as "now".
func (tx *tx) SetNow(now time.Time) {
	tx.now = now
}
// CreateMapReduceJobs builds the MapReduceJobs needed to execute stmt.
//
// For every measurement source in the statement it validates the referenced
// database, retention policy, measurement, fields and tags, determines the
// shard groups overlapping the statement's time range, and then creates one
// job per tag set. Each job carries one mapper per shard that holds series of
// that tag set: a LocalMapper when the shard's store is available on this
// server, otherwise a RemoteMapper balanced over the shard's data nodes.
//
// tagKeys is passed through to the measurement's tag set computation.
//
// The returned jobs are sorted so that results are produced in a deterministic
// order. An error is returned when a source, database, retention policy,
// measurement, field or tag cannot be resolved, when an aggregate call does not
// reference a field, or when a numerical aggregate targets a non-numeric or
// unknown field.
//
// As a side effect tx.measurement and tx.decoder are set to the measurement
// resolved for the most recently processed source.
func (tx *tx) CreateMapReduceJobs(stmt *influxql.SelectStatement, tagKeys []string) ([]*influxql.MapReduceJob, error) {
	// shardSeries collects the series of one tag set that live on one shard.
	// filters[i] always belongs to ids[i], because mappers look filters up by
	// the position of the series.
	type shardSeries struct {
		ids     []uint64
		filters []influxql.Expr
	}

	jobs := []*influxql.MapReduceJob{}
	for _, src := range stmt.Sources {
		mm, ok := src.(*influxql.Measurement)
		if !ok {
			return nil, fmt.Errorf("invalid source type: %#v", src)
		}

		// Find database and retention policy.
		db := tx.server.databases[mm.Database]
		if db == nil {
			return nil, ErrDatabaseNotFound(mm.Database)
		}
		rp := db.policies[mm.RetentionPolicy]
		if rp == nil {
			return nil, ErrRetentionPolicyNotFound
		}

		// Find measurement.
		m, err := tx.server.measurement(mm.Database, mm.Name)
		if err != nil {
			return nil, err
		}
		if m == nil {
			return nil, ErrMeasurementNotFound(influxql.QuoteIdent([]string{mm.Database, "", mm.Name}...))
		}

		tx.measurement = m
		tx.decoder = NewFieldCodec(m)

		// Validate the fields and tags asked for exist and keep track of which
		// are in the select vs the where.
		var selectFields []*Field
		var whereFields []*Field
		var selectTags []string

		for _, n := range stmt.NamesInSelect() {
			if f := m.FieldByName(n); f != nil {
				selectFields = append(selectFields, f)
				continue
			}
			if !m.HasTagKey(n) {
				return nil, fmt.Errorf("unknown field or tag name in select clause: %s", n)
			}
			selectTags = append(selectTags, n)
		}
		for _, n := range stmt.NamesInWhere() {
			if n == "time" {
				continue
			}
			if f := m.FieldByName(n); f != nil {
				whereFields = append(whereFields, f)
				continue
			}
			if !m.HasTagKey(n) {
				return nil, fmt.Errorf("unknown field or tag name in where clause: %s", n)
			}
		}

		// If a numerical aggregate is requested, ensure it is only performed on
		// numeric data.
		for _, a := range stmt.FunctionCalls() {
			if len(a.Args) == 0 {
				return nil, fmt.Errorf("aggregate call didn't contain a field %s", a.String())
			}
			lit, ok := a.Args[0].(*influxql.VarRef)
			if !ok {
				return nil, fmt.Errorf("aggregate call didn't contain a field %s", a.String())
			}
			if !influxql.IsNumeric(a) {
				continue
			}
			// The referenced name may be a tag key (which passes the select
			// validation above) rather than a field, so the lookup can fail.
			f := m.FieldByName(lit.Val)
			if f == nil {
				return nil, fmt.Errorf("aggregate '%s' requires numerical field values. '%s' is not a field",
					a.Name, lit.Val)
			}
			if f.Type != influxql.Float && f.Type != influxql.Integer {
				return nil, fmt.Errorf("aggregate '%s' requires numerical field values. Field '%s' is of type %s",
					a.Name, f.Name, f.Type)
			}
		}

		// Grab time range from statement, defaulting the bounds to the epoch
		// and the transaction's current time.
		tmin, tmax := influxql.TimeRange(stmt.Condition)
		if tmax.IsZero() {
			tmax = tx.now
		}
		if tmin.IsZero() {
			tmin = time.Unix(0, 0)
		}

		// Find shard groups within time range.
		var shardGroups []*ShardGroup
		for _, group := range rp.shardGroups {
			if group.Contains(tmin, tmax) {
				shardGroups = append(shardGroups, group)
			}
		}
		// NOTE(review): this returns no jobs at all, including jobs already
		// built for earlier sources. Kept as-is to preserve existing behavior.
		if len(shardGroups) == 0 {
			return nil, nil
		}

		// Get the group by interval, if there is one.
		d, err := stmt.GroupByInterval()
		if err != nil {
			return nil, err
		}
		interval := d.Nanoseconds()

		// Get the sorted unique tag sets for this query.
		tagSets, err := m.tagSets(stmt, tagKeys)
		if err != nil {
			return nil, err
		}

		for _, t := range tagSets {
			// Make a job for each tagset.
			job := &influxql.MapReduceJob{
				MeasurementName: m.Name,
				TagSet:          t,
				TMin:            tmin.UnixNano(),
				TMax:            tmax.UnixNano(),
			}

			// Make a mapper for each shard that must be hit. We may need to hit
			// multiple shards within a shard group.
			var mappers []influxql.Mapper

			for _, sg := range shardGroups {
				// Split the tag set's series by owning shard, carrying each
				// series' filter along so the two stay index-aligned.
				shards := map[*Shard]*shardSeries{}
				for i, sid := range t.SeriesIDs {
					shard := sg.ShardBySeriesID(sid)
					ss := shards[shard]
					if ss == nil {
						ss = &shardSeries{}
						shards[shard] = ss
					}
					var filter influxql.Expr
					if i < len(t.Filters) {
						filter = t.Filters[i]
					}
					ss.ids = append(ss.ids, sid)
					ss.filters = append(ss.filters, filter)
				}

				for shard, ss := range shards {
					var mapper influxql.Mapper

					// Create either a remote or local mapper for this shard.
					if shard.store == nil {
						nodes := tx.server.DataNodesByID(shard.DataNodeIDs)
						if len(nodes) == 0 {
							return nil, ErrShardNotFound
						}

						remote := &RemoteMapper{
							dataNodes:       NewDataNodeBalancer(nodes),
							Database:        mm.Database,
							MeasurementName: m.Name,
							TMin:            tmin.UnixNano(),
							TMax:            tmax.UnixNano(),
							SeriesIDs:       ss.ids,
							ShardID:         shard.ID,
							WhereFields:     whereFields,
							SelectFields:    selectFields,
							SelectTags:      selectTags,
							Limit:           stmt.Limit,
							Offset:          stmt.Offset,
							Interval:        interval,
						}
						remote.SetFilters(ss.filters)
						mapper = remote
					} else {
						mapper = &LocalMapper{
							seriesIDs:    ss.ids,
							db:           shard.store,
							job:          job,
							decoder:      NewFieldCodec(m),
							filters:      ss.filters,
							whereFields:  whereFields,
							selectFields: selectFields,
							selectTags:   selectTags,
							tmin:         tmin.UnixNano(),
							tmax:         tmax.UnixNano(),
							interval:     interval,
							// Multiple mappers may need to be merged together to
							// get the results for a raw query. So each mapper
							// will have to read at least the limit plus the
							// offset in data points to ensure we've hit our mark.
							limit: uint64(stmt.Limit) + uint64(stmt.Offset),
						}
					}

					mappers = append(mappers, mapper)
				}
			}

			job.Mappers = mappers
			jobs = append(jobs, job)
		}
	}

	// Always return them in sorted order so the results from running the jobs
	// are returned in a deterministic order.
	sort.Sort(influxql.MapReduceJobs(jobs))
	return jobs, nil
}
// DecodeValues is for use in a raw data query. It returns a row whose first
// element is timestamp, followed by the value decoded from data for each of
// fieldIDs, in order. A field that fails to decode contributes whatever value
// the decoder returned alongside the error; the error itself is dropped.
func (tx *tx) DecodeValues(fieldIDs []uint8, timestamp int64, data []byte) []interface{} {
	row := make([]interface{}, 1, len(fieldIDs)+1)
	row[0] = timestamp
	for _, id := range fieldIDs {
		// Best effort: a decode error leaves that column at the decoder's
		// returned value rather than failing the whole row.
		decoded, _ := tx.decoder.DecodeByID(id, data)
		row = append(row, decoded)
	}
	return row
}
// FieldIDs resolves the variable references in fields to the IDs of the
// matching fields on the transaction's current measurement. It returns
// ErrFieldNotFound as soon as a referenced name is not a field.
func (tx *tx) FieldIDs(fields []*influxql.Field) ([]uint8, error) {
	names := tx.fieldNames(fields)
	ids := make([]uint8, 0, len(names))
	for _, name := range names {
		f := tx.measurement.FieldByName(name)
		if f == nil {
			return nil, ErrFieldNotFound
		}
		ids = append(ids, f.ID)
	}
	return ids, nil
}
// fieldNames collects the names of the fields whose expression is a plain
// variable reference, preserving their order. Fields with any other kind of
// expression are skipped. The result is nil when nothing matches.
func (tx *tx) fieldNames(fields []*influxql.Field) []string {
	var names []string
	for _, field := range fields {
		// Only bare references (as found in a raw query) name a database field.
		ref, isRef := field.Expr.(*influxql.VarRef)
		if !isRef {
			continue
		}
		names = append(names, ref.Val)
	}
	return names
}
// LocalMapper implements the influxql.Mapper interface for running map tasks
// over a shard that is local to this server.
type LocalMapper struct {
	// cursorsEmpty lets us know if the cursors are empty.
	cursorsEmpty bool
	// decoder decodes the raw data bytes.
	decoder fieldDecoder
	// filters holds the filter for each series; it is indexed like seriesIDs.
	filters []influxql.Expr
	// cursors holds the bolt cursor for each series id; an entry is nil when
	// the series has no bucket in this shard.
	cursors []*bolt.Cursor
	// seriesIDs are the series to be read from this shard.
	seriesIDs []uint64
	// db is the bolt store for the shard accessed by this mapper.
	db *bolt.DB
	// txn is the read transaction opened on db by Open.
	txn *bolt.Tx
	// job is the MRJob this mapper belongs to.
	job *influxql.MapReduceJob
	// mapFunc is the map func.
	mapFunc influxql.MapFunc
	// fieldID is the field ID associated with the mapFunc currently being run.
	fieldID uint8
	// fieldName is the field name associated with the mapFunc currently being run.
	fieldName string
	// keyBuffer holds the current timestamp key for each cursor; 0 marks an
	// exhausted or missing cursor.
	keyBuffer []int64
	// valueBuffer holds the current value for each cursor.
	valueBuffer [][]byte
	// tmin is the min of the current group by interval being iterated over.
	tmin int64
	// tmax is the max of the current group by interval being iterated over.
	tmax int64
	// additionalNames are additional field or tag names that might be
	// requested from the map function.
	additionalNames []string
	// whereFields are the fields that occur in the where clause.
	whereFields []*Field
	// selectFields are the fields that occur in the select clause.
	selectFields []*Field
	// selectTags are the tag keys that occur in the select clause.
	selectTags []string
	// isRaw is true if the query is a non-aggregate query.
	isRaw bool
	// interval is the group by interval of the query, if any.
	interval int64
	// limit is used for raw queries for LIMIT; it counts down the points that
	// may still be returned.
	limit uint64
	// perIntervalLimit is used for raw queries to determine how far into a
	// chunk we are.
	perIntervalLimit int
	// chunkSize is used for raw queries to determine how much data to read
	// before flushing to client.
	chunkSize int
}
// Open starts a read-only transaction on the shard's store and creates one
// cursor per series ID. A series with no bucket in this shard keeps a nil
// cursor at its position. It returns the error from beginning the
// transaction, if any.
func (l *LocalMapper) Open() error {
	readTx, err := l.db.Begin(false)
	if err != nil {
		return err
	}
	l.txn = readTx

	// Cursors are positional: cursors[i] reads the series seriesIDs[i].
	cursors := make([]*bolt.Cursor, len(l.seriesIDs))
	for i := range l.seriesIDs {
		if bucket := readTx.Bucket(u64tob(l.seriesIDs[i])); bucket != nil {
			cursors[i] = bucket.Cursor()
		}
	}
	l.cursors = cursors

	return nil
}
// Close releases the read transaction opened by Open. It is safe to call on a
// mapper that was never opened, whose Open failed, or that was already closed;
// in those cases it does nothing.
func (l *LocalMapper) Close() {
	// Without a transaction there is nothing to release; calling Rollback on
	// a nil transaction would panic.
	if l.txn == nil {
		return
	}

	// The transaction is read-only, so rolling back merely releases it; the
	// returned error carries no actionable information here.
	_ = l.txn.Rollback()
	l.txn = nil
}
// Begin sets up the mapper to run the map function for the given aggregate
// call, starting at startingTime.
//
// A nil call denotes a raw data query: the mapper is switched to raw mode, an
// unset limit is replaced by the maximum value, and a specific field is only
// selected when exactly one field is in the select clause. For an aggregate
// call the field is taken from the call's first argument, which must be a
// variable reference.
//
// chunkSize is the number of points a raw query reads per interval.
//
// It returns an error when the map function cannot be initialized, when the
// aggregate call's first argument is missing or is not a field reference, or
// when the selected name is not a field known to the decoder.
func (l *LocalMapper) Begin(c *influxql.Call, startingTime int64, chunkSize int) error {
	mapFunc, err := influxql.InitializeMapFunc(c)
	if err != nil {
		return err
	}
	l.mapFunc = mapFunc

	// Set up the buffers. These ensure that we return data in time order.
	l.keyBuffer = make([]int64, len(l.cursors))
	l.valueBuffer = make([][]byte, len(l.cursors))
	l.chunkSize = chunkSize
	l.tmin = startingTime

	// Determine if this is a raw data query with a single field, multiple
	// fields, or an aggregate.
	var fieldName string
	if c == nil { // it's a raw data query
		l.isRaw = true
		if len(l.selectFields) == 1 {
			fieldName = l.selectFields[0].Name
		}

		// If they haven't set a limit, just set it to the max int size.
		if l.limit == 0 {
			l.limit = math.MaxUint64
		}
	} else {
		// Guard the argument access: an empty argument list or a non-reference
		// argument would otherwise panic.
		if len(c.Args) == 0 {
			return fmt.Errorf("aggregate call didn't contain a field %s", c.String())
		}
		lit, ok := c.Args[0].(*influxql.VarRef)
		if !ok {
			return fmt.Errorf("aggregate call didn't contain a field %s", c.String())
		}
		fieldName = lit.Val
	}

	// Set up the field info if a specific field was set for this mapper.
	if fieldName != "" {
		f := l.decoder.FieldByName(fieldName)
		if f == nil {
			return fmt.Errorf("%s isn't a field on measurement %s", fieldName, l.job.MeasurementName)
		}
		l.fieldID = f.ID
		l.fieldName = f.Name
	}

	// Seek the bolt cursors and fill the buffers. The buffers were just
	// allocated, so positions that are skipped keep their zero key and nil
	// value, which mark them as having no data.
	for i, cur := range l.cursors {
		// This series may have never been written in this shard group (time
		// range) so the cursor would be nil.
		if cur == nil {
			continue
		}

		k, v := cur.Seek(u64tob(uint64(l.job.TMin)))
		if k == nil {
			continue
		}

		l.cursorsEmpty = false
		l.keyBuffer[i] = int64(btou64(k))
		l.valueBuffer[i] = v
	}

	return nil
}
// NextInterval runs the map function over the next interval and returns its
// result. This is a forward only operation from the start time passed into
// Begin. It returns nil once the cursors are empty or the interval start has
// moved past the job's max time.
//
// For a raw query each call reads up to chunkSize points; for a query with a
// group by interval each call covers one interval, aligned to interval
// boundaries.
func (l *LocalMapper) NextInterval() (interface{}, error) {
	if l.cursorsEmpty || l.tmin > l.job.TMax {
		return nil, nil
	}

	// Start of the interval that follows the one about to be mapped.
	upcoming := l.tmin + l.interval

	switch {
	case l.isRaw:
		// Raw queries are bounded by point count rather than by time.
		l.perIntervalLimit = l.chunkSize
	case l.interval > 0:
		// The first interval in a query with a group by may be smaller than
		// the others. This happens when the where clause starts in the middle
		// of a group by bucket, in which case tmin is not a multiple of the
		// interval and the next start has to snap to the following boundary.
		if l.tmin%l.interval != 0 {
			upcoming = l.tmin/l.interval*l.interval + l.interval
		}
		l.tmax = upcoming - 1
	}

	// Execute the map function. This local mapper acts as the iterator.
	result := l.mapFunc(l)

	// The cursors are empty once no buffered key is left.
	exhausted := true
	for i := range l.keyBuffer {
		if l.keyBuffer[i] != 0 {
			exhausted = false
			break
		}
	}
	l.cursorsEmpty = exhausted

	// Raw queries advance through the limit instead of moving the interval.
	if !l.isRaw {
		l.tmin = upcoming
	}

	return result, nil
}
// Next returns the next matching timestamped value for the LocalMapper, taken
// from whichever cursor currently holds the smallest timestamp inside the
// current [tmin, tmax] window. Points that fail to decode or are rejected by
// their series filter are skipped.
//
// It returns zero values and a nil value when the current interval holds no
// more data, or, for a raw query, when the overall limit or the per-interval
// limit has been used up.
func (l *LocalMapper) Next() (seriesID uint64, timestamp int64, value interface{}) {
	for {
		// If it's a raw query and we've hit the limit of the number of points
		// to read in for either this chunk or for the absolute query, bail.
		if l.isRaw && (l.limit == 0 || l.perIntervalLimit == 0) {
			return 0, 0, nil
		}

		// Pick the cursor with the earliest buffered key inside the window.
		// On equal keys the first cursor wins.
		idx, earliest := -1, int64(math.MaxInt64)
		for i, key := range l.keyBuffer {
			if key == 0 || key > l.tmax || key >= earliest || key < l.tmin {
				continue
			}
			idx, earliest = i, key
		}

		// Nothing left in this group by interval.
		if idx == -1 {
			return 0, 0, nil
		}

		timestamp = l.keyBuffer[idx]
		seriesID = l.seriesIDs[idx]
		raw := l.valueBuffer[idx]

		// Decode either the value, or values we need. Also filter if necessary.
		var (
			val       interface{}
			decodeErr error
		)
		if l.isRaw && len(l.selectFields) > 1 {
			// Raw query over several fields: hand back every field by name. A
			// decode failure leaves val nil so the point is skipped below.
			fields, err := l.decoder.DecodeFieldsWithNames(raw)
			if err == nil {
				val = fields
				if f := l.filters[idx]; f != nil && !matchesWhere(f, fields) {
					val = nil
				}
			}
		} else {
			val, decodeErr = l.decoder.DecodeByID(l.fieldID, raw)

			if f := l.filters[idx]; f != nil {
				if len(l.whereFields) == 1 && l.whereFields[0].ID == l.fieldID {
					// The where clause only concerns the field already decoded.
					if !matchesWhere(f, map[string]interface{}{l.fieldName: val}) {
						val = nil
					}
				} else {
					// The where clause needs other fields: decode everything.
					fields, err := l.decoder.DecodeFieldsWithNames(raw)
					if err != nil || !matchesWhere(f, fields) {
						val = nil
					}
				}
			}
		}

		// Advance the cursor and refill its buffer slots; key 0 marks it done.
		nextKey, nextVal := l.cursors[idx].Next()
		if nextKey != nil {
			l.keyBuffer[idx] = int64(btou64(nextKey))
		} else {
			l.keyBuffer[idx] = 0
		}
		l.valueBuffer[idx] = nextVal

		// If the value didn't match our filter or if we didn't find the field
		// keep iterating.
		if decodeErr != nil || val == nil {
			continue
		}

		// If it's a raw query, we always limit the amount we read in.
		if l.isRaw {
			l.limit--
			l.perIntervalLimit--
		}

		return seriesID, timestamp, val
	}
}
// IsEmpty reports whether the mapper has nothing left to read up to and
// including tmax: the cursors are marked empty, the limit is used up, or no
// cursor has a buffered timestamp at or before tmax.
func (l *LocalMapper) IsEmpty(tmax int64) bool {
	if l.limit == 0 || l.cursorsEmpty {
		return true
	}

	// A buffered key of 0 marks a cursor without data; any other key at or
	// before tmax means there is still something to read.
	for i := range l.keyBuffer {
		if key := l.keyBuffer[i]; key != 0 && key <= tmax {
			return false
		}
	}

	return true
}
// matchesWhere returns true if evaluating the where clause f against fields
// yields the boolean true. Any other result, including a non-boolean one,
// counts as no match.
func matchesWhere(f influxql.Expr, fields map[string]interface{}) bool {
	matched, _ := influxql.Eval(f, fields).(bool)
	return matched
}
// fieldDecoder is what the transaction and the local mapper need in order to
// look up fields and decode raw point data.
type fieldDecoder interface {
	// FieldByName returns the field with the given name; callers treat a nil
	// result as "no such field".
	FieldByName(name string) *Field

	// DecodeByID decodes, from the raw point data b, the value of the field
	// identified by fieldID.
	DecodeByID(fieldID uint8, b []byte) (interface{}, error)

	// DecodeFieldsWithNames decodes the raw point data b into a map that
	// callers use as field name to value when evaluating where clauses.
	DecodeFieldsWithNames(b []byte) (map[string]interface{}, error)
}