# This is an example configuration file for Pivot. Here you can add data sources as well as configure Pivot settings
# You can start from this sample config by running `cp config.yaml.sample config.yaml`
# The port on which the Pivot server will listen
port: 9090
# Run in verbose mode and print the queries sent to the server
#verbose: true
# A Druid broker node that can serve data (only used if you have a Druid based data source)
druidHost: localhost:8082
# A timeout for the Druid queries in ms (default: 30000 = 30 seconds)
#timeout: 30000
# The data sources that you have configured; these will appear, in order, inside the navigation menu of Pivot
# In general there can be two types of 'engine':
# - native: a JSON file that is crunched within plywood itself (useful for small datasets and testing)
# - druid: a Druid dataSource
dataSources:
# Here we have a data source based on a single day of Wikipedia
- name: static-wiki # This will go into the URL so no fancy characters allowed
# This is the title that will grace this data source in the menus
title: Static Wikipedia
# Use the native engine; all calculations will be done in Node.js. Good for up to 100k rows of data.
engine: native
# The file representing the datasource relative to repo root
source: assets/data/wikiticker-2015-09-12-sampled.json
# This datasource was scraped using https://github.com/implydata/wikiticker
# GitHub does not like large files so only a sampled file is checked in
# There is also a non-sampled file with the filter isAnonymous == true applied. To use it, set:
# source: assets/data/wikiticker-2015-09-12-anonymous.json
# Run `assets/data-raw/process-wikiticker-2015-09-12` to get the full example file
# The subset filter that is applied to the data. This essentially restricts the data to only the rows that match the filter
# This is useful if you are providing Pivot to people who should have restricted access. Must be an AND of INs (or just a single IN)
#subsetFilter: $channel.in(["#en.wikipedia", "#de.wikipedia"])
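# As a sketch, the "AND of INs" form mentioned above would chain .and() like so (the namespace
# values here are assumptions for illustration):
#subsetFilter: $channel.in(["#en.wikipedia", "#de.wikipedia"]).and($namespace.in(["Main"]))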
# The refresh rule describes how often the data source looks for new data. Default: 'query'/PT1M (every minute)
# In this case it has to be fixed since this data source is static
refreshRule:
rule: fixed # also possible: 'query' and 'realtime'
time: 2015-09-13T00:00:00Z
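# For comparison, a minimal sketch of the 'query' rule for a source that keeps receiving new data
# (per the note above, it then checks for new data every minute by default):
#refreshRule:
#  rule: query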
# The default timezone for this dataset to operate in (defaults to UTC)
#defaultTimezone: Asia/Kathmandu
# The default duration for the time filter (if not set P3D is used)
defaultDuration: P1D
# The default filter in the UI, must be an AND of INs (or just a single IN)
#defaultFilter: $channel.in(["#en.wikipedia", "#de.wikipedia"])
# The default sort measure name (if not set the first measure name is used)
defaultSortMeasure: delta
# The names of dimensions that are pinned by default (in the order they will appear in the pin bar)
defaultPinnedDimensions: ["channel", "namespace", "isRobot"]
# How the dataset should be introspected
# possible options are:
# * none - Do not do any introspection, take what is written in the config as the rule of law.
# * no-autofill - Introspect the datasource but do not automatically generate dimensions or measures
# * autofill-dimensions-only - Introspect the datasource, automatically generate dimensions only
# * autofill-measures-only - Introspect the datasource, automatically generate measures only
# * autofill-all - (default) Introspect the datasource, automatically generate dimensions and measures
introspection: autofill-dimensions-only
# The list of dimensions defined in the UI. The order here will be reflected in the UI
dimensions:
# A general dimension looks like so:
# - name: channel
# ^ the name of the dimension as used in the URL (you should try not to change these)
#
# title: The Channel
# ^ (optional) the human-readable title. If not set, a title is generated from the 'name'
#
# kind: string
# ^ (optional) the kind of the dimension. Can be 'string', 'time', 'number', or 'boolean'. Defaults to 'string'
#
# expression: $channel
# ^ (optional) the Plywood bucketing expression for this dimension. Defaults to '$name'
# if, say, channel were called 'cnl' in the data, you would put '$cnl' here
# See also the expressions API reference: https://github.com/implydata/plywood/blob/master/docs/expressions.md
- name: time
kind: time
- name: channel
- name: cityName
- name: comment
- name: countryIso
title: Country ISO
expression: $countryIsoCode
- name: countryName
- name: isAnonymous
- name: isMinor
- name: isNew
- name: isRobot
- name: isUnpatrolled
- name: metroCode
- name: namespace
- name: page
- name: regionIso
title: Region ISO
expression: $regionIsoCode
- name: regionName
- name: user
# The list of measures defined in the UI. The order here will be reflected in the UI
measures:
# A general measure looks like so:
#
# - name: avg_revenue
# ^ the name of the measure as used in the URL (you should try not to change these)
#
# title: Average Revenue
# ^ (optional) the human-readable title. If not set, a title is generated from the 'name'
#
# expression: $main.average($revenue)
# ^ (optional) the Plywood aggregate expression for this measure. Defaults to '$main.sum($name)'
# this is the place to define your fancy formulas
- name: count
title: Rows
expression: $main.count()
- name: delta
- name: avg_delta
expression: $main.average($delta)
- name: added
- name: avg_added
expression: $main.average($added)
- name: deleted
- name: avg_deleted
expression: $main.average($deleted)
- name: unique_users
title: Unique Users
expression: $main.countDistinct($user)
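# As a sketch of a fancier formula, a measure can also aggregate over a filtered subset of the
# data; the name and title below are hypothetical:
# - name: en_added
#   title: Added (English)
#   expression: $main.filter($channel == '#en.wikipedia').sum($added)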
# Here is an example of a Druid data source; this one is taken from the Druid wikipedia demo
# It will work for you if you have set up the demo Wikipedia Editstream scraper
- name: wiki
title: Wikipedia Edits
engine: druid # Set the engine to druid
source: wikipedia # The druid dataSource
# This is a real-time data source, so it is always assumed to be up to date
# if this assumption is not true, use 'query' instead
refreshRule:
rule: realtime
# All the dimensions will be automatically filled in from the data source
dimensions:
# Here are some cool derived dimension examples that you might want to experiment with
# Create a boolean predicate dimension
- name: is-english
expression: $language == 'en'
# Extract a RegExp match from a dimension; here we look for the first number in the user attribute
- name: user-number
expression: $user.extract('(\d+)')
# Get a substring; here we take the first letter of the user's name
- name: user-first-letter
expression: $user.substr(0, 1)
# Use a Druid query time lookup to transform a dimension with a lookup
# Read more about Druid QTLs here: http://druid.io/docs/latest/querying/lookups.html
- name: language
expression: $language.lookup('wikipedia-language-lookup')
measures:
- name: count
- name: avg_delta
expression: $main.sum($delta) / $main.sum($count)
- name: avg_added
expression: $main.sum($added) / $main.sum($count)
- name: avg_deleted
expression: $main.sum($deleted) / $main.sum($count)
- name: distinct_users
expression: $main.countDistinct($user_unique)
# Here is an example of a Druid data source with some crazy things in it, for education
# In this example custom aggregations are passed directly to Druid; this is useful if
# you have custom sketches or are trying to do something that Plywood does not (yet) support.
# If you use this for something other than a custom sketch, I would appreciate it if you could
# file an issue in Plywood (https://github.com/implydata/plywood).
- name: wiki-crazy
title: Wikipedia Crazy
engine: druid
source: wikipedia
# Avoid creating automatic dimensions from the data
introspection: no-autofill
options:
customAggregations:
boring:
#accessType <-- this is how this aggregate will be accessed from a postAgg (default is 'fieldAccess')
aggregation:
type: longSum
fieldName: added
mod1337:
aggregation:
type: javascript
fieldNames: ['added']
fnAggregate: "function(current, added) { return (current + added) % 1337 }"
fnCombine: "function(partialA, partialB) { return (partialA + partialB) % 1337 }"
fnReset: "function() { return 0; }"
dimensions:
- name: time
kind: time
- name: namespace
- name: language
- name: page
- name: user
- name: is-english
expression: $language == 'en'
- name: user-number
expression: $user.extract('(\d+)')
- name: user-first-letter
expression: $user.substr(0, 1)
measures:
- name: count
- name: added
- name: boring_added
# Using the custom aggregation 'boring' defined above
expression: $main.custom(boring)
- name: added1337
expression: $main.custom(mod1337)
- name: combined
# Custom aggregates can be used in mathematical expressions
expression: ($main.custom(boring) - $main.custom(mod1337)) / 1337