# This is an example configuration file for Pivot. Here you can add data sources as well as configure Pivot settings
# You can start from this sample config by running `cp config.yaml.sample config.yaml`
# The port on which the Pivot server will listen
port: 9090
# Run in verbose mode and print the queries sent to the server
#verbose: true
# A Druid broker node that can serve data (only used if you have a Druid based data source)
druidHost: localhost:8082
# A timeout for the Druid queries in ms (default: 30000 = 30 seconds)
#timeout: 30000
# The data sources that you have configured; these will appear, in order, inside the navigation menu of Pivot
# In general there can be two types of 'engine':
# - native: a JSON file that is crunched within plywood itself (useful for small datasets and testing)
# - druid: a Druid dataSource
dataSources:
# Here we have a data source based on a single day of Wikipedia
- name: static-wiki # This will go into the URL so no fancy characters allowed
# This is the title that will grace this data source in the menus
title: Static Wikipedia
# Use the native engine; all calculations will be done in Node.js. Good for up to 100k rows of data.
engine: native
# The file representing the datasource relative to repo root
source: assets/data/wikiticker-2015-09-12-sampled.json
# This datasource was scraped using https://github.com/implydata/wikiticker
# GitHub does not like large files so only a sampled file is checked in
# There is also a non-sampled file with the filter isAnonymous == true applied. To use it, set:
# source: assets/data/wikiticker-2015-09-12-anonymous.json
# Run `assets/data-raw/process-wikiticker-2015-09-12` to get the full example file
# The subset filter that is applied to the data. This essentially restricts the data to only the rows that match the filter
# This is useful if you are providing Pivot to people who should have restricted access. Must be an AND of INs (or just a single IN)
#subsetFilter: $channel.in(["#en.wikipedia", "#de.wikipedia"])
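# As a sketch, the "AND of INs" form mentioned above would chain .and() like so (the namespace
# values here are assumptions for illustration):
#subsetFilter: $channel.in(["#en.wikipedia", "#de.wikipedia"]).and($namespace.in(["Main"]))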
# The refresh rule describes how often the data source looks for new data. Default: 'query'/PT1M (every minute)
# In this case it has to be fixed since this data source is static
refreshRule:
rule: fixed # also possible: 'query' and 'realtime'
time: 2015-09-13T00:00:00Z
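# For comparison, a minimal sketch of the 'query' rule for a source that keeps receiving new data
# (per the note above, it then checks for new data every minute by default):
#refreshRule:
#  rule: query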
# The default timezone for this dataset to operate in (defaults to UTC)
#defaultTimezone: Asia/Kathmandu
# The default duration for the time filter (if not set P3D is used)
defaultDuration: P1D
# The default filter in the UI, must be an AND of INs (or just a single IN)
#defaultFilter: $channel.in(["#en.wikipedia", "#de.wikipedia"])
# The default sort measure name (if not set the first measure name is used)
defaultSortMeasure: delta
# The names of dimensions that are pinned by default (in the order they will appear in the pin bar)
defaultPinnedDimensions: ["channel", "namespace", "isRobot"]
# How the dataset should be introspected
# possible options are:
# * none - Do not do any introspection, take what is written in the config as the rule of law.
# * no-autofill - Introspect the datasource but do not automatically generate dimensions or measures
# * autofill-dimensions-only - Introspect the datasource, automatically generate dimensions only
# * autofill-measures-only - Introspect the datasource, automatically generate measures only
# * autofill-all - (default) Introspect the datasource, automatically generate dimensions and measures
introspection: autofill-dimensions-only
# The list of dimensions defined in the UI. The order here will be reflected in the UI
dimensions:
# A general dimension looks like so:
# - name: channel
# ^ the name of the dimension as used in the URL (you should try not to change these)
#
# title: The Channel
# ^ (optional) the human-readable title. If not set, a title is generated from the 'name'
#
# kind: string
# ^ (optional) the kind of the dimension. Can be 'string', 'time', 'number', or 'boolean'. Defaults to 'string'
#
# expression: $channel
# ^ (optional) the Plywood bucketing expression for this dimension. Defaults to '$name'
# if, say, channel were called 'cnl' in the data, you would put '$cnl' here
# See also the expressions API reference: https://github.com/implydata/plywood/blob/master/docs/expressions.md
- name: time
kind: time
- name: channel
- name: cityName
- name: comment
- name: countryIso
title: Country ISO
expression: $countryIsoCode
- name: countryName
- name: isAnonymous
- name: isMinor
- name: isNew
- name: isRobot
- name: isUnpatrolled
- name: metroCode
- name: namespace
- name: page
- name: regionIso
title: Region ISO
expression: $regionIsoCode
- name: regionName
- name: user
# The list of measures defined in the UI. The order here will be reflected in the UI
measures:
# A general measure looks like so:
#
# - name: avg_revenue
# ^ the name of the measure as used in the URL (you should try not to change these)
#
# title: Average Revenue
# ^ (optional) the human-readable title. If not set, a title is generated from the 'name'
#
# expression: $main.average($revenue)
# ^ (optional) the Plywood aggregate expression for this measure. Defaults to '$main.sum($name)'
# this is the place to define your fancy formulas
- name: count
title: Rows
expression: $main.count()
- name: delta
- name: avg_delta
expression: $main.average($delta)
- name: added
- name: avg_added
expression: $main.average($added)
- name: deleted
- name: avg_deleted
expression: $main.average($deleted)
- name: unique_users
title: Unique Users
expression: $main.countDistinct($user)
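# As a sketch of a fancier formula, a measure can also aggregate over a filtered subset of the
# data; the name and title below are hypothetical:
# - name: en_added
#   title: Added (English)
#   expression: $main.filter($channel == '#en.wikipedia').sum($added)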
# Here is an example of a Druid data source; this one is taken from the Druid wikipedia demo
# It will work for you if you have set up the demo Wikipedia Editstream scraper
- name: wiki
title: Wikipedia Edits
engine: druid # Set the engine to druid
source: wikipedia # The druid dataSource
# This is a real-time data source, so it is always assumed to be up to date
# if this assumption is not true, use 'query' instead
refreshRule:
rule: realtime
# All the dimensions will be automatically filled in from the data source
dimensions:
# Here are some cool derived dimension examples that you might want to experiment with
# Create a boolean predicate dimension
- name: is-english
expression: $language == 'en'
# Extract a RegExp match from a dimension; here we look for the first number in the user attribute
- name: user-number
expression: $user.extract('(\d+)')
# Get a substring; here we take the first letter of the user's name
- name: user-first-letter
expression: $user.substr(0, 1)
# Use a Druid query time lookup to transform a dimension with a lookup
# Read more about Druid QTLs here: http://druid.io/docs/latest/querying/lookups.html
- name: language
expression: $language.lookup('wikipedia-language-lookup')
measures:
- name: count
- name: avg_delta
expression: $main.sum($delta) / $main.sum($count)
- name: avg_added
expression: $main.sum($added) / $main.sum($count)
- name: avg_deleted
expression: $main.sum($deleted) / $main.sum($count)
- name: distinct_users
expression: $main.countDistinct($user_unique)
# Here is an example of a Druid data source with some crazy things in it, for education
# In this example custom aggregations are passed directly to Druid; this is useful if
# you have custom sketches or are trying to do something that Plywood does not (yet) support.
# If you use this for something other than a custom sketch, I would appreciate it if you could
# file an issue in Plywood (https://github.com/implydata/plywood).
- name: wiki-crazy
title: Wikipedia Crazy
engine: druid
source: wikipedia
# Avoid creating automatic dimensions from the data
introspection: no-autofill
options:
customAggregations:
boring:
#accessType <-- this is how this aggregate will be accessed from a postAgg (default is 'fieldAccess')
aggregation:
type: longSum
fieldName: added
mod1337:
aggregation:
type: javascript
fieldNames: ['added']
fnAggregate: "function(current, added) { return (current + added) % 1337 }"
fnCombine: "function(partialA, partialB) { return (partialA + partialB) % 1337 }"
fnReset: "function() { return 0; }"
dimensions:
- name: time
kind: time
- name: namespace
- name: language
- name: page
- name: user
- name: is-english
expression: $language == 'en'
- name: user-number
expression: $user.extract('(\d+)')
- name: user-first-letter
expression: $user.substr(0, 1)
measures:
- name: count
- name: added
- name: boring_added
# Using the custom aggregation 'boring' defined above
expression: $main.custom(boring)
- name: added1337
expression: $main.custom(mod1337)
- name: combined
# Custom aggregates can be used in mathematical expressions
expression: ($main.custom(boring) - $main.custom(mod1337)) / 1337