Skip to content

Commit

Permalink
Merge pull request zabbix#164 from SkySai1/main
Browse files Browse the repository at this point in the history
New template for gathering stats of NVidia GPU for Windows client
  • Loading branch information
oscar120584 authored Feb 16, 2023
2 parents 434a07e + 1426b3c commit 6409405
Show file tree
Hide file tree
Showing 3 changed files with 372 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# NVidia Sensors

## Overview

This template integrates NVIDIA SMI (`nvidia-smi`) statistics for a single graphics card into Zabbix on Windows clients.


The template adds monitoring of:


* GPU Utilisation
* GPU Memory Utilisation
* GPU Power Consumption
* GPU Memory (Used, Free, Total)
* GPU Temperature
* GPU Shutdown Temperature (the temperature at which the GPU shuts down)
* GPU Fan Speed

## Original author

Richard Kavanagh

## Co-author
Victor Krestov

## Macros used

There are no macros used in this template.

## Template links

There are no template links in this template.

## Discovery rules

There are no discovery rules in this template.

## Items collected

|Name|Description|Type|Key and additional info|
|----|-----------|----|----|
|GPU Power|<p>-</p>|`Zabbix agent`|gpu.power<p>Update: 30</p>|
|GPU Free Memory|<p>-</p>|`Zabbix agent`|gpu.free<p>Update: 30</p>|
|GPU Utilisation|<p>-</p>|`Zabbix agent`|gpu.utilisation<p>Update: 30</p>|
|GPU Memory Utilisation|<p>-</p>|`Zabbix agent`|gpu.memutil<p>Update: 30</p>|
|GPU Total Memory|<p>-</p>|`Zabbix agent`|gpu.memtotal<p>Update: 30</p>|
|GPU Temperature|<p>-</p>|`Zabbix agent`|gpu.temp<p>Update: 30</p>|
|GPU Shutdown Temperature|<p>-</p>|`Zabbix agent`|gpu.shutdowntemp<p>Update: 30</p>|
|GPU Used Memory|<p>-</p>|`Zabbix agent`|gpu.used<p>Update: 30</p>|
|GPU Fan Speed|<p>-</p>|`Zabbix agent`|gpu.fanspeed<p>Update: 30</p>|


## Triggers

|Name|Description|Expression|Priority|
|----|-----------|----------|--------|
|GPU Temperature over 95c {HOSTNAME}|<p>-</p>|<p>**Expression**: last(/NVidia Sensors/gpu.temp,#2)>95</p>|Average|
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# NVIDIA GPU metrics for the Zabbix agent on Windows (GPU index 0 only).
#
# Most keys print the raw matching line(s) of `nvidia-smi -q`; the template's
# preprocessing step extracts the first number from that text.
# Note: findstr is case-sensitive unless /i is given, and without /c: it matches
# the word anywhere on a line, so some keys below can return more than one line.

# Current and shutdown temperature ("GPU Current Temp" / "GPU Shutdown Temp" lines).
UserParameter=gpu.temp[*],nvidia-smi -q --gpu=0 |findstr /c:"GPU Current Temp"
UserParameter=gpu.shutdowntemp[*],nvidia-smi -q --gpu=0 |findstr /c:"GPU Shutdown Temp"

# Memory figures. "Total", "Used" and "Free" may match more than one line of the
# report; the template relies on the first number in the returned text.
UserParameter=gpu.memtotal[*],nvidia-smi -q --gpu=0 |findstr Total
UserParameter=gpu.used[*],nvidia-smi -q --gpu=0 |findstr Used
UserParameter=gpu.free[*],nvidia-smi -q --gpu=0 |findstr Free

# Fan speed.
UserParameter=gpu.fanspeed[*],nvidia-smi -q --gpu=0 |findstr Fan

# Power consumption. This key used to grep "Power Limit", which reports the
# configured power cap rather than the power actually drawn; query power.draw
# directly so the item matches its "GPU Power Consumption" meaning.
UserParameter=gpu.power[*],nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i 0

# GPU and memory utilisation (the "Gpu" and "Memory ... %" lines of the report).
UserParameter=gpu.utilisation[*],nvidia-smi -q --gpu=0 |findstr /c:"Gpu"
UserParameter=gpu.memutil[*],nvidia-smi -q --gpu=0 |findstr /c:"Memory"|findstr /c:"%"
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
# Zabbix template export (format version 6.2) defining the 'NVidia Sensors' template:
# agent items keyed gpu.*, one temperature trigger and four graphs.
zabbix_export:
version: '6.2'
date: '2022-11-05T18:19:10Z'
template_groups:
-
uuid: 7df96b18c230490a9a0a9e2307226338
name: Templates
# Template definition. The description below is imported into Zabbix verbatim.
# NOTE(review): the UserParameter examples embedded in the description use
# `--query-gpu` and list no entries for gpu.memutil or gpu.shutdowntemp, although
# items with those keys are defined in this template - confirm against the agent
# configuration that is actually deployed.
templates:
-
uuid: a021d16dab6741fdb0852d16df32b1f1
template: 'NVidia Sensors'
name: 'NVidia Sensors'
description: |
## Overview
This template integrates NVidia SMI for a single graphics card with Zabbix.
The template adds monitoring of:
* GPU Utilisation
* GPU Power Consumption
* GPU Memory (Used, Free, Total)
* GPU Temperature
* GPU Fan Speed
The following agent parameters can be used to add the metrics into Zabbix.
UserParameter=gpu.temp,nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits -i 0
UserParameter=gpu.memtotal,nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i 0
UserParameter=gpu.used,nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0
UserParameter=gpu.free,nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i 0
UserParameter=gpu.fanspeed,nvidia-smi --query-gpu=fan.speed --format=csv,noheader,nounits -i 0
UserParameter=gpu.utilisation,nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i 0
UserParameter=gpu.power,nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i 0
## Author
Richard Kavanagh
groups:
-
name: Templates
items:
-
  # Fan speed in percent, polled every 30 seconds.
  uuid: 1859c0a8b79346ecb9da26f04a939a55
  name: 'GPU Fan Speed'
  key: gpu.fanspeed
  delay: '30'
  value_type: FLOAT
  units: '%'
  preprocessing:
    -
      type: JAVASCRIPT
      parameters:
        - |
          // Extract the first number (integer or decimal) from the agent output.
          var match = /[0-9]+(?:\.[0-9]+)?/.exec(value);
          if (match === null) {
              // Fail with a readable message instead of a TypeError on null.
              throw 'gpu.fanspeed: no numeric value in agent output: ' + value;
          }
          return Number(match[0]);
  tags:
    -
      tag: Application
      value: Nvidia
uuid: 14bff6576d754b21a5acfc42b1403507
name: 'GPU Free Memory'
key: gpu.free
delay: '30'
value_type: FLOAT
units: B
preprocessing:
-
type: JAVASCRIPT
parameters:
- |
regexp = /[0-9]+/g;
var f = Number(regexp.exec(value)[0]);
return f;
-
type: MULTIPLIER
parameters:
- '1048576'
tags:
-
tag: Application
value: Nvidia
-
uuid: 11b6b2d40525492a872ab36a99b01d18
name: 'GPU Total Memory'
key: gpu.memtotal
delay: '30'
value_type: FLOAT
units: B
preprocessing:
-
type: JAVASCRIPT
parameters:
- |
regexp = /[0-9]+/g;
var f = Number(regexp.exec(value)[0]);
return f;
-
type: MULTIPLIER
parameters:
- '1048576'
tags:
-
tag: Application
value: Nvidia
-
uuid: 044e5cd338574ccda40b373095154336
name: 'GPU Memory Utilisation'
key: gpu.memutil
delay: '30'
value_type: FLOAT
units: '%'
preprocessing:
-
type: JAVASCRIPT
parameters:
- |
regexp = /[0-9]+/g;
var f = Number(regexp.exec(value)[0]);
return f;
tags:
-
tag: Application
value: Nvidia
-
uuid: 58a0c7ab4a9a4760b3a3fd5825969fd7
name: 'GPU Power'
key: gpu.power
delay: '30'
value_type: FLOAT
units: W
preprocessing:
-
type: JAVASCRIPT
parameters:
- |
regexp = /[0-9]+/g;
var f = Number(regexp.exec(value)[0]);
return f;
tags:
-
tag: Application
value: Nvidia
-
uuid: d52c4fa0f15b47ef837e43bd82cdfb7e
name: 'GPU Shutdown Temperature'
key: gpu.shutdowntemp
delay: '30'
value_type: FLOAT
units: C
preprocessing:
-
type: JAVASCRIPT
parameters:
- |
regexp = /[0-9]+/g;
var f = Number(regexp.exec(value)[0]);
return f;
tags:
-
tag: Application
value: Nvidia
-
uuid: ceea4ab8fbda48cfaad3590d4b19e115
name: 'GPU Temperature'
key: gpu.temp
delay: '30'
value_type: FLOAT
units: C
preprocessing:
-
type: JAVASCRIPT
parameters:
- |
regexp = /[0-9]+/g;
var f = Number(regexp.exec(value)[0]);
return f;
tags:
-
tag: Application
value: Nvidia
triggers:
-
uuid: d18b5023d7d14b6490406f5b402197c4
expression: 'last(/NVidia Sensors/gpu.temp,#2)>95'
name: 'GPU Temperature over 95c {HOSTNAME}'
priority: AVERAGE
-
  # Used GPU memory, polled every 30 seconds and stored in bytes.
  uuid: 4544799689e44836b975d433ec3c5ef4
  name: 'GPU Used Memory'
  key: gpu.used
  delay: '30'
  value_type: FLOAT
  units: B
  preprocessing:
    -
      type: JAVASCRIPT
      parameters:
        - |
          // Extract the first number (integer or decimal) from the agent output.
          var match = /[0-9]+(?:\.[0-9]+)?/.exec(value);
          if (match === null) {
              // Fail with a readable message instead of a TypeError on null.
              throw 'gpu.used: no numeric value in agent output: ' + value;
          }
          return Number(match[0]);
    -
      # Multiply by 1048576 (1024 * 1024) so the stored value matches units B.
      type: MULTIPLIER
      parameters:
        - '1048576'
  tags:
    -
      tag: Application
      value: Nvidia
-
  # GPU utilisation in percent, polled every 30 seconds.
  uuid: 1ae3048cdefd47839d1f76674777f280
  name: 'GPU Utilisation'
  key: gpu.utilisation
  delay: '30'
  value_type: FLOAT
  units: '%'
  preprocessing:
    -
      type: JAVASCRIPT
      parameters:
        - |
          // Extract the first number (integer or decimal) from the agent output.
          var match = /[0-9]+(?:\.[0-9]+)?/.exec(value);
          if (match === null) {
              // Fail with a readable message instead of a TypeError on null.
              throw 'gpu.utilisation: no numeric value in agent output: ' + value;
          }
          return Number(match[0]);
  tags:
    -
      tag: Application
      value: Nvidia
# Graph definitions. Every graph item refers to an item of the 'NVidia Sensors'
# template by its key.
graphs:
-
# GPU Memory: free, total and used memory drawn on one graph.
uuid: 1fdc4e107c4d4707b193443c6e16ea68
name: 'GPU Memory'
graph_items:
-
color: C80000
item:
host: 'NVidia Sensors'
key: gpu.free
-
sortorder: '1'
color: 00C800
item:
host: 'NVidia Sensors'
key: gpu.memtotal
-
sortorder: '2'
color: 0000C8
item:
host: 'NVidia Sensors'
key: gpu.used
-
# GPU Power: the gpu.power item on its own.
uuid: 387c249ac6834aa0b9edcf3bb70daa16
name: 'GPU Power'
graph_items:
-
color: C80000
item:
host: 'NVidia Sensors'
key: gpu.power
-
# GPU Temperature: temperature, with fan speed plotted against the right Y axis.
uuid: 0d8822621c85420fb6ff847e92312c4f
name: 'GPU Temperature'
graph_items:
-
color: C80000
item:
host: 'NVidia Sensors'
key: gpu.temp
-
sortorder: '1'
color: 0000EE
yaxisside: RIGHT
item:
host: 'NVidia Sensors'
key: gpu.fanspeed
-
# GPU Utilisation: utilisation, with power plotted against the right Y axis.
uuid: 82e6484d122e4e63a2b6cd6c8345b42d
name: 'GPU Utilisation'
graph_items:
-
color: C80000
item:
host: 'NVidia Sensors'
key: gpu.utilisation
-
sortorder: '1'
color: 33FF33
yaxisside: RIGHT
item:
host: 'NVidia Sensors'
key: gpu.power

0 comments on commit 6409405

Please sign in to comment.