Skip to content

Commit

Permalink
Update prometheus-cpp and Metric API unit tests (triton-inference-ser…
Browse files Browse the repository at this point in the history
…ver#115)

* Add more cases to metric api unit test

* Add more metrics API unit tests for duplicate metric family / metric entries and update label hasher for latest prometheus version

* Add MetricFamily::Unregister(), add ref count tests to unit tests

* cleanup

* Gracefully handle out of order deletion of MetricFamily before Metric

* Add check for specific error message on out-of-order deletion

* Remove unregistration logic, add unit test that fetches a core metric

* Cleanup

* cleanup

* Add more documentation on metric/mf delete and labels ownership

* Add lock on child_metrics.erase, add helper for metric api calls in unit test

* Invalidate metric on metric family delete, add unit test for calling metric APIs after MetricFamilyDelete

* cleanup errors

* Add check on MetricFamilyDelete to enforce ordering, add test using triton::core implementation to force deletion and test use of Metric after MetricFamilyDelete

* Add note on errors in Metric/MF delete

* Undo import removal

* Test tracking of child metrics in family, add MetricFamily::NumMetrics()
  • Loading branch information
rmccorm4 authored Aug 18, 2022
1 parent 0830a7d commit b49d790
Show file tree
Hide file tree
Showing 8 changed files with 661 additions and 145 deletions.
11 changes: 10 additions & 1 deletion include/triton/core/tritonserver.h
Original file line number Diff line number Diff line change
Expand Up @@ -2267,6 +2267,9 @@ TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricFamilyNew(
const char* name, const char* description);

/// Delete a metric family object.
/// A TRITONSERVER_MetricFamily* object should be deleted AFTER its
/// corresponding TRITONSERVER_Metric* objects have been deleted.
/// Attempting to delete a family before its metrics will return an error.
///
/// \param family The metric family object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
Expand All @@ -2275,7 +2278,10 @@ TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricFamilyDelete(

/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object.
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in. Each label can be deleted
/// immediately after creating the metric with TRITONSERVER_ParameterDelete
/// if not re-using the labels.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
Expand All @@ -2287,6 +2293,9 @@ TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricNew(
const TRITONSERVER_Parameter** labels, const uint64_t label_count);

/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
/// If a family is deleted before its metrics, an error will be returned.
///
/// \param metric The metric object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
Expand Down
98 changes: 80 additions & 18 deletions src/metric_family.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,62 +62,74 @@ MetricFamily::MetricFamily(
}

void*
MetricFamily::Add(std::map<std::string, std::string> label_map)
MetricFamily::Add(std::map<std::string, std::string> label_map, Metric* metric)
{
void* metric = nullptr;
void* prom_metric = nullptr;
switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
auto counter_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Counter>*>(family_);
auto counter_ptr = &counter_family_ptr->Add(label_map);
metric = reinterpret_cast<void*>(counter_ptr);
prom_metric = reinterpret_cast<void*>(counter_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
auto gauge_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Gauge>*>(family_);
auto gauge_ptr = &gauge_family_ptr->Add(label_map);
metric = reinterpret_cast<void*>(gauge_ptr);
prom_metric = reinterpret_cast<void*>(gauge_ptr);
break;
}
default:
throw std::invalid_argument(
"Unsupported family kind passed to Metric constructor.");
}

std::lock_guard<std::mutex> lk(mtx_);
++metric_ref_cnt_[metric];
return metric;
std::lock_guard<std::mutex> lk(metric_mtx_);
++prom_metric_ref_cnt_[prom_metric];
child_metrics_.insert(metric);
return prom_metric;
}

void
MetricFamily::Remove(void* metric)
MetricFamily::Remove(void* prom_metric, Metric* metric)
{
{
std::lock_guard<std::mutex> lk(mtx_);
const auto it = metric_ref_cnt_.find(metric);
if (it != metric_ref_cnt_.end()) {
// Remove reference to dependent Metric object
std::lock_guard<std::mutex> lk(metric_mtx_);
child_metrics_.erase(metric);
}

if (prom_metric == nullptr) {
return;
}

{
std::lock_guard<std::mutex> lk(metric_mtx_);
const auto it = prom_metric_ref_cnt_.find(prom_metric);
if (it != prom_metric_ref_cnt_.end()) {
--it->second;
if (it->second == 0) {
metric_ref_cnt_.erase(it);
prom_metric_ref_cnt_.erase(it);
} else {
// Done as it is not the last reference
return;
}
}
}

switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
auto counter_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Counter>*>(family_);
auto counter_ptr = reinterpret_cast<prometheus::Counter*>(metric);
auto counter_ptr = reinterpret_cast<prometheus::Counter*>(prom_metric);
counter_family_ptr->Remove(counter_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
auto gauge_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Gauge>*>(family_);
auto gauge_ptr = reinterpret_cast<prometheus::Gauge*>(metric);
auto gauge_ptr = reinterpret_cast<prometheus::Gauge*>(prom_metric);
gauge_family_ptr->Remove(gauge_ptr);
break;
}
Expand All @@ -128,10 +140,27 @@ MetricFamily::Remove(void* metric)
}
}

void
MetricFamily::InvalidateReferences()
{
std::lock_guard<std::mutex> lk(metric_mtx_);
for (auto& metric : child_metrics_) {
if (metric != nullptr) {
metric->Invalidate();
}
}
child_metrics_.clear();
}

MetricFamily::~MetricFamily()
{
// NOTE: registry->Remove() not added until until prometheus-cpp v1.0 which
// we do not currently install
if (NumMetrics() > 0) {
LOG_WARNING << "MetricFamily was deleted before its child Metrics, this "
"should not happen. Make sure to delete all child Metrics "
"before deleting their MetricFamily.";
}
InvalidateReferences();
// DLIS-4072: Support for removing metric families from registry
}

//
Expand All @@ -158,17 +187,38 @@ Metric::Metric(
std::string(reinterpret_cast<const char*>(param->ValuePointer()));
}

metric_ = family_->Add(label_map);
metric_ = family_->Add(label_map, this);
}

Metric::~Metric()
{
family_->Remove(metric_);
if (family_ != nullptr) {
family_->Remove(metric_, this);
} else {
LOG_WARNING << "Corresponding MetricFamily was deleted before this Metric, "
"this should not happen. Make sure to delete a Metric "
"before deleting its MetricFamily.";
}
// Catch lifetime management / invalid reference issues
Invalidate();
}

void
Metric::Invalidate()
{
family_ = nullptr;
metric_ = nullptr;
}

TRITONSERVER_Error*
Metric::Value(double* value)
{
if (metric_ == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Could not get metric value. Metric has been invalidated.");
}

switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
auto counter_ptr = reinterpret_cast<prometheus::Counter*>(metric_);
Expand Down Expand Up @@ -196,6 +246,12 @@ Metric::Value(double* value)
TRITONSERVER_Error*
Metric::Increment(double value)
{
if (metric_ == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Could not increment metric value. Metric has been invalidated.");
}

switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
if (value < 0.0) {
Expand Down Expand Up @@ -234,6 +290,12 @@ Metric::Increment(double value)
TRITONSERVER_Error*
Metric::Set(double value)
{
if (metric_ == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Could not set metric value. Metric has been invalidated.");
}

switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
return TRITONSERVER_ErrorNew(
Expand Down
32 changes: 26 additions & 6 deletions src/metric_family.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#ifdef TRITON_ENABLE_METRICS

#include <mutex>
#include <set>
#include <unordered_map>

#include "infer_parameter.h"
Expand All @@ -39,6 +40,7 @@ namespace triton { namespace core {
//
// Implementation for TRITONSERVER_MetricFamily.
//
class Metric;
class MetricFamily {
public:
MetricFamily(
Expand All @@ -48,19 +50,33 @@ class MetricFamily {
void* Family() const { return family_; }
TRITONSERVER_MetricKind Kind() const { return kind_; }

void* Add(std::map<std::string, std::string> label_map);
void Remove(void* metric);
void* Add(std::map<std::string, std::string> label_map, Metric* metric);
void Remove(void* prom_metric, Metric* metric);

int NumMetrics()
{
std::lock_guard<std::mutex> lk(metric_mtx_);
return child_metrics_.size();
}

private:
// If a MetricFamily is deleted before its dependent Metric, we want to
// invalidate the reference so we don't access invalid memory.
void InvalidateReferences();

void* family_;
std::mutex mtx_;
// prometheus returns the existing metric pointer if the metric with the same
TRITONSERVER_MetricKind kind_;
// Synchronize access of related metric objects
std::mutex metric_mtx_;
// Prometheus returns the existing metric pointer if the metric with the same
// set of labels are requested, as a result, different Metric objects may
// refer to the same prometheus metric. So we must track the reference count
// of the metric and request prometheus to remove it only when all references
// are released.
std::unordered_map<void*, size_t> metric_ref_cnt_;
TRITONSERVER_MetricKind kind_;
std::unordered_map<void*, size_t> prom_metric_ref_cnt_;
// Maintain references to metrics created from this metric family to
// invalidate their references if a family is deleted before its metric
std::set<Metric*> child_metrics_;
};

//
Expand All @@ -80,6 +96,10 @@ class Metric {
TRITONSERVER_Error* Increment(double value);
TRITONSERVER_Error* Set(double value);

// If a MetricFamily is deleted before its dependent Metric, we want to
// invalidate the references so we don't access invalid memory.
void Invalidate();

private:
void* metric_;
MetricFamily* family_;
Expand Down
4 changes: 3 additions & 1 deletion src/metrics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,12 @@ Metrics::Metrics()
{
}

static prometheus::detail::LabelHasher label_hasher_;

size_t
Metrics::HashLabels(const std::map<std::string, std::string>& labels)
{
return prometheus::detail::hash_labels(labels);
return label_hasher_(labels);
}

Metrics::~Metrics()
Expand Down
2 changes: 2 additions & 0 deletions src/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include <atomic>
#include <mutex>
#include <thread>
#include "prometheus/counter.h"
#include "prometheus/gauge.h"
#include "prometheus/registry.h"
#include "prometheus/serializer.h"
#include "prometheus/text_serializer.h"
Expand Down
7 changes: 7 additions & 0 deletions src/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,12 @@ if(${TRITON_ENABLE_METRICS})
add_executable(
metrics_api_test
metrics_api_test.cc
../metric_family.cc
../metric_family.h
../metrics.cc
../metrics.h
../infer_parameter.cc
../infer_parameter.h
)

set_target_properties(
Expand Down Expand Up @@ -465,6 +471,7 @@ if(${TRITON_ENABLE_METRICS})
PRIVATE
triton-common-error # from repo-common
triton-common-logging # from repo-common
proto-library # from repo-common
triton-core
GTest::gtest
GTest::gtest_main
Expand Down
Loading

0 comments on commit b49d790

Please sign in to comment.