Skip to content

Commit da930ae

Browse files
committed
observability: conditionally publish Go-runtime Prometheus metrics
* at startup: register a lightweight Go runtime collector wrapper
* introduce the Enable-Go-Runtime-Metrics feature flag:
  - scope: cluster
  - when enabled: call the Prometheus Go collector
----
* part two; prev. commit: 7e08a84 "register subset of go runtime metrics"

Signed-off-by: Alex Aizman <alex.aizman@gmail.com>
1 parent c04abe1 commit da930ae

8 files changed

Lines changed: 78 additions & 37 deletions

File tree

‎cmd/cli/cli/feat.go‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ var clusterFeatDesc = [...]string{
5050
"do not delete unrecognized/invalid FQNs during space cleanup ('ais space-cleanup')",
5151
"when bucket is n-way mirrored read object replica from the least-utilized mountpath",
5252
"count GET(object) 404 as errors (default: don't)",
53+
"publish selected Go runtime metrics via Prometheus",
5354

5455
// apc.ResetToken ("none") ===========
5556
}
@@ -81,6 +82,7 @@ var featTags = map[string]string{
8182
"Keep-Unknown-FQN": "integrity?,ops",
8283
"Load-Balance-GET": "perf",
8384
"Count-Object-NotFound-Stats": "telemetry,ops",
85+
"Enable-Go-Runtime-Metrics": "telemetry,ops,overhead",
8486
}
8587

8688
// common (cluster, bucket) feature-flags (set, show) helper

‎cmd/cli/go.mod‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ module github.com/NVIDIA/aistore/cmd/cli
33
go 1.26
44

55
require (
6-
github.com/NVIDIA/aistore v1.4.6
6+
github.com/NVIDIA/aistore v1.4.7-0.20260603160611-79a7e658c646
77
github.com/fatih/color v1.19.0
88
github.com/json-iterator/go v1.1.12
99
github.com/onsi/ginkgo/v2 v2.28.1

‎cmd/cli/go.sum‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
22
github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
33
github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
4-
github.com/NVIDIA/aistore v1.4.6 h1:k7W8NTDdp2/LREYIO1hxgJw3Enyj+tZ8IVSyCwIG/Vk=
5-
github.com/NVIDIA/aistore v1.4.6/go.mod h1:gQVg+bfLzebJFLhHTQ3cqyLI1EdVm32LzF1f1ETDYgc=
4+
github.com/NVIDIA/aistore v1.4.7-0.20260603160611-79a7e658c646 h1:qaUyyh2bERa1925tWOGkUkTl+FKdAPDKHH9qYYNTcW4=
5+
github.com/NVIDIA/aistore v1.4.7-0.20260603160611-79a7e658c646/go.mod h1:gQVg+bfLzebJFLhHTQ3cqyLI1EdVm32LzF1f1ETDYgc=
66
github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8=
77
github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q=
88
github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA=

‎cmn/feat/feat.go‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ const (
5050
KeepUnknownFQN // do not delete unrecognized/invalid FQNs during space cleanup ('ais space-cleanup')
5151
LoadBalanceGET // when bucket is n-way mirrored read object replica from the least-utilized mountpath
5252
CountObjectNotFoundStats // count GET(object) 404 (not-found) as errors (default: don't); TODO: add Prometheus to count HEAD(object) errors
53+
EnableGoRuntimeMetrics // publish selected Go runtime metrics via Prometheus
5354
)
5455

5556
var Cluster = [...]string{
@@ -78,6 +79,7 @@ var Cluster = [...]string{
7879
"Keep-Unknown-FQN",
7980
"Load-Balance-GET",
8081
"Count-Object-NotFound-Stats",
82+
"Enable-Go-Runtime-Metrics",
8183

8284
// apc.ResetToken ("none") ===========
8385
}

‎stats/api.go‎

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,9 @@ const (
3737
KindThroughput = "bw" // ditto (MB/s)
3838
)
3939

40-
// static labels
40+
// common static labels (see also: stats/go_runtime.go)
4141
const (
42-
ConstlabNode = "node_id"
43-
ConstlabComponent = "component"
42+
ConstlabNode = "node_id"
4443
)
4544

4645
// variable labels

‎stats/common.go‎

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,6 @@ func (r *runner) RegExtMetric(snode *meta.Snode, name, kind string, extra *Extra
188188
func (r *runner) regCommon(snode *meta.Snode) {
189189
initProm(snode)
190190

191-
regGoRuntime(snode.Type())
192-
193191
// basic counters
194192
r.reg(snode, GetCount, KindCounter,
195193
&Extra{

‎stats/common_prom.go‎

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ package stats
77

88
import (
99
"net/http"
10-
"regexp"
1110
"strings"
1211
ratomic "sync/atomic"
1312
"time"
@@ -17,7 +16,6 @@ import (
1716
"github.com/NVIDIA/aistore/memsys"
1817

1918
"github.com/prometheus/client_golang/prometheus"
20-
"github.com/prometheus/client_golang/prometheus/collectors"
2119
"github.com/prometheus/client_golang/prometheus/promhttp"
2220
)
2321

@@ -47,38 +45,14 @@ var (
4745
staticLabs = prometheus.Labels{ConstlabNode: ""}
4846
)
4947

48+
// 1) initialize Prometheus registry without default Go/process collectors
49+
// 2) register Go runtime collector wrapper - always registered; its Collect() is a no-op unless feat.EnableGoRuntimeMetrics is set
5050
func initProm(snode *meta.Snode) {
5151
promRegistry = prometheus.NewRegistry()
5252

5353
staticLabs[ConstlabNode] = strings.ReplaceAll(snode.ID(), ".", "_")
54-
}
5554

56-
// Expose a low-cardinality subset of Go runtime metrics on the AIS Prometheus registry:
57-
// - base collector (unconditional):
58-
// go_goroutines, go_threads, go_info, go_gc_duration_seconds, go_memstats_last_gc_time_seconds
59-
// - default runtime/metrics matcher:
60-
// go_gc_gogc_percent, go_gc_gomemlimit_bytes, go_sched_gomaxprocs_threads
61-
// - explicit picks via WithGoCollectorRuntimeMetrics below (one series each)
62-
func regGoRuntime(daeType string) {
63-
nodeLabs := prometheus.Labels{
64-
ConstlabNode: staticLabs[ConstlabNode],
65-
ConstlabComponent: daeType,
66-
}
67-
reg := prometheus.WrapRegistererWith(nodeLabs, promRegistry)
68-
reg.MustRegister(collectors.NewGoCollector(
69-
// skip legacy go_memstats_* block (21 series, mostly duplicative); use runtime/metrics names instead
70-
collectors.WithGoCollectorMemStatsMetricsDisabled(),
71-
collectors.WithGoCollectorRuntimeMetrics(
72-
collectors.GoRuntimeMetricsRule{Matcher: regexp.MustCompile(
73-
`^/memory/classes/heap/objects:bytes$` + // heap in use
74-
`|^/memory/classes/total:bytes$` + // total from OS
75-
`|^/gc/heap/goal:bytes$` + // next GC target
76-
`|^/gc/heap/allocs:bytes$` + // cumulative alloc bytes (counter)
77-
`|^/cpu/classes/gc/total:cpu-seconds$` + // cumulative GC CPU time
78-
`|^/gc/cycles/total:gc-cycles$`, // cumulative GC cycle count
79-
)},
80-
),
81-
))
55+
regGoRuntime(snode.Type())
8256
}
8357

8458
// usage: log resulting `copyValue` numbers:

‎stats/go_runtime.go‎

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// Package stats provides methods and functionality to register, track, log,
2+
// and export metrics that, for the most part, include "counter" and "latency" kinds.
3+
/*
4+
* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
5+
*/
6+
package stats
7+
8+
import (
9+
"regexp"
10+
11+
"github.com/NVIDIA/aistore/cmn"
12+
"github.com/NVIDIA/aistore/cmn/feat"
13+
14+
"github.com/prometheus/client_golang/prometheus"
15+
"github.com/prometheus/client_golang/prometheus/collectors"
16+
)
17+
18+
// additional static label for Go runtime
19+
const (
20+
ConstlabNodeType = "node_type"
21+
)
22+
23+
type goRuntimeCollector struct {
24+
collector prometheus.Collector
25+
}
26+
27+
var regexGoMetrics = regexp.MustCompile(
28+
`^/memory/classes/heap/objects:bytes$` + // heap in use
29+
`|^/memory/classes/total:bytes$` + // total from OS
30+
`|^/gc/heap/goal:bytes$` + // next GC target
31+
`|^/gc/heap/allocs:bytes$` + // cumulative alloc bytes
32+
`|^/cpu/classes/gc/total:cpu-seconds$`, // cumulative GC CPU time
33+
)
34+
35+
func regGoRuntime(nodeType string) {
36+
reg := prometheus.WrapRegistererWith(
37+
prometheus.Labels{
38+
ConstlabNode: staticLabs[ConstlabNode],
39+
ConstlabNodeType: nodeType,
40+
},
41+
promRegistry,
42+
)
43+
reg.MustRegister(newGoRuntimeCollector())
44+
}
45+
46+
func newGoRuntimeCollector() *goRuntimeCollector {
47+
return &goRuntimeCollector{
48+
collector: collectors.NewGoCollector(
49+
// skip go_memstats_* block; expose selected runtime/metrics instead.
50+
collectors.WithGoCollectorMemStatsMetricsDisabled(),
51+
collectors.WithGoCollectorRuntimeMetrics(
52+
collectors.GoRuntimeMetricsRule{Matcher: regexGoMetrics},
53+
),
54+
),
55+
}
56+
}
57+
58+
func (c *goRuntimeCollector) Describe(ch chan<- *prometheus.Desc) {
59+
c.collector.Describe(ch)
60+
}
61+
62+
func (c *goRuntimeCollector) Collect(ch chan<- prometheus.Metric) {
63+
if cmn.Rom.Features().IsSet(feat.EnableGoRuntimeMetrics) {
64+
c.collector.Collect(ch)
65+
}
66+
}

0 commit comments

Comments
 (0)