diff --git a/go.mod b/go.mod index 674311c8b9..2f2fb98f40 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,7 @@ require ( github.com/kubernetes-csi/external-snapshotter/client/v8 v8.0.0 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.19.1 + github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.55.0 github.com/prometheus/procfs v0.19.2 github.com/sirupsen/logrus v1.9.4 @@ -80,7 +81,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_model v0.6.1 // indirect github.com/spf13/cobra v1.8.1 // indirect github.com/tjfoc/gmsm v1.4.1 // indirect github.com/x448/float16 v0.8.4 // indirect diff --git a/pkg/metric/collector.go b/pkg/metric/collector.go index 6088dc57b1..d73068faa0 100644 --- a/pkg/metric/collector.go +++ b/pkg/metric/collector.go @@ -38,6 +38,7 @@ func registerCollector(collector string, factory collectorFactoryFunc, relatedDr type Collector interface { // Get new metrics and expose them via prometheus registry. Update(ch chan<- prometheus.Metric) error + Get() []*Metric } // CSICollector implements the prometheus.Collector interface. @@ -45,8 +46,7 @@ type CSICollector struct { Collectors map[string]Collector } -// newCSICollector method returns the CSICollector object -func newCSICollector(driverNames []string, serviceType utils.ServiceType) { +func initCSICollector(driverNames []string, serviceType utils.ServiceType) { if csiCollectorInstance != nil { return } @@ -67,7 +67,7 @@ func newCSICollector(driverNames []string, serviceType utils.ServiceType) { if enabled { collector, err := reg.Factory() if err != nil { - klog.ErrorS(err, "Failed to create collector") + klog.ErrorS(err, "Failed to create collector", "name", reg.Name) } else { collectors[reg.Name] = collector } @@ -98,6 +98,17 @@ func (csi CSICollector) Collect(ch chan<- prometheus.Metric) { wg.Wait() } +func GetMetrics(driverNames []string, serviceType utils.ServiceType) []*Metric { + if csiCollectorInstance == nil { + initCSICollector(driverNames, serviceType) + } + var metrics []*Metric + for _, c := range csiCollectorInstance.Collectors { + metrics = append(metrics, c.Get()...) + } + return metrics +} + func execute(name string, c Collector, ch chan<- prometheus.Metric) { defer func() { if err := recover(); err != nil { diff --git a/pkg/metric/common.go b/pkg/metric/common.go index 3af6d98e71..a7f33649d7 100644 --- a/pkg/metric/common.go +++ b/pkg/metric/common.go @@ -27,8 +27,6 @@ const ( //diskDriverName represents the csi storage type name of Disk diskDriverName string = "diskplugin.csi.alibabacloud.com" localDriverName string = "localplugin.csi.alibabacloud.com" - // unknown metric value - UnknownValue string = "unknown" ) const ( @@ -41,7 +39,6 @@ const ( diskSectorSize = 512 diskDefaultsLatencyThreshold = 10 diskDefaultsCapacityPercentageThreshold = 85 - nfsDefaultsCapacityPercentageThreshold = 85 nfsStatsFileName = "/proc/self/mountstats" latencyTooHigh = "LatencyTooHigh" capacityNotEnough = "NotEnoughDiskSpace" @@ -69,14 +66,7 @@ var ( ) type typedFactorDesc struct { - desc *prometheus.Desc + *MetaDesc valueType prometheus.ValueType factor float64 } - -func (d *typedFactorDesc) mustNewConstMetric(value float64, labels ...string) prometheus.Metric { - if d.factor != 0 { - value *= d.factor - } - return prometheus.MustNewConstMetric(d.desc, d.valueType, value, labels...) -} diff --git a/pkg/metric/csi_grpc_exec_time_collector.go b/pkg/metric/csi_grpc_exec_time_collector.go index cd97417645..8d95d89ad6 100644 --- a/pkg/metric/csi_grpc_exec_time_collector.go +++ b/pkg/metric/csi_grpc_exec_time_collector.go @@ -21,18 +21,28 @@ type csiGrpcExecTimeCollector struct { ExecTimeTotalMetric *prometheus.CounterVec } +var ( + execCountName = "execution_count" + execCountFQName = prometheus.BuildFQName(csiNamespace, grpcSubsystem, execCountName) + execCountHelp = "CSI grpc execution count." + + execTimeName = "execution_time_total" + execTimeFQName = prometheus.BuildFQName(csiNamespace, grpcSubsystem, execTimeName) + execTimeHelp = "CSI grpc execution time in total." +) + var CsiGrpcExecTimeCollector = csiGrpcExecTimeCollector{ ExecCountMetric: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: csiNamespace, Subsystem: grpcSubsystem, - Name: "execution_count", - Help: "CSI grpc execution count.", + Name: execCountName, + Help: execCountHelp, }, csiGrpcExecTimeLabels), ExecTimeTotalMetric: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: csiNamespace, Subsystem: grpcSubsystem, - Name: "execution_time_total", - Help: "CSI grpc execution time in total.", + Name: execTimeName, + Help: execTimeHelp, }, csiGrpcExecTimeLabels), } @@ -44,9 +54,17 @@ func GetCsiGrpcExecTimeCollector() (Collector, error) { return &CsiGrpcExecTimeCollector, nil } +func (c *csiGrpcExecTimeCollector) Get() []*Metric { + countMetrics := extractMetricsFromMetricVec(execCountFQName, execCountHelp, c.ExecCountMetric, prometheus.CounterValue) + timeMetrics := extractMetricsFromMetricVec(execTimeFQName, execTimeHelp, c.ExecTimeTotalMetric, prometheus.CounterValue) + return append(countMetrics, timeMetrics...) +} + func (c *csiGrpcExecTimeCollector) Update(ch chan<- prometheus.Metric) error { - c.ExecCountMetric.Collect(ch) - c.ExecTimeTotalMetric.Collect(ch) + metrics := c.Get() + for _, metric := range metrics { + ch <- prometheus.MustNewConstMetric(metric.Desc, metric.ValueType, metric.Value, convertLabelsToString(metric.VariableLabelPairs)...) + } return nil } diff --git a/pkg/metric/disk_stat_collector.go b/pkg/metric/disk_stat_collector.go index de4b85c522..96d52607fe 100644 --- a/pkg/metric/disk_stat_collector.go +++ b/pkg/metric/disk_stat_collector.go @@ -13,6 +13,7 @@ import ( "github.com/container-storage-interface/spec/lib/go/csi" "github.com/prometheus/client_golang/prometheus" + promdto "github.com/prometheus/client_model/go" "github.com/prometheus/procfs/blockdevice" "golang.org/x/sys/unix" v1 "k8s.io/api/core/v1" @@ -22,7 +23,6 @@ import ( "k8s.io/klog/v2" "k8s.io/mount-utils" - "github.com/kubernetes-sigs/alibaba-cloud-csi-driver/pkg/options" "github.com/kubernetes-sigs/alibaba-cloud-csi-driver/pkg/utils" ) @@ -32,17 +32,22 @@ var ( scalerPvcMap *sync.Map ) -func diskMetricDesc(name, help string) *prometheus.Desc { - return prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, name), - help, - diskStatLabelNames, diskStatConstLabels, - ) +func diskMetricDesc(name, help string) *MetaDesc { + return NewMetaDesc(nodeNamespace, volumeSubsystem, name, help, diskStatLabelNames, diskStatConstLabels) +} + +func convertLabelsToString(labelPairs []*promdto.LabelPair) []string { + result := make([]string, len(labelPairs)) + for i, pair := range labelPairs { + result[i] = *pair.Value + } + return result } // stats from /proc/diskstats var ( - diskReadsCompletedDesc = diskMetricDesc("read_completed_total", "The total number of reads completed successfully.") + diskReadsCompletedDesc = diskMetricDesc("read_completed_total", "The total number of reads completed successfully.") + diskReadsMergeDesc = diskMetricDesc("read_merged_total", "The total number of reads merged.") diskReadBytesDesc = diskMetricDesc("read_bytes_total", "The total number of bytes read successfully.") diskReadTimeMilliSecondsDesc = diskMetricDesc("read_time_milliseconds_total", "The total number of milliseconds spent by all reads.") @@ -57,9 +62,9 @@ var ( ) type capDescs struct { - Available *prometheus.Desc - Total *prometheus.Desc - Used *prometheus.Desc + Available *MetaDesc + Total *MetaDesc + Used *MetaDesc } var capStatDescs = map[csi.VolumeUsage_Unit]capDescs{ @@ -89,7 +94,7 @@ type diskStatCollector struct { lastPvDiskInfoMap map[string]diskInfo lastPvStatsMap atomic.Pointer[map[uint64]*blockdevice.Diskstats] diskStats *ProcDiskStats - clientSet *kubernetes.Clientset + client kubernetes.Interface recorder record.EventRecorder mounter mount.Interface nodeName string @@ -120,12 +125,7 @@ func getDiskCapacityThreshold() float64 { // NewDiskStatCollector returns a new Collector exposing disk stats. func NewDiskStatCollector() (Collector, error) { recorder := utils.NewEventRecorder() - config, err := options.GetRestConfig() - if err != nil { - return nil, err - } - // creates the clientset - clientset, err := kubernetes.NewForConfig(config) + client, err := newK8sClient() if err != nil { return nil, err } @@ -140,7 +140,7 @@ func NewDiskStatCollector() (Collector, error) { return &diskStatCollector{ lastPvDiskInfoMap: make(map[string]diskInfo, 0), diskStats: diskStats, - clientSet: clientset, + client: client, milliSecondsLatencyThreshold: getDiskLatencyThreshold(), capacityPercentageThreshold: getDiskCapacityThreshold(), recorder: recorder, @@ -149,17 +149,17 @@ func NewDiskStatCollector() (Collector, error) { }, nil } -func (p *diskStatCollector) Update(ch chan<- prometheus.Metric) error { - //startTime := time.Now() +func (p *diskStatCollector) Get() (metrics []*Metric) { volJSONPaths, err := findVolJSON(podsRootPath) if err != nil { - return err + return } p.updateMap(&p.lastPvDiskInfoMap, volJSONPaths, diskDriverName) diskStats, err := p.diskStats.GetStats() if err != nil { - return fmt.Errorf("couldn't get diskstats: %s", err) + klog.Errorf("couldn't get diskstats: %s", err) + return } deviceNameStatsMap := make(map[uint64]*blockdevice.Diskstats, len(diskStats)) for i, s := range diskStats { @@ -189,8 +189,8 @@ func (p *diskStatCollector) Update(ch chan<- prometheus.Metric) error { continue } devPath := "/dev/" + stats.DeviceName - labels := []string{info.PVCRef.Namespace, info.PVCRef.Name, devPath} - p.sendDiskStats(&stats.IOStats, labels, ch) + labelValues := []string{info.PVCRef.Namespace, info.PVCRef.Name, devPath} + metrics = append(metrics, p.getDiskStatMetrics(&stats.IOStats, labelValues...)...) if lastStats, ok := lastStatsMap[info.Dev]; ok { p.latencyEventAlert(&stats.IOStats, &lastStats.IOStats, info.PVCRef) } @@ -199,16 +199,23 @@ func (p *diskStatCollector) Update(ch chan<- prometheus.Metric) error { pvName, p.nodeName, info.DiskID, devPath) } - capStats, err := getDiskCapacityMetric(info.DiskID) + capStats, err := getDiskCapacityStats(info.DiskID) if err != nil { klog.ErrorS(err, "Get disk capacity failed", "disk", info.DiskID, err) continue } - p.sendCapStats(capStats, labels, ch) + metrics = append(metrics, p.getDiskCapacityMetrics(capStats, labelValues)...) p.capacityEventAlert(capStats, info.PVCRef) } - //elapsedTime := time.Since(startTime) - //logrus.Info("DiskStat spent time:", elapsedTime) + + return metrics +} + +func (p *diskStatCollector) Update(ch chan<- prometheus.Metric) error { + metrics := p.Get() + for _, metric := range metrics { + ch <- prometheus.MustNewConstMetric(metric.Desc, metric.ValueType, metric.Value, convertLabelsToString(metric.VariableLabelPairs)...) + } return nil } @@ -247,28 +254,29 @@ func (p *diskStatCollector) capacityEventAlert(usage []*csi.VolumeUsage, ref *v1 } } -func (p *diskStatCollector) sendDiskStats(stats *blockdevice.IOStats, labels []string, ch chan<- prometheus.Metric) { - ch <- prometheus.MustNewConstMetric(diskReadsCompletedDesc, prometheus.CounterValue, float64(stats.ReadIOs), labels...) - ch <- prometheus.MustNewConstMetric(diskReadsMergeDesc, prometheus.CounterValue, float64(stats.ReadMerges), labels...) - ch <- prometheus.MustNewConstMetric(diskReadBytesDesc, prometheus.CounterValue, float64(stats.ReadSectors)*diskSectorSize, labels...) - ch <- prometheus.MustNewConstMetric(diskReadTimeMilliSecondsDesc, prometheus.CounterValue, float64(stats.ReadTicks), labels...) - - ch <- prometheus.MustNewConstMetric(diskWritesCompletedDesc, prometheus.CounterValue, float64(stats.WriteIOs), labels...) - ch <- prometheus.MustNewConstMetric(diskWriteMergeDesc, prometheus.CounterValue, float64(stats.WriteMerges), labels...) - ch <- prometheus.MustNewConstMetric(diskWrittenBytesDesc, prometheus.CounterValue, float64(stats.WriteSectors)*diskSectorSize, labels...) - ch <- prometheus.MustNewConstMetric(diskWriteTimeMilliSecondsDesc, prometheus.CounterValue, float64(stats.WriteTicks), labels...) - - ch <- prometheus.MustNewConstMetric(diskIONowDesc, prometheus.GaugeValue, float64(stats.IOsInProgress), labels...) - ch <- prometheus.MustNewConstMetric(diskIOTimeSecondsDesc, prometheus.CounterValue, float64(stats.IOsTotalTicks)/1000, labels...) +func (p *diskStatCollector) getDiskStatMetrics(stats *blockdevice.IOStats, labelValues ...string) []*Metric { + return []*Metric{ + MustNewMetricWithMetaDesc(diskReadsCompletedDesc, float64(stats.ReadIOs), prometheus.CounterValue, labelValues...), + MustNewMetricWithMetaDesc(diskReadsMergeDesc, float64(stats.ReadMerges), prometheus.CounterValue, labelValues...), + MustNewMetricWithMetaDesc(diskReadBytesDesc, float64(stats.ReadSectors)*diskSectorSize, prometheus.CounterValue, labelValues...), + MustNewMetricWithMetaDesc(diskReadTimeMilliSecondsDesc, float64(stats.ReadTicks), prometheus.CounterValue, labelValues...), + MustNewMetricWithMetaDesc(diskWritesCompletedDesc, float64(stats.WriteIOs), prometheus.CounterValue, labelValues...), + MustNewMetricWithMetaDesc(diskWriteMergeDesc, float64(stats.WriteMerges), prometheus.CounterValue, labelValues...), + MustNewMetricWithMetaDesc(diskWrittenBytesDesc, float64(stats.WriteSectors)*diskSectorSize, prometheus.CounterValue, labelValues...), + MustNewMetricWithMetaDesc(diskWriteTimeMilliSecondsDesc, float64(stats.WriteTicks), prometheus.CounterValue, labelValues...), + MustNewMetricWithMetaDesc(diskIONowDesc, float64(stats.IOsInProgress), prometheus.GaugeValue, labelValues...), + MustNewMetricWithMetaDesc(diskIOTimeSecondsDesc, float64(stats.IOsTotalTicks)/1000, prometheus.CounterValue, labelValues...), + } } -func (p *diskStatCollector) sendCapStats(stats []*csi.VolumeUsage, labels []string, ch chan<- prometheus.Metric) { +func (p *diskStatCollector) getDiskCapacityMetrics(stats []*csi.VolumeUsage, labelValues []string) (metrics []*Metric) { for _, stat := range stats { descs := capStatDescs[stat.Unit] - ch <- prometheus.MustNewConstMetric(descs.Available, prometheus.GaugeValue, float64(stat.Available), labels...) - ch <- prometheus.MustNewConstMetric(descs.Total, prometheus.GaugeValue, float64(stat.Total), labels...) - ch <- prometheus.MustNewConstMetric(descs.Used, prometheus.GaugeValue, float64(stat.Used), labels...) + metrics = append(metrics, MustNewMetricWithMetaDesc(descs.Available, float64(stat.Available), prometheus.GaugeValue, labelValues...)) + metrics = append(metrics, MustNewMetricWithMetaDesc(descs.Total, float64(stat.Total), prometheus.GaugeValue, labelValues...)) + metrics = append(metrics, MustNewMetricWithMetaDesc(descs.Used, float64(stat.Used), prometheus.GaugeValue, labelValues...)) } + return } func (p *diskStatCollector) updateMap(lastPvDiskInfoMap *map[string]diskInfo, jsonPaths []string, driverName string) { @@ -321,7 +329,7 @@ func (p *diskStatCollector) updateDiskInfoMap(thisPvDiskInfoMap map[string]diskI lastInfo, ok := (*lastPvDiskInfoMap)[pv] // add and modify if !ok || thisInfo.VolDataPath != lastInfo.VolDataPath { - pvcRef, err := getDiskPvcByPvName(p.clientSet, pv) + pvcRef, err := getDiskPvcByPvName(p.client, pv, thisInfo.VolDataPath) if err != nil { continue } @@ -343,7 +351,7 @@ func (p *diskStatCollector) updateDiskInfoMap(thisPvDiskInfoMap map[string]diskI } } -func getDiskCapacityMetric(diskID string) ([]*csi.VolumeUsage, error) { +func getDiskCapacityStats(diskID string) ([]*csi.VolumeUsage, error) { globalMountPath := getGlobalMountPathByDiskID(diskID) response, err := utils.GetMetrics(globalMountPath) if err != nil { diff --git a/pkg/metric/fuse_stat_collector.go b/pkg/metric/fuse_stat_collector.go index 15ec49352d..6bec8770ce 100644 --- a/pkg/metric/fuse_stat_collector.go +++ b/pkg/metric/fuse_stat_collector.go @@ -30,300 +30,72 @@ var ( "exit_reason"} ) +func fuseMetricDesc(name, help string) *MetaDesc { + return NewMetaDesc(nodeNamespace, volumeSubsystem, name, help, usFsStatLabelNames, nil) +} + var ( - capacityBytesUsedCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "capacity_bytes_used_counter"), - ".", - usFsStatLabelNames, nil, - ) - capacityBytesAvailableCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "capacity_bytes_available_counter"), - ".", - usFsStatLabelNames, nil, - ) - capacityBytesTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "capacity_bytes_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - inodeBytesUsedCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "inode_bytes_used_counter"), - ".", - usFsStatLabelNames, nil, - ) - inodeBytesAvailableCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "inode_bytes_available_counter"), - ".", - usFsStatLabelNames, nil, - ) - inodeBytesTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "inode_bytes_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - readBytesTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_bytes_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - writeBytesTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_bytes_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - readCompletedTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_completed_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - writeCompletedTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_completed_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - readTimeMillisecondsTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_time_milliseconds_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - writeTimeMillisecondsTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_time_milliseconds_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixMkdirTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_mkdir_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixRmdirTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_rmdir_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixOpendirTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_opendir_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixReaddirTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_readdir_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixWriteTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_write_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixFlushTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_flush_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixFsyncTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_fsync_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixReleaseTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_release_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixReadTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_read_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixCreateTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_create_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixOpenTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_open_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixAccessTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_access_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixRenameTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_rename_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixChownTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_chown_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixChmodTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_chmod_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - posixTruncateTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "posix_truncate_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - ossPutObjectTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "oss_put_object_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - ossGetObjectTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "oss_get_object_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - ossHeadObjectTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "oss_head_object_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - ossDeleteObjectTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "oss_delete_object_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - ossPostObjectTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "oss_post_object_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - hotSpotReadFileTopDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "hot_spot_read_file_top"), - ".", - usFsStatLabelNames, nil, - ) - hotSpotWriteFileTopDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "hot_spot_write_file_top"), - ".", - usFsStatLabelNames, nil, - ) - hotSpotHeadFileTopDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "hot_spot_head_file_top"), - ".", - usFsStatLabelNames, nil, - ) + capacityBytesUsedCounterDesc = fuseMetricDesc("capacity_bytes_used_counter", ".") + capacityBytesAvailableCounterDesc = fuseMetricDesc("capacity_bytes_available_counter", ".") + capacityBytesTotalCounterDesc = fuseMetricDesc("capacity_bytes_total_counter", ".") + inodeBytesUsedCounterDesc = fuseMetricDesc("inode_bytes_used_counter", ".") + inodeBytesAvailableCounterDesc = fuseMetricDesc("inode_bytes_available_counter", ".") + inodeBytesTotalCounterDesc = fuseMetricDesc("inode_bytes_total_counter", ".") + readBytesTotalCounterDesc = fuseMetricDesc("read_bytes_total_counter", ".") + writeBytesTotalCounterDesc = fuseMetricDesc("write_bytes_total_counter", ".") + readCompletedTotalCounterDesc = fuseMetricDesc("read_completed_total_counter", ".") + writeCompletedTotalCounterDesc = fuseMetricDesc("write_completed_total_counter", ".") + readTimeMillisecondsTotalCounterDesc = fuseMetricDesc("read_time_milliseconds_total_counter", ".") + writeTimeMillisecondsTotalCounterDesc = fuseMetricDesc("write_time_milliseconds_total_counter", ".") + posixMkdirTotalCounterDesc = fuseMetricDesc("posix_mkdir_total_counter", ".") + posixRmdirTotalCounterDesc = fuseMetricDesc("posix_rmdir_total_counter", ".") + posixOpendirTotalCounterDesc = fuseMetricDesc("posix_opendir_total_counter", ".") + posixReaddirTotalCounterDesc = fuseMetricDesc("posix_readdir_total_counter", ".") + posixWriteTotalCounterDesc = fuseMetricDesc("posix_write_total_counter", ".") + posixFlushTotalCounterDesc = fuseMetricDesc("posix_flush_total_counter", ".") + posixFsyncTotalCounterDesc = fuseMetricDesc("posix_fsync_total_counter", ".") + posixReleaseTotalCounterDesc = fuseMetricDesc("posix_release_total_counter", ".") + posixReadTotalCounterDesc = fuseMetricDesc("posix_read_total_counter", ".") + posixCreateTotalCounterDesc = fuseMetricDesc("posix_create_total_counter", ".") + posixOpenTotalCounterDesc = fuseMetricDesc("posix_open_total_counter", ".") + posixAccessTotalCounterDesc = fuseMetricDesc("posix_access_total_counter", ".") + posixRenameTotalCounterDesc = fuseMetricDesc("posix_rename_total_counter", ".") + posixChownTotalCounterDesc = fuseMetricDesc("posix_chown_total_counter", ".") + posixChmodTotalCounterDesc = fuseMetricDesc("posix_chmod_total_counter", ".") + posixTruncateTotalCounterDesc = fuseMetricDesc("posix_truncate_total_counter", ".") + ossPutObjectTotalCounterDesc = fuseMetricDesc("oss_put_object_total_counter", ".") + ossGetObjectTotalCounterDesc = fuseMetricDesc("oss_get_object_total_counter", ".") + ossHeadObjectTotalCounterDesc = fuseMetricDesc("oss_head_object_total_counter", ".") + ossDeleteObjectTotalCounterDesc = fuseMetricDesc("oss_delete_object_total_counter", ".") + ossPostObjectTotalCounterDesc = fuseMetricDesc("oss_post_object_total_counter", ".") + hotSpotReadFileTopDesc = fuseMetricDesc("hot_spot_read_file_top", ".") + hotSpotWriteFileTopDesc = fuseMetricDesc("hot_spot_write_file_top", ".") + hotSpotHeadFileTopDesc = fuseMetricDesc("hot_spot_head_file_top", ".") ) var ( - backendReadBytesTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_read_bytes_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendWriteBytesTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_write_bytes_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendReadCompletedTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_read_completed_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendWriteCompletedTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_write_completed_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendReadTimeMillisecondsTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_read_time_milliseconds_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendWriteTimeMillisecondsTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_write_time_milliseconds_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixGetAttrTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_getattr_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixGetModeTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_getmode_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixAccessTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_access_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixLookupTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_lookup_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixMknodTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_mknod_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixRemoveTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_remove_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixSetAttrTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_setattr_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixLinkTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_link_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixReadLinkTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_readlink_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixStatfsTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_statfs_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixRenameTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_rename_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - backendPosixReaddirTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "backend_posix_readdir_total_counter"), - ".", - usFsStatLabelNames, nil, - ) - mountRetryTotalCounterDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, utils.MetricsMountRetryCount), - ".", - usFsStatLabelNames, nil, - ) - mountPointStatusDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, utils.MetricsMountPointStatus), - ".", - usFsStatLabelNames, nil, - ) - mountPointFailoverTotalCountDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, utils.MetricsMountPointFailoverCount), - ".", - usFsStatLabelNames, nil, - ) - lastFuseClientExitReasonDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, utils.MetricsLastFuseClientExitReason), - ".", - usFsStatLabelNames, nil, - ) + backendReadBytesTotalCounterDesc = fuseMetricDesc("backend_read_bytes_total_counter", ".") + backendWriteBytesTotalCounterDesc = fuseMetricDesc("backend_write_bytes_total_counter", ".") + backendReadCompletedTotalCounterDesc = fuseMetricDesc("backend_read_completed_total_counter", ".") + backendWriteCompletedTotalCounterDesc = fuseMetricDesc("backend_write_completed_total_counter", ".") + backendReadTimeMillisecondsTotalCounterDesc = fuseMetricDesc("backend_read_time_milliseconds_total_counter", ".") + backendWriteTimeMillisecondsTotalCounterDesc = fuseMetricDesc("backend_write_time_milliseconds_total_counter", ".") + backendPosixGetAttrTotalCounterDesc = fuseMetricDesc("backend_posix_getattr_total_counter", ".") + backendPosixGetModeTotalCounterDesc = fuseMetricDesc("backend_posix_getmode_total_counter", ".") + backendPosixAccessTotalCounterDesc = fuseMetricDesc("backend_posix_access_total_counter", ".") + backendPosixLookupTotalCounterDesc = fuseMetricDesc("backend_posix_lookup_total_counter", ".") + backendPosixMknodTotalCounterDesc = fuseMetricDesc("backend_posix_mknod_total_counter", ".") + backendPosixRemoveTotalCounterDesc = fuseMetricDesc("backend_posix_remove_total_counter", ".") + backendPosixSetAttrTotalCounterDesc = fuseMetricDesc("backend_posix_setattr_total_counter", ".") + backendPosixLinkTotalCounterDesc = fuseMetricDesc("backend_posix_link_total_counter", ".") + backendPosixReadLinkTotalCounterDesc = fuseMetricDesc("backend_posix_readlink_total_counter", ".") + backendPosixStatfsTotalCounterDesc = fuseMetricDesc("backend_posix_statfs_total_counter", ".") + backendPosixRenameTotalCounterDesc = fuseMetricDesc("backend_posix_rename_total_counter", ".") + backendPosixReaddirTotalCounterDesc = fuseMetricDesc("backend_posix_readdir_total_counter", ".") + mountRetryTotalCounterDesc = fuseMetricDesc(utils.MetricsMountRetryCount, ".") + mountPointStatusDesc = fuseMetricDesc(utils.MetricsMountPointStatus, ".") + mountPointFailoverTotalCountDesc = fuseMetricDesc(utils.MetricsMountPointFailoverCount, ".") + lastFuseClientExitReasonDesc = fuseMetricDesc(utils.MetricsLastFuseClientExitReason, ".") ) type fuseInfo struct { @@ -353,9 +125,9 @@ func (p *usFsStatCollector) getCommonLabels(fsClientInfo *fuseInfo, fileName, ex } type usFsStatCollector struct { - hotSpotReadFileTop *prometheus.Desc - hotSpotWriteFileTop *prometheus.Desc - hotSpotHeadFileTop *prometheus.Desc + hotSpotReadFileTop *MetaDesc + hotSpotWriteFileTop *MetaDesc + hotSpotHeadFileTop *MetaDesc capacityBytesCounterDesc capacityBytesCounterDesc inodeBytesCounterDesc inodeBytesCounterDesc throughputBytesCounterDesc throughputBytesCounterDesc @@ -417,7 +189,7 @@ func init() { registerCollector("fuse_stat", NewFuseStatCollector, ossDriverName, nasDriverName, bmcpfsDriverName) } -// NewUsFsStatCollector returns a new Collector exposing user space fs stats. +// NewFuseStatCollector returns a new Collector exposing user space fs stats. func NewFuseStatCollector() (Collector, error) { return &usFsStatCollector{ hotSpotReadFileTop: hotSpotReadFileTopDesc, @@ -425,107 +197,107 @@ func NewFuseStatCollector() (Collector, error) { hotSpotHeadFileTop: hotSpotHeadFileTopDesc, capacityBytesCounterDesc: capacityBytesCounterDesc{ descs: []typedFactorDesc{ - {desc: capacityBytesUsedCounterDesc, valueType: prometheus.CounterValue}, - {desc: capacityBytesAvailableCounterDesc, valueType: prometheus.CounterValue}, - {desc: capacityBytesTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: capacityBytesUsedCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: capacityBytesAvailableCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: capacityBytesTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, inodeBytesCounterDesc: inodeBytesCounterDesc{ descs: []typedFactorDesc{ - {desc: inodeBytesUsedCounterDesc, valueType: prometheus.CounterValue}, - {desc: inodeBytesAvailableCounterDesc, valueType: prometheus.CounterValue}, - {desc: inodeBytesTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: inodeBytesUsedCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: inodeBytesAvailableCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: inodeBytesTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, throughputBytesCounterDesc: throughputBytesCounterDesc{ descs: []typedFactorDesc{ - {desc: readBytesTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: writeBytesTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: readBytesTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: writeBytesTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, iopsCompletedCounterDesc: iopsCompletedCounterDesc{ descs: []typedFactorDesc{ - {desc: readCompletedTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: writeCompletedTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: readCompletedTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: writeCompletedTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, latencyMillisecondsCounterDesc: latencyMillisecondsCounterDesc{ descs: []typedFactorDesc{ - {desc: readTimeMillisecondsTotalCounterDesc, valueType: prometheus.CounterValue, factor: .001}, - {desc: writeTimeMillisecondsTotalCounterDesc, valueType: prometheus.CounterValue, factor: .001}, + {MetaDesc: readTimeMillisecondsTotalCounterDesc, valueType: prometheus.CounterValue, factor: .001}, + {MetaDesc: writeTimeMillisecondsTotalCounterDesc, valueType: prometheus.CounterValue, factor: .001}, }, }, posixCounterDesc: posixCounterDesc{ descs: []typedFactorDesc{ - {desc: posixMkdirTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixRmdirTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixOpendirTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixReaddirTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixWriteTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixFlushTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixFsyncTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixReleaseTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixReadTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixCreateTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixOpenTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixAccessTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixRenameTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixChownTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixChmodTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: posixTruncateTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixMkdirTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixRmdirTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixOpendirTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixReaddirTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixWriteTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixFlushTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixFsyncTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixReleaseTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixReadTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixCreateTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixOpenTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixAccessTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixRenameTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixChownTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixChmodTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: posixTruncateTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, ossObjectCounterDesc: ossObjectCounterDesc{ descs: []typedFactorDesc{ - {desc: ossPutObjectTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: ossGetObjectTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: ossHeadObjectTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: ossDeleteObjectTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: ossPostObjectTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: ossPutObjectTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: ossGetObjectTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: ossHeadObjectTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: ossDeleteObjectTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: ossPostObjectTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, backendThroughputBytesCounterDesc: backendThroughputBytesCounterDesc{ descs: []typedFactorDesc{ - {desc: backendReadBytesTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendWriteBytesTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendReadBytesTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendWriteBytesTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, backendIOPSCompletedCounterDesc: backendIOPSCompletedCounterDesc{ descs: []typedFactorDesc{ - {desc: backendReadCompletedTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendWriteCompletedTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendReadCompletedTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendWriteCompletedTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, backendLatencyMillisecondsCounterDesc: backendLatencyMillisecondsCounterDesc{ descs: []typedFactorDesc{ - {desc: backendReadTimeMillisecondsTotalCounterDesc, valueType: prometheus.CounterValue, factor: .001}, - {desc: backendWriteTimeMillisecondsTotalCounterDesc, valueType: prometheus.CounterValue, factor: .001}, + {MetaDesc: backendReadTimeMillisecondsTotalCounterDesc, valueType: prometheus.CounterValue, factor: .001}, + {MetaDesc: backendWriteTimeMillisecondsTotalCounterDesc, valueType: prometheus.CounterValue, factor: .001}, }, }, backendPosixCounterDesc: backendPosixCounterDesc{ descs: []typedFactorDesc{ - {desc: backendPosixGetAttrTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixGetModeTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixAccessTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixLookupTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixMknodTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixRemoveTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixSetAttrTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixLinkTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixReadLinkTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixStatfsTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixRenameTotalCounterDesc, valueType: prometheus.CounterValue}, - {desc: backendPosixReaddirTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixGetAttrTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixGetModeTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixAccessTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixLookupTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixMknodTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixRemoveTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixSetAttrTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixLinkTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixReadLinkTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixStatfsTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixRenameTotalCounterDesc, valueType: prometheus.CounterValue}, + {MetaDesc: backendPosixReaddirTotalCounterDesc, valueType: prometheus.CounterValue}, }, }, - mountRetryTotalCounter: &typedFactorDesc{desc: mountRetryTotalCounterDesc, valueType: prometheus.CounterValue}, - mountPointStatus: &typedFactorDesc{desc: mountPointStatusDesc, valueType: prometheus.GaugeValue}, - mountPointFailoverTotalCounter: &typedFactorDesc{desc: mountPointFailoverTotalCountDesc, valueType: prometheus.CounterValue}, - lastFuseClientExitReason: &typedFactorDesc{desc: lastFuseClientExitReasonDesc, valueType: prometheus.GaugeValue}, + mountRetryTotalCounter: &typedFactorDesc{MetaDesc: mountRetryTotalCounterDesc, valueType: prometheus.CounterValue}, + mountPointStatus: &typedFactorDesc{MetaDesc: mountPointStatusDesc, valueType: prometheus.GaugeValue}, + mountPointFailoverTotalCounter: &typedFactorDesc{MetaDesc: mountPointFailoverTotalCountDesc, valueType: prometheus.CounterValue}, + lastFuseClientExitReason: &typedFactorDesc{MetaDesc: lastFuseClientExitReasonDesc, valueType: prometheus.GaugeValue}, }, nil } -func (p *usFsStatCollector) postHotTopFileMetrics(hotSpotType string, fsClientInfo *fuseInfo, metricsArray []string, ch chan<- prometheus.Metric) { +func (p *usFsStatCollector) postHotTopFileMetrics(hotSpotType string, fsClientInfo *fuseInfo, metricsArray []string) (metrics []*Metric) { for _, metricsValue := range metricsArray { start := strings.LastIndex(metricsValue, ":") if start == -1 { @@ -540,18 +312,19 @@ func (p *usFsStatCollector) postHotTopFileMetrics(hotSpotType string, fsClientIn labels := p.getCommonLabels(fsClientInfo, fileName, "") switch hotSpotType { case "hot_spot_read_file_top": - ch <- prometheus.MustNewConstMetric(p.hotSpotReadFileTop, prometheus.GaugeValue, valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithMetaDesc(p.hotSpotReadFileTop, valueFloat64, prometheus.GaugeValue, labels...)) case "hot_spot_write_file_top": - ch <- prometheus.MustNewConstMetric(p.hotSpotWriteFileTop, prometheus.GaugeValue, valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithMetaDesc(p.hotSpotWriteFileTop, valueFloat64, prometheus.GaugeValue, labels...)) case "hot_spot_head_file_top": - ch <- prometheus.MustNewConstMetric(p.hotSpotHeadFileTop, prometheus.GaugeValue, valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithMetaDesc(p.hotSpotHeadFileTop, valueFloat64, prometheus.GaugeValue, labels...)) default: klog.Errorf("Unknown hotSpotType:%s", hotSpotType) } } + return } -func (p *usFsStatCollector) postCounterMetrics(counterType string, fsClientInfo *fuseInfo, metricsArray []string, ch chan<- prometheus.Metric) { +func (p *usFsStatCollector) postCounterMetrics(counterType string, fsClientInfo *fuseInfo, metricsArray []string) (metrics []*Metric) { if len(metricsArray) == 0 { return } @@ -572,44 +345,46 @@ func (p *usFsStatCollector) postCounterMetrics(counterType string, fsClientInfo if i >= len(p.capacityBytesCounterDesc.descs) { return } - ch <- p.capacityBytesCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.capacityBytesCounterDesc.descs[i], valueFloat64, labels...)) case "inodes_counter": if i >= len(p.inodeBytesCounterDesc.descs) { return } - ch <- p.inodeBytesCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.inodeBytesCounterDesc.descs[i], valueFloat64, labels...)) + case "throughput_counter": if i >= len(p.throughputBytesCounterDesc.descs) { return } - ch <- p.throughputBytesCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.throughputBytesCounterDesc.descs[i], valueFloat64, labels...)) case "iops_counter": if i >= len(p.iopsCompletedCounterDesc.descs) { return } - ch <- p.iopsCompletedCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.iopsCompletedCounterDesc.descs[i], valueFloat64, labels...)) case "latency_counter": if i >= len(p.latencyMillisecondsCounterDesc.descs) { return } - ch <- p.latencyMillisecondsCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.latencyMillisecondsCounterDesc.descs[i], valueFloat64, labels...)) case "posix_counter": if i >= len(p.posixCounterDesc.descs) { return } - ch <- p.posixCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.posixCounterDesc.descs[i], valueFloat64, labels...)) case "oss_object_counter": if i >= len(p.ossObjectCounterDesc.descs) { return } - ch <- p.ossObjectCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.ossObjectCounterDesc.descs[i], valueFloat64, labels...)) default: klog.Errorf("Unknown counterType:%s", counterType) } } + return } -func (p *usFsStatCollector) postBackendCounterMetrics(counterType string, fsClientInfo *fuseInfo, metricsArray []string, ch chan<- prometheus.Metric) { +func (p *usFsStatCollector) postBackendCounterMetrics(counterType string, fsClientInfo *fuseInfo, metricsArray []string) (metrics []*Metric) { if len(metricsArray) == 0 { return } @@ -630,34 +405,35 @@ func (p *usFsStatCollector) postBackendCounterMetrics(counterType string, fsClie if i >= len(p.backendIOPSCompletedCounterDesc.descs) { return } - ch <- p.backendIOPSCompletedCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.backendIOPSCompletedCounterDesc.descs[i], valueFloat64, labels...)) case "backend_latency_counter": if i >= len(p.backendLatencyMillisecondsCounterDesc.descs) { return } - ch <- p.backendLatencyMillisecondsCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.backendLatencyMillisecondsCounterDesc.descs[i], valueFloat64, labels...)) case "backend_meta_qps_ounter": if i >= len(p.backendPosixCounterDesc.descs) { return } - ch <- p.backendPosixCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.backendPosixCounterDesc.descs[i], valueFloat64, labels...)) case "backend_throughput_counter": if i >= len(p.backendThroughputBytesCounterDesc.descs) { return } - ch <- p.backendThroughputBytesCounterDesc.descs[i].mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.backendThroughputBytesCounterDesc.descs[i], valueFloat64, labels...)) default: klog.Errorf("Unknown counterType:%s", counterType) } } + return } -func (p *usFsStatCollector) postMountPointStatusMetrics(statusType string, fsClientInfo *fuseInfo, metricsArray []string, ch chan<- prometheus.Metric) { +func (p *usFsStatCollector) postMountPointStatusMetrics(statusType string, fsClientInfo *fuseInfo, metricsArray []string) (metrics []*Metric) { var err error for _, value := range metricsArray { if statusType == utils.MetricsLastFuseClientExitReason { labels := p.getCommonLabels(fsClientInfo, "", value) - ch <- p.lastFuseClientExitReason.mustNewConstMetric(1, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(*p.lastFuseClientExitReason, 1, labels...)) continue } @@ -671,13 +447,14 @@ func (p *usFsStatCollector) postMountPointStatusMetrics(statusType string, fsCli labels := p.getCommonLabels(fsClientInfo, "", "") switch statusType { case utils.MetricsMountRetryCount: - ch <- p.mountRetryTotalCounter.mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(*p.mountRetryTotalCounter, valueFloat64, labels...)) case utils.MetricsMountPointStatus: - ch <- p.mountPointStatus.mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(*p.mountPointStatus, valueFloat64, labels...)) case utils.MetricsMountPointFailoverCount: - ch <- p.mountPointFailoverTotalCounter.mustNewConstMetric(valueFloat64, labels...) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(*p.mountPointFailoverTotalCounter, valueFloat64, labels...)) } } + return } var ( @@ -701,9 +478,17 @@ func getFuseMetricDirs() map[string]string { } func (p *usFsStatCollector) Update(ch chan<- prometheus.Metric) error { - metricDirs := getFuseMetricDirs() + metrics := p.Get() + for _, metric := range metrics { + ch <- prometheus.MustNewConstMetric(metric.Desc, metric.ValueType, metric.Value, convertLabelsToString(metric.VariableLabelPairs)...) + } + return nil +} + +func (p *usFsStatCollector) Get() (metrics []*Metric) { + metricsDir := getFuseMetricDirs() fsClientInfo := new(fuseInfo) - for _, dir := range metricDirs { + for _, dir := range metricsDir { if !utils.IsFileExisting(dir) { continue } @@ -716,17 +501,17 @@ func (p *usFsStatCollector) Update(ch chan<- prometheus.Metric) error { for _, subdir := range subdirs { if utils.IsFileExisting(filepath.Join(dir, subdir, utils.PodInfoFile)) { // exclusive metrics case, subdir is podUid - p.updateExclusiveMetrics(dir, subdir, fsClientInfo, ch) + metrics = append(metrics, p.updateExclusiveMetrics(dir, subdir, fsClientInfo)...) continue } // shared metrics case, subdir is sha256(pvname) - p.updateSharedMetrics(dir, subdir, fsClientInfo, ch) + metrics = append(metrics, p.updateSharedMetrics(dir, subdir, fsClientInfo)...) } } - return nil + return } -func (p *usFsStatCollector) updateExclusiveMetrics(fuseMetricsDir, podUid string, fsClientInfo *fuseInfo, ch chan<- prometheus.Metric) { +func (p *usFsStatCollector) updateExclusiveMetrics(fuseMetricsDir, podUid string, fsClientInfo *fuseInfo) (metrics []*Metric) { //get pod info podInfoArray, err := readFirstLines(filepath.Join(fuseMetricsDir, podUid, utils.PodInfoFile)) if err != nil { @@ -747,17 +532,18 @@ func (p *usFsStatCollector) updateExclusiveMetrics(fuseMetricsDir, podUid string // foreach volume for _, volume := range volumeArray { volPath := filepath.Join(fuseMetricsDir, podUid, volume) - p.postVolMetrics(volPath, fsClientInfo, ch) + metrics = append(metrics, p.postVolMetrics(volPath, fsClientInfo)...) } + return } -func (p *usFsStatCollector) updateSharedMetrics(fuseMetricsDir, subDir string, fsClientInfo *fuseInfo, ch chan<- prometheus.Metric) { +func (p *usFsStatCollector) updateSharedMetrics(fuseMetricsDir, subDir string, fsClientInfo *fuseInfo) (metrics []*Metric) { // /var/run/fsType/sha256(pvname) volPath := filepath.Join(fuseMetricsDir, subDir) - p.postVolMetrics(volPath, fsClientInfo, ch) + return p.postVolMetrics(volPath, fsClientInfo) } -func (p *usFsStatCollector) postVolMetrics(volPath string, fsClientInfo *fuseInfo, ch chan<- prometheus.Metric) { +func (p *usFsStatCollector) postVolMetrics(volPath string, fsClientInfo *fuseInfo) (metrics []*Metric) { mountPointInfoArray, err := readFirstLines(filepath.Join(volPath, utils.MountPointInfoFile)) if err != nil { return @@ -777,7 +563,7 @@ func (p *usFsStatCollector) postVolMetrics(volPath string, fsClientInfo *fuseInf if err != nil { continue } - p.postCounterMetrics(counterType, fsClientInfo, metricsArray, ch) + metrics = append(metrics, p.postCounterMetrics(counterType, fsClientInfo, metricsArray)...) } // foreach hot_top_file metrics for _, hotSpotType := range hotSpotArray { @@ -785,7 +571,7 @@ func (p *usFsStatCollector) postVolMetrics(volPath string, fsClientInfo *fuseInf if err != nil { continue } - p.postHotTopFileMetrics(hotSpotType, fsClientInfo, metricsArray, ch) + metrics = append(metrics, p.postHotTopFileMetrics(hotSpotType, fsClientInfo, metricsArray)...) } // foreach backend counter metrics for _, backendCounterType := range backendCounterTypeArray { @@ -793,7 +579,7 @@ func (p *usFsStatCollector) postVolMetrics(volPath string, fsClientInfo *fuseInf if err != nil { continue } - p.postBackendCounterMetrics(backendCounterType, fsClientInfo, metricsArray, ch) + metrics = append(metrics, p.postBackendCounterMetrics(backendCounterType, fsClientInfo, metricsArray)...) } // foreach mountpoint status related metrics for _, mountPointStatus := range mountPointStatusArray { @@ -810,6 +596,7 @@ func (p *usFsStatCollector) postVolMetrics(volPath string, fsClientInfo *fuseInf continue } } - p.postMountPointStatusMetrics(mountPointStatus, fsClientInfo, metricsArray, ch) + metrics = append(metrics, p.postMountPointStatusMetrics(mountPointStatus, fsClientInfo, metricsArray)...) } + return } diff --git a/pkg/metric/kubelet_stats_summary_collector.go b/pkg/metric/kubelet_stats_summary_collector.go index e239b6df64..6668d5712d 100644 --- a/pkg/metric/kubelet_stats_summary_collector.go +++ b/pkg/metric/kubelet_stats_summary_collector.go @@ -2,7 +2,6 @@ package metric import ( "encoding/json" - "fmt" "io" "net/http" "time" @@ -10,6 +9,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "k8s.io/client-go/rest" "k8s.io/client-go/transport" + "k8s.io/klog/v2" statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" ) @@ -21,15 +21,30 @@ const ( type fsStatsMetric struct { desc *prometheus.Desc + fqName string + help string getValue func(*statsapi.FsStats) *float64 } -func (m *fsStatsMetric) Metric(fsStats *statsapi.FsStats, labels ...string) prometheus.Metric { +func (m *fsStatsMetric) Metric(fsStats *statsapi.FsStats, labels []string, labelValues ...string) *Metric { value := m.getValue(fsStats) if value == nil { return nil } - return prometheus.MustNewConstMetric(m.desc, prometheus.GaugeValue, *value, labels...) + + desc := &MetaDesc{ + Desc: prometheus.NewDesc(m.fqName, m.help, labels, nil), + FQName: m.fqName, + Help: m.help, + VariableLabels: labels, + } + + metric, err := NewMetric(desc, *value, prometheus.GaugeValue, labelValues...) + if err != nil { + klog.ErrorS(err, "Failed to create metric", "desc", m.desc) + return nil + } + return metric } func uint64ToFloat64(value *uint64) *float64 { @@ -41,37 +56,45 @@ func uint64ToFloat64(value *uint64) *float64 { } func generateFsStatsDescs(namespace, subsystem string, labels []string) []*fsStatsMetric { - return []*fsStatsMetric{ - { - desc: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "inodes_free"), "Number of available Inodes", labels, nil), - getValue: func(fs *statsapi.FsStats) *float64 { return uint64ToFloat64(fs.InodesFree) }, - }, - { - desc: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "inodes_total"), "Total number of Inodes", labels, nil), - getValue: func(fs *statsapi.FsStats) *float64 { return uint64ToFloat64(fs.Inodes) }, - }, - { - desc: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "inodes_used"), "Number of used Inodes", labels, nil), - getValue: func(fs *statsapi.FsStats) *float64 { return uint64ToFloat64(fs.InodesUsed) }, - }, - { - desc: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "limit_bytes"), "Number of bytes that can be consumed by the container on this filesystem", labels, nil), - getValue: func(fs *statsapi.FsStats) *float64 { return uint64ToFloat64(fs.CapacityBytes) }, - }, - { - desc: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "usage_bytes"), "Number of bytes that are consumed by the container on this filesystem", labels, nil), - getValue: func(fs *statsapi.FsStats) *float64 { return uint64ToFloat64(fs.UsedBytes) }, - }, - { - desc: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "available_bytes"), "Number of bytes that not consumed", labels, nil), - getValue: func(fs *statsapi.FsStats) *float64 { return uint64ToFloat64(fs.AvailableBytes) }, - }, + var metrics []*fsStatsMetric + + addMetric := func(name, help string, fn func(*statsapi.FsStats) *float64) { + fqName := prometheus.BuildFQName(namespace, subsystem, name) + metrics = append(metrics, &fsStatsMetric{ + desc: prometheus.NewDesc(fqName, help, labels, nil), + fqName: fqName, + help: help, + getValue: fn, + }) } + + addMetric("inodes_free", "Number of available Inodes", func(fs *statsapi.FsStats) *float64 { + return uint64ToFloat64(fs.InodesFree) + }) + addMetric("inodes_total", "Total number of Inodes", func(fs *statsapi.FsStats) *float64 { + return uint64ToFloat64(fs.Inodes) + }) + addMetric("inodes_used", "Number of used Inodes", func(fs *statsapi.FsStats) *float64 { + return uint64ToFloat64(fs.InodesUsed) + }) + addMetric("limit_bytes", "Number of bytes that can be consumed by the container on this filesystem", func(fs *statsapi.FsStats) *float64 { + return uint64ToFloat64(fs.CapacityBytes) + }) + addMetric("usage_bytes", "Number of bytes that are consumed by the container on this filesystem", func(fs *statsapi.FsStats) *float64 { + return uint64ToFloat64(fs.UsedBytes) + }) + addMetric("available_bytes", "Number of bytes that not consumed", func(fs *statsapi.FsStats) *float64 { + return uint64ToFloat64(fs.AvailableBytes) + }) + + return metrics } var ( - rootfsMetrics = generateFsStatsDescs("container", "fs", []string{"namespace", "pod", "pod_uid", "container"}) - ephemeralStorageMetrics = generateFsStatsDescs("ephemeral_storage", "pod", []string{"namespace", "pod", "pod_uid"}) + rootfsLabels = []string{"namespace", "pod", "pod_uid", "container"} + ephemeralStorageLabels = []string{"namespace", "pod", "pod_uid"} + rootfsMetrics = generateFsStatsDescs("container", "fs", rootfsLabels) + ephemeralStorageMetrics = generateFsStatsDescs("ephemeral_storage", "pod", ephemeralStorageLabels) ) func init() { @@ -83,25 +106,36 @@ type kubeletStatsSummaryCollector struct { } func (c *kubeletStatsSummaryCollector) Update(ch chan<- prometheus.Metric) error { + metrics := c.Get() + for _, metric := range metrics { + ch <- prometheus.MustNewConstMetric(metric.Desc, metric.ValueType, metric.Value, convertLabelsToString(metric.VariableLabelPairs)...) + } + return nil +} + +func (c *kubeletStatsSummaryCollector) Get() (metrics []*Metric) { resp, err := c.client.Get(kubeletStatsSummaryUrl) if err != nil { - return err + klog.ErrorS(err, "Failed to get kubelet stats summary") + return } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) - return fmt.Errorf("failed to get kubelet stats summary: %d: %q", resp.StatusCode, string(body)) + klog.ErrorS(err, "Failed to read get kubelet stats summary", "statusCode", resp.StatusCode, "body", string(body)) + return } var summary statsapi.Summary if err := json.NewDecoder(resp.Body).Decode(&summary); err != nil { - return err + klog.ErrorS(err, "Failed to decode kubelet stats summary") + return } for _, pod := range summary.Pods { if pod.EphemeralStorage != nil { for _, m := range ephemeralStorageMetrics { - metric := m.Metric(pod.EphemeralStorage, pod.PodRef.Namespace, pod.PodRef.Name, pod.PodRef.UID) + metric := m.Metric(pod.EphemeralStorage, ephemeralStorageLabels, pod.PodRef.Namespace, pod.PodRef.Name, pod.PodRef.UID) if metric != nil { - ch <- metric + metrics = append(metrics, metric) } } } @@ -109,15 +143,15 @@ func (c *kubeletStatsSummaryCollector) Update(ch chan<- prometheus.Metric) error for _, container := range pod.Containers { if container.Rootfs != nil { for _, m := range rootfsMetrics { - metric := m.Metric(container.Rootfs, pod.PodRef.Namespace, pod.PodRef.Name, pod.PodRef.UID, container.Name) + metric := m.Metric(container.Rootfs, rootfsLabels, pod.PodRef.Namespace, pod.PodRef.Name, pod.PodRef.UID, container.Name) if metric != nil { - ch <- metric + metrics = append(metrics, metric) } } } } } - return nil + return } func NewKubeletStatsSummaryCollector() (Collector, error) { diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index bc59938b81..00b4faa4fb 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -44,7 +44,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { // NewMetricHandler method returns a promHttp object func NewMetricHandler(driverNames []string, serviceType utils.ServiceType) *Handler { //csi collector singleton - newCSICollector(driverNames, serviceType) + initCSICollector(driverNames, serviceType) return newHandler() } diff --git a/pkg/metric/monitor_client.go b/pkg/metric/monitor_client.go deleted file mode 100644 index 158c669897..0000000000 --- a/pkg/metric/monitor_client.go +++ /dev/null @@ -1,114 +0,0 @@ -package metric - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "net" - "net/http" - "net/url" - "sync" - "time" - - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" - "k8s.io/klog/v2" -) - -const ( - storageMonitorSvcName = "storage-monitor-service" - storageMonitorPort = "11280" - storageMonitorClientTimeout = time.Millisecond * 500 - monitorIPUpdateInterval = time.Minute * 30 -) - -type StorageMonitorClient struct { - sync.RWMutex - clusterIP string - kubeClient kubernetes.Interface -} - -func NewStorageMonitorClient(kubeClient kubernetes.Interface) *StorageMonitorClient { - c := &StorageMonitorClient{ - kubeClient: kubeClient, - } - if err := c.updateClusterIP(); err != nil { - klog.Warningf("failed to get clusterIP of storage-monitor: %v", err) - } - - var ( - lastError error - lastLogTime time.Time - ) - - go func() { - ticker := time.NewTicker(monitorIPUpdateInterval) - defer ticker.Stop() - for range ticker.C { - err := c.updateClusterIP() - if err != nil { - now := time.Now() - // ignore recent and duplicated error - if !(lastError != nil && lastError.Error() == err.Error() && now.Sub(lastLogTime) < monitorIPUpdateInterval*24) { - klog.Warningf("failed to update clusterIP of storage-monitor: %v", err) - lastLogTime = now - } - } - lastError = err - } - }() - return c -} - -func (c *StorageMonitorClient) GetNasCapacityInfo(pv string) (*nfsCapacityInfo, error) { - c.RLock() - clusterIP := c.clusterIP - c.RUnlock() - if clusterIP == "" { - return nil, errors.New("clusterIP not initialized") - } - params := url.Values{"multi-cnfs-nas": []string{pv}} - uri := url.URL{ - Scheme: "http", - Host: net.JoinHostPort(clusterIP, storageMonitorPort), - Path: "/metrics", - RawQuery: params.Encode(), - } - resp, err := (&http.Client{Timeout: storageMonitorClientTimeout}).Get(uri.String()) - if err != nil { - return nil, err - } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("http response status: %s", resp.Status) - } - var capacityInfos map[string]nfsCapacityInfo - err = json.NewDecoder(resp.Body).Decode(&capacityInfos) - if err != nil { - return nil, err - } - capacityInfo, ok := capacityInfos[pv] - if ok { - return &capacityInfo, nil - } - return nil, nil -} - -func (c *StorageMonitorClient) updateClusterIP() error { - svc, err := c.kubeClient.CoreV1().Services("kube-system").Get(context.Background(), storageMonitorSvcName, v1.GetOptions{}) - if err != nil { - return err - } - currentClusterIP := svc.Spec.ClusterIP - if currentClusterIP == "" { - return errors.New("spec.ClusterIP of service is empty") - } - c.Lock() - defer c.Unlock() - if c.clusterIP != currentClusterIP { - klog.Infof("update storage-monitor IP: %s", currentClusterIP) - c.clusterIP = currentClusterIP - } - return nil -} diff --git a/pkg/metric/nfs_stat_collector.go b/pkg/metric/nfs_stat_collector.go index 0b55a88363..0a1d3e0e60 100644 --- a/pkg/metric/nfs_stat_collector.go +++ b/pkg/metric/nfs_stat_collector.go @@ -10,11 +10,8 @@ import ( "strings" "sync" - "github.com/kubernetes-sigs/alibaba-cloud-csi-driver/pkg/options" "github.com/kubernetes-sigs/alibaba-cloud-csi-driver/pkg/utils" "github.com/prometheus/client_golang/prometheus" - v1 "k8s.io/api/core/v1" - "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" @@ -30,132 +27,32 @@ const ( GiBSize = 1024 * 1024 * 1024 ) -var ( - //0 - reads completed successfully - nfsReadsCompletedDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_completed_total"), - "The total number of reads completed successfully.", - nfsStatLabelNames, nil, - ) - //1 - reads transmissions successfully - nfsReadsTransDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_transmissions_total"), - "How many transmissions of this op type have been sent.", - nfsStatLabelNames, nil, - ) - //2 - read timeout - nfsReadsTimeOutDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_timeouts_total"), - "How many timeouts of this op type have occurred.", - nfsStatLabelNames, nil, - ) - //3 - read send bytes - nfsReadsSentBytesDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_sent_bytes_total"), - "How many bytes have been sent for this op type.", - nfsStatLabelNames, nil, - ) - //4 - read recv bytes - nfsReadsRecvBytesDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_bytes_total"), - "The total number of bytes read successfully.", - nfsStatLabelNames, nil, - ) - //5 - read queue time - nfsReadsQueueTimeMilliSecondsDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_queue_time_milliseconds_total"), - "How long ops of this type have waited in queue before being transmitted (microsecond).", - nfsStatLabelNames, nil, - ) - //6 - read rtt time - nfsReadsRttTimeMilliSecondsDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_rtt_time_milliseconds_total"), - "How long the client waited to receive replies of this op type from the server (microsecond).", - nfsStatLabelNames, nil, - ) - //7 - read execute time - nfsReadsExecuteTimeMilliSecondsDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "read_time_milliseconds_total"), - "The total number of milliseconds spent by all reads.", - nfsStatLabelNames, nil, - ) - //8 - writes completed successfully - nfsWritesCompletedDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_completed_total"), - "The total number of writes completed successfully.", - nfsStatLabelNames, nil, - ) - //9 - writes transmissions successfully - nfsWritesTransDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_transmissions_total"), - "How many transmissions of this op type have been sent.", - nfsStatLabelNames, nil, - ) - //10 - writes timeout - nfsWritesTimeOutDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_timeouts_total"), - "How many timeouts of this op type have occurred.", - nfsStatLabelNames, nil, - ) - //11 - writes send bytes - nfsWritesSentBytesDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_bytes_total"), - "The total number of bytes written successfully.", - nfsStatLabelNames, nil, - ) - //12 - writes recv bytes - nfsWritesRecvBytesDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_recv_bytes_total"), - "How many bytes have been received for this op type.", - nfsStatLabelNames, nil, - ) - //13 - writes queue time - nfsWritesQueueTimeMilliSecondsDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_queue_time_milliseconds_total"), - "How long ops of this type have waited in queue before being transmitted (microsecond).", - nfsStatLabelNames, nil, - ) - //14 - writes rtt time - nfsWritesRttTimeMilliSecondsDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_rtt_time_milliseconds_total"), - "How long the client waited to receive replies of this op type from the server (microsecond).", - nfsStatLabelNames, nil, - ) - //15 - writes execute time - nfsWritesExecuteTimeMilliSecondsDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "write_time_milliseconds_total"), - "The total number of milliseconds spent by all writes.", - nfsStatLabelNames, nil, - ) - //16 - capacity available - nfsCapacityAvailableDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "capacity_bytes_available"), - "The number of available size(bytes).", - nfsStatLabelNames, - nil, - ) - - //17 - capacity total - nfsCapacityTotalDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "capacity_bytes_total"), - "The number of total size(bytes).", - nfsStatLabelNames, - nil, - ) +func nfsMetricDesc(name, help string) *MetaDesc { + return NewMetaDesc(nodeNamespace, volumeSubsystem, name, help, nfsStatLabelNames, nil) +} - //18 - capacity used - nfsCapacityUsedDesc = prometheus.NewDesc( - prometheus.BuildFQName(nodeNamespace, volumeSubsystem, "capacity_bytes_used"), - "The number of used size(bytes).", - nfsStatLabelNames, - nil, - ) +var ( + nfsReadsCompletedDesc = nfsMetricDesc("read_completed_total", "The total number of reads completed successfully.") + nfsReadsTransDesc = nfsMetricDesc("read_transmissions_total", "How many transmissions of this op type have been sent.") + nfsReadsTimeoutDesc = nfsMetricDesc("read_timeouts_total", "How many timeouts of this op type have occurred.") + nfsReadsSentBytesDesc = nfsMetricDesc("read_sent_bytes_total", "How many bytes have been sent for this op type.") + nfsReadsRecvBytesDesc = nfsMetricDesc("read_bytes_total", "The total number of bytes read successfully.") + nfsReadsQueueTimeMilliSecondsDesc = nfsMetricDesc("read_queue_time_milliseconds_total", "How long ops of this type have waited in queue before being transmitted (microsecond).") + nfsReadsRttTimeMilliSecondsDesc = nfsMetricDesc("read_rtt_time_milliseconds_total", "How long the client waited to receive replies of this op type from the server (microsecond).") + nfsReadsExecuteTimeMilliSecondsDesc = nfsMetricDesc("read_time_milliseconds_total", "The total number of milliseconds spent by all reads.") + nfsWritesCompletedDesc = nfsMetricDesc("write_completed_total", "The total number of writes completed successfully.") + nfsWritesTransDesc = nfsMetricDesc("write_transmissions_total", "How many transmissions of this op type have been sent.") + nfsWritesTimeoutDesc = nfsMetricDesc("write_timeouts_total", "How many timeouts of this op type have occurred.") + nfsWritesSentBytesDesc = nfsMetricDesc("write_bytes_total", "The total number of bytes written successfully.") + nfsWritesRecvBytesDesc = nfsMetricDesc("write_recv_bytes_total", "How many bytes have been received for this op type.") + nfsWritesQueueTimeMilliSecondsDesc = nfsMetricDesc("write_queue_time_milliseconds_total", "How long ops of this type have waited in queue before being transmitted (microsecond).") + nfsWritesRttTimeMilliSecondsDesc = nfsMetricDesc("write_rtt_time_milliseconds_total", "How long the client waited to receive replies of this op type from the server (microsecond).") + nfsWritesExecuteTimeMilliSecondsDesc = nfsMetricDesc("write_time_milliseconds_total", "The total number of milliseconds spent by all writes.") ) type nfsInfo struct { PvcNamespace string PvcName string - ServerName string VolDataPath string } @@ -169,136 +66,94 @@ type nfsCapacityInfo struct { } type nfsStatCollector struct { - descs []typedFactorDesc - pvInfoLock sync.Mutex - lastPvNfsInfoMap map[string]nfsInfo - lastPvStatsMap sync.Map - clientSet *kubernetes.Clientset - crdClient dynamic.Interface - monitorClient *StorageMonitorClient - recorder record.EventRecorder - capacityPercentageThreshold float64 - mounter mount.Interface + descs []typedFactorDesc + pvInfoLock sync.Mutex + lastPvNfsInfoMap map[string]nfsInfo + lastPvStatsMap sync.Map + client kubernetes.Interface + recorder record.EventRecorder + mounter mount.Interface } func init() { registerCollector("nfs_stat", NewNfsStatCollector, nasDriverName) } -func getNfsCapacityThreshold() float64 { - capacityStr := strings.ToLower(strings.Trim(os.Getenv("NFS_CAPACITY_THRESHOLD_PERCENTAGE"), " ")) - if len(capacityStr) != 0 { - capacity, _ := parseCapacityThreshold(capacityStr, nfsDefaultsCapacityPercentageThreshold) - return capacity - } - return 0 -} - // NewNfsStatCollector returns a new Collector exposing nfs stats. func NewNfsStatCollector() (Collector, error) { - config, err := options.GetRestConfig() - if err != nil { - return nil, err - } recorder := utils.NewEventRecorder() - // creates the clientset - clientset, err := kubernetes.NewForConfig(config) + client, err := newK8sClient() if err != nil { return nil, err } - crdCfg := options.GetRestConfigForCRD(*config) - crdClient, err := dynamic.NewForConfig(crdCfg) - if err != nil { - klog.Fatalf("Failed to create crd client: %v", err) - } - return &nfsStatCollector{ descs: []typedFactorDesc{ //read - {desc: nfsReadsCompletedDesc, valueType: prometheus.CounterValue}, - {desc: nfsReadsTransDesc, valueType: prometheus.CounterValue}, - {desc: nfsReadsTimeOutDesc, valueType: prometheus.CounterValue}, - {desc: nfsReadsSentBytesDesc, valueType: prometheus.CounterValue}, - {desc: nfsReadsRecvBytesDesc, valueType: prometheus.CounterValue}, - {desc: nfsReadsQueueTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, - {desc: nfsReadsRttTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, - {desc: nfsReadsExecuteTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsReadsCompletedDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsReadsTransDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsReadsTimeoutDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsReadsSentBytesDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsReadsRecvBytesDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsReadsQueueTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsReadsRttTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsReadsExecuteTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, //write - {desc: nfsWritesCompletedDesc, valueType: prometheus.CounterValue}, - {desc: nfsWritesTransDesc, valueType: prometheus.CounterValue}, - {desc: nfsWritesTimeOutDesc, valueType: prometheus.CounterValue}, - {desc: nfsWritesSentBytesDesc, valueType: prometheus.CounterValue}, - {desc: nfsWritesRecvBytesDesc, valueType: prometheus.CounterValue}, - {desc: nfsWritesQueueTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, - {desc: nfsWritesRttTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, - {desc: nfsWritesExecuteTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, - // capacity - {desc: nfsCapacityTotalDesc, valueType: prometheus.GaugeValue}, - {desc: nfsCapacityUsedDesc, valueType: prometheus.GaugeValue}, - {desc: nfsCapacityAvailableDesc, valueType: prometheus.GaugeValue}, + {MetaDesc: nfsWritesCompletedDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsWritesTransDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsWritesTimeoutDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsWritesSentBytesDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsWritesRecvBytesDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsWritesQueueTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsWritesRttTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, + {MetaDesc: nfsWritesExecuteTimeMilliSecondsDesc, valueType: prometheus.CounterValue}, }, - lastPvNfsInfoMap: make(map[string]nfsInfo, 0), - lastPvStatsMap: sync.Map{}, - clientSet: clientset, - crdClient: crdClient, - recorder: recorder, - monitorClient: NewStorageMonitorClient(clientset), - capacityPercentageThreshold: getNfsCapacityThreshold(), - mounter: mount.NewWithoutSystemd(""), + lastPvNfsInfoMap: make(map[string]nfsInfo, 0), + lastPvStatsMap: sync.Map{}, + client: client, + recorder: recorder, + mounter: mount.NewWithoutSystemd(""), }, nil } func (p *nfsStatCollector) Update(ch chan<- prometheus.Metric) error { - //startTime := time.Now() - pvNameStatsMap, err := getNfsStat() + metrics := p.Get() + for _, metric := range metrics { + ch <- prometheus.MustNewConstMetric(metric.Desc, metric.ValueType, metric.Value, convertLabelsToString(metric.VariableLabelPairs)...) + } + return nil +} + +func (p *nfsStatCollector) Get() (metrics []*Metric) { + pvNameServerMap, pvNameStatsMap, err := getNfsStat() if len(pvNameStatsMap) == 0 { - return nil + klog.V(2).InfoS("No nfs stats found") + return } if err != nil { - return fmt.Errorf("couldn't get nfsstats: %s", err) + klog.ErrorS(err, "couldn't get nfsstats") + return } volJSONPaths, err := findVolJSON(podsRootPath) if err != nil { - return err + klog.ErrorS(err, "couldn't find vol json", "podRootPath", podsRootPath) + return } - //log.Infof("volJSONPaths:%+v", volJSONPaths) p.updateMap(&p.lastPvNfsInfoMap, volJSONPaths, nasDriverName) - //log.Infof("lastPvNfsInfoMap:%+v", p.lastPvNfsInfoMap) - wg := sync.WaitGroup{} for pvName, stats := range pvNameStatsMap { nfsInfo := p.lastPvNfsInfoMap[pvName] - //klog.Infof("pv: %s, stats: %v, nfsInfo: %+v", pvName, stats, nfsInfo) - capacityStats, err := getNfsCapacityStat(pvName, nfsInfo, p) - if err != nil { - //klog.Errorf("get capacity of PV %s: %v", pvName, err) - stats = append(stats, UnknownValue, UnknownValue, UnknownValue) - } else { - stats = append(stats, capacityStats...) - } - - wg.Add(1) - go func(pvNameArgs string, pvcNamespaceArgs string, pvcNameArgs string, serverNameArgs string, statsArgs []string) { - defer wg.Done() - p.setNfsMetric(pvNameArgs, pvcNamespaceArgs, pvcNameArgs, serverNameArgs, statsArgs, ch) - }(pvName, nfsInfo.PvcNamespace, nfsInfo.PvcName, nfsInfo.ServerName, stats) + metrics = append(metrics, p.getNfsMetrics(pvName, nfsInfo.PvcNamespace, nfsInfo.PvcName, pvNameServerMap[pvName], stats)...) } - wg.Wait() - //elapsedTime := time.Since(startTime) - //logrus.Info("Nfsstat spent time:", elapsedTime) - return nil + return } -func (p *nfsStatCollector) setNfsMetric(pvName string, pvcNamespace string, pvcName string, serverName string, stats []string, ch chan<- prometheus.Metric) { +func (p *nfsStatCollector) getNfsMetrics(pvName string, pvcNamespace string, pvcName string, serverName string, stats []string) (metrics []*Metric) { defer p.lastPvStatsMap.Store(pvName, stats) for i, value := range stats { if i >= len(p.descs) { return } - if value == UnknownValue { - continue - } valueFloat64, err := strconv.ParseFloat(value, 64) if err != nil { @@ -306,8 +161,9 @@ func (p *nfsStatCollector) setNfsMetric(pvName string, pvcNamespace string, pvcN continue } - ch <- p.descs[i].mustNewConstMetric(valueFloat64, pvcNamespace, pvcName, serverName, nasStorageName) + metrics = append(metrics, MustNewMetricWithTypedFactorDesc(p.descs[i], valueFloat64, pvcNamespace, pvcName, serverName, nasStorageName)) } + return } func (p *nfsStatCollector) updateMap(lastPvNfsInfoMap *map[string]nfsInfo, jsonPaths []string, deriverName string) { @@ -352,7 +208,7 @@ func (p *nfsStatCollector) updateNfsInfoMap(thisPvNfsInfoMap map[string]nfsInfo, lastInfo, ok := (*lastPvNfsInfoMap)[pv] // add and modify if !ok || thisInfo.VolDataPath != lastInfo.VolDataPath { - pvcNamespace, pvcName, serverName, err := getNasPvcByPvName(p.clientSet, p.crdClient, pv) + pvcNamespace, pvcName, err := getNasPvcByPvName(p.client, pv, thisInfo.VolDataPath) if err != nil { continue } @@ -360,7 +216,6 @@ func (p *nfsStatCollector) updateNfsInfoMap(thisPvNfsInfoMap map[string]nfsInfo, VolDataPath: thisInfo.VolDataPath, PvcName: pvcName, PvcNamespace: pvcNamespace, - ServerName: serverName, } (*lastPvNfsInfoMap)[pv] = updateInfo } @@ -374,36 +229,53 @@ func (p *nfsStatCollector) updateNfsInfoMap(thisPvNfsInfoMap map[string]nfsInfo, } } -func getNfsStat() (map[string][]string, error) { +func getNfsStat() (map[string]string, map[string][]string, error) { + pvServerMapping := make(map[string]string, 0) pvNameStatMapping := make(map[string][]string, 0) mountStatsFile, err := os.Open(nfsStatsFileName) if mountStatsFile == nil { - return nil, fmt.Errorf("File %s is not found.", nfsStatsFileName) + return nil, nil, fmt.Errorf("file %s not found", nfsStatsFileName) } if err != nil { - return nil, fmt.Errorf("Open file %s is error, err:%s", nfsStatsFileName, err) + return nil, nil, fmt.Errorf("failed to open file %s: %s", nfsStatsFileName, err) } defer mountStatsFile.Close() mountArr, err := parseMountStats(mountStatsFile) if err != nil { - return nil, fmt.Errorf("ParseMountStats %s is error, err:%s.", nfsStatsFileName, err) + return nil, nil, fmt.Errorf("failed to parse mount stats %s: %s", nfsStatsFileName, err) } for _, mount := range mountArr { + pvName := getPVName(mount) + segments := strings.Split(mount.Device, ":") + if len(segments) >= 2 { + pvServerMapping[pvName] = segments[0] + } else { + pvServerMapping[pvName] = mount.Device + } nfsOperationStats := mount.Stats.operationStats() for _, operation := range nfsOperationStats { - addNfsStat(&pvNameStatMapping, mount.Mount, operation, "READ") + addNfsStat(&pvNameStatMapping, pvName, operation, "READ") } for _, operation := range nfsOperationStats { - addNfsStat(&pvNameStatMapping, mount.Mount, operation, "WRITE") + addNfsStat(&pvNameStatMapping, pvName, operation, "WRITE") } } - return pvNameStatMapping, nil + return pvServerMapping, pvNameStatMapping, nil } -func addNfsStat(pvNameStatMapping *map[string][]string, mountPath string, operationStat NFSOperationStats, keyWord string) { +func getPVName(mount *Mount) string { + if mount == nil { + return "" + } + paths := strings.Split(mount.Mount, "/") + if len(paths) < 2 { + return "" + } + return paths[len(paths)-2] +} + +func addNfsStat(pvNameStatMapping *map[string][]string, pvName string, operationStat NFSOperationStats, keyWord string) { if operationStat.Operation == keyWord { - pathArr := strings.Split(mountPath, "/") - pvName := pathArr[len(pathArr)-2] if len((*pvNameStatMapping)[pvName]) >= NFSMetricsCount { return } @@ -417,47 +289,3 @@ func addNfsStat(pvNameStatMapping *map[string][]string, mountPath string, operat (*pvNameStatMapping)[pvName] = append((*pvNameStatMapping)[pvName], strconv.Itoa(int(operationStat.CumulativeTotalRequestMilliseconds))) } } - -func getNfsCapacityStat(pvName string, info nfsInfo, p *nfsStatCollector) ([]string, error) { - capacityInfo, err := p.monitorClient.GetNasCapacityInfo(pvName) - if err != nil { - return nil, err - } - if capacityInfo != nil && capacityInfo.TotalSize != -1 && capacityInfo.UsedSize != -1 { - total, used := getTotalAndUsedSize(capacityInfo) - p.capacityEventAlert(total, used, pvName, info) - return []string{ - strconv.FormatInt(total, 10), - strconv.FormatInt(used, 10), - strconv.FormatInt(total-used, 10), - }, nil - } - // NOTE: The system is unable to extract the capacity statistics because the capacity of the NFS mount point is based - // on the overall quota of the NAS filesystem, rather than the specific path that is mounted. - return nil, errors.New("capacity metrics from storage-monitor missing or invalid") -} - -func getTotalAndUsedSize(info *nfsCapacityInfo) (int64, int64) { - if info.TotalSizeInBytes != nil && info.UsedSizeInBytes != nil { - return *info.TotalSizeInBytes, *info.UsedSizeInBytes - } - return info.TotalSize * GiBSize, info.UsedSize * GiBSize -} - -func (p *nfsStatCollector) capacityEventAlert(totalSize int64, usedSize int64, pvName string, info nfsInfo) { - total, used, gibSize := float64(totalSize), float64(usedSize), float64(GiBSize) - if p.capacityPercentageThreshold > 0 { - usedPercentage := 100 * used / total - if usedPercentage >= float64(p.capacityPercentageThreshold) { - ref := &v1.ObjectReference{ - Kind: "PersistentVolumeClaim", - Name: info.PvcName, - UID: "", - Namespace: info.PvcNamespace, - } - reason := fmt.Sprintf("PVC %s/%s (PV %s) is running out of capacity, totalSize:%fGi, usedSize:%fGi, usedPercentage:%.2f%%, threshold:%.2f%%", - info.PvcNamespace, info.PvcName, pvName, total/gibSize, used/gibSize, usedPercentage, p.capacityPercentageThreshold) - utils.CreateEvent(p.recorder, ref, v1.EventTypeWarning, capacityNotEnough, reason) - } - } -} diff --git a/pkg/metric/pfs_stat_collector.go b/pkg/metric/pfs_stat_collector.go index d9fa64c69a..6c69715744 100644 --- a/pkg/metric/pfs_stat_collector.go +++ b/pkg/metric/pfs_stat_collector.go @@ -21,6 +21,10 @@ func NewPfsRawBlockStatCollector() (Collector, error) { return &pfsRawBlockStatCollector{}, nil } +func (p *pfsRawBlockStatCollector) Get() []*Metric { + return nil +} + func (p *pfsRawBlockStatCollector) Update(ch chan<- prometheus.Metric) error { return nil } diff --git a/pkg/metric/types.go b/pkg/metric/types.go new file mode 100644 index 0000000000..b1d8cc1830 --- /dev/null +++ b/pkg/metric/types.go @@ -0,0 +1,119 @@ +package metric + +import ( + "fmt" + "sort" + + "github.com/prometheus/client_golang/prometheus" + promdto "github.com/prometheus/client_model/go" + "google.golang.org/protobuf/proto" +) + +type Metric struct { + *MetaDesc + Value float64 + ValueType prometheus.ValueType + VariableLabelPairs []*promdto.LabelPair +} + +type MetaDesc struct { + *prometheus.Desc + FQName string + Help string + ConstLabelPairs []*promdto.LabelPair + VariableLabels []string +} + +func NewMetaDesc(namespace, subsystem, name, help string, variableLabels []string, constLabels prometheus.Labels) (res *MetaDesc) { + fqName := prometheus.BuildFQName(namespace, subsystem, name) + desc := prometheus.NewDesc(fqName, help, variableLabels, constLabels) + res = &MetaDesc{ + Desc: desc, + FQName: fqName, + Help: help, + VariableLabels: variableLabels, + } + res.ConstLabelPairs = make([]*promdto.LabelPair, 0, len(constLabels)) + for k, v := range constLabels { + res.ConstLabelPairs = append(res.ConstLabelPairs, &promdto.LabelPair{ + Name: proto.String(k), + Value: proto.String(v), + }) + } + sort.Sort(LabelPairSorter(res.ConstLabelPairs)) + return +} + +func MustNewMetricWithTypedFactorDesc(desc typedFactorDesc, value float64, labelValues ...string) *Metric { + if desc.factor != 0 { + value *= desc.factor + } + return MustNewMetricWithMetaDesc(desc.MetaDesc, value, desc.valueType, labelValues...) +} + +func MustNewMetricWithMetaDesc(desc *MetaDesc, value float64, valueType prometheus.ValueType, labelValues ...string) *Metric { + return MustNewMetric(desc, value, valueType, labelValues...) +} + +func MustNewMetric(desc *MetaDesc, value float64, valueType prometheus.ValueType, labelValues ...string) *Metric { + metric, err := NewMetric(desc, value, valueType, labelValues...) + if err != nil { + panic(err) + } + return metric +} + +func NewMetricWithTypedFactorDesc(desc typedFactorDesc, value float64, labelValues ...string) (*Metric, error) { + if desc.factor != 0 { + value *= desc.factor + } + return NewMetric(desc.MetaDesc, value, desc.valueType, labelValues...) +} + +func NewMetric(desc *MetaDesc, value float64, valueType prometheus.ValueType, labelValues ...string) (*Metric, error) { + if desc == nil { + return nil, fmt.Errorf("desc cannot be nil") + } + if len(desc.VariableLabels) != len(labelValues) { + return nil, fmt.Errorf("labels and labelValues must have the same length") + } + + pairs, err := makeLabelPairs(desc.VariableLabels, labelValues...) + if err != nil { + return nil, err + } + + return &Metric{ + MetaDesc: desc, + VariableLabelPairs: pairs, + Value: value, + ValueType: valueType, + }, nil +} + +func makeLabelPairs(labels []string, labelValues ...string) (pairs []*promdto.LabelPair, err error) { + if len(labels) != len(labelValues) { + return nil, fmt.Errorf("labels and labelValues must have the same length") + } + for i, label := range labels { + pairs = append(pairs, &promdto.LabelPair{ + Name: &label, + Value: &labelValues[i], + }) + } + return pairs, nil +} + +type LabelPairSorter []*promdto.LabelPair + +func (s LabelPairSorter) Len() int { + return len(s) +} + +func (s LabelPairSorter) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s LabelPairSorter) Less(i, j int) bool { + return s[i].GetName() < s[j].GetName() +} diff --git a/pkg/metric/utils.go b/pkg/metric/utils.go index f3e21d4948..a701c97cc4 100644 --- a/pkg/metric/utils.go +++ b/pkg/metric/utils.go @@ -4,6 +4,7 @@ import ( "bufio" "context" "crypto/sha256" + "errors" "fmt" "os" @@ -11,11 +12,13 @@ import ( "strings" "sync" - "github.com/kubernetes-sigs/alibaba-cloud-csi-driver/pkg/cnfs/v1beta1" + promdto "github.com/prometheus/client_model/go" + + "github.com/kubernetes-sigs/alibaba-cloud-csi-driver/pkg/options" "github.com/kubernetes-sigs/alibaba-cloud-csi-driver/pkg/utils" + "github.com/prometheus/client_golang/prometheus" apicorev1 "k8s.io/api/core/v1" apismetav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" ) @@ -25,6 +28,19 @@ var isVF = false const containerNetworkFileSystem = "containerNetworkFileSystem" +func newK8sClient() (kubernetes.Interface, error) { + config, err := options.GetRestConfig() + if err != nil { + klog.ErrorS(err, "Failed to get rest config") + return nil, nil + } + client, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, err + } + return client, nil +} + func readFirstLines(path string) ([]string, error) { file, err := os.Open(path) if err != nil { @@ -56,8 +72,11 @@ func readAllContent(path string) (string, error) { return result, nil } -func getDiskPvcByPvName(clientSet *kubernetes.Clientset, pvName string) (*apicorev1.ObjectReference, error) { - pv, err := clientSet.CoreV1().PersistentVolumes().Get(context.Background(), pvName, apismetav1.GetOptions{}) +func getDiskPvcByPvName(client kubernetes.Interface, pvName, volDataPath string) (*apicorev1.ObjectReference, error) { + if client == nil { + return getPvcByVolData(volDataPath) + } + pv, err := client.CoreV1().PersistentVolumes().Get(context.Background(), pvName, apismetav1.GetOptions{}) if err != nil { return nil, err } @@ -67,26 +86,36 @@ func getDiskPvcByPvName(clientSet *kubernetes.Clientset, pvName string) (*apicor return nil, errors.New("pvName:" + pv.Name + " status is not bound.") } -func getNasPvcByPvName(clientSet *kubernetes.Clientset, cnfsClient dynamic.Interface, pvName string) (string, string, string, error) { - pv, err := clientSet.CoreV1().PersistentVolumes().Get(context.Background(), pvName, apismetav1.GetOptions{}) +func getPvcByVolData(volDataPath string) (*apicorev1.ObjectReference, error) { + volDataMap, err := utils.ReadJSONFile(volDataPath) + klog.InfoS("Volume data map", "map", volDataMap) if err != nil { - return "", "", "", err + return nil, err } - if pv.Spec.CSI != nil { - if val, ok := pv.Spec.CSI.VolumeAttributes["server"]; ok { - if pv.Status.Phase == apicorev1.VolumeBound { - return pv.Spec.ClaimRef.Namespace, pv.Spec.ClaimRef.Name, val, nil - } - } else if value, ok := pv.Spec.CSI.VolumeAttributes[containerNetworkFileSystem]; ok { - cnfs, err := v1beta1.GetCnfsObject(cnfsClient, value) - if err != nil { - klog.Errorf("Get cnfs %s server is failed, err:%s", value, err) - return "", "", "", err - } - return pv.Spec.ClaimRef.Namespace, pv.Spec.ClaimRef.Name, cnfs.Status.FsAttributes.Server, nil + return &apicorev1.ObjectReference{ + Namespace: volDataMap["csi.alibabacloud.com/pvc-namespace"], + Name: volDataMap["csi.alibabacloud.com/pvc-name"], + }, nil +} + +func getNasPvcByPvName(client kubernetes.Interface, pvName, volDataPath string) (string, string, error) { + if client == nil { + pvc, err := getPvcByVolData(volDataPath) + klog.InfoS("getNasPvcByPvName", "pvc", pvc) + if err != nil { + return "", "", err } + return pvc.Namespace, pvc.Name, nil } - return "", "", "", errors.New("pvName:" + pv.Name + " status is not bound.") + + pv, err := client.CoreV1().PersistentVolumes().Get(context.Background(), pvName, apismetav1.GetOptions{}) + if err != nil { + return "", "", err + } + if pv.Spec.CSI != nil { + return pv.Spec.ClaimRef.Namespace, pv.Spec.ClaimRef.Name, nil + } + return "", "", errors.New("pvName:" + pv.Name + " status is not bound.") } var ErrUnexpectedVolumeType = errors.New("VolumeType is not the expected type") @@ -154,3 +183,64 @@ func getGlobalMountPathByDiskID(diskID string) string { hash := sha256.Sum256([]byte(diskID)) return filepath.Join(utils.KubeletRootDir, fmt.Sprintf("/plugins/kubernetes.io/csi/%s/%x/globalmount", diskDriverName, hash)) } + +func extractMetricsFromMetricVec(fqName, help string, metricVec prometheus.Collector, valueType prometheus.ValueType) (metrics []*Metric) { + ch := make(chan prometheus.Metric) + go func() { + metricVec.Collect(ch) + close(ch) + }() + + for metric := range ch { + desc := metric.Desc() + + gauge := &promdto.Metric{} + if err := metric.Write(gauge); err != nil { + klog.ErrorS(err, "Failed to write metric", "desc", desc) + continue + } + + value, err := getMetricValue(gauge, valueType) + if err != nil { + klog.ErrorS(err, "Failed to get metric value", "desc", desc) + continue + } + + metrics = append(metrics, &Metric{ + MetaDesc: &MetaDesc{ + Desc: desc, + FQName: fqName, + Help: help, + }, + VariableLabelPairs: gauge.Label, + Value: value, + ValueType: valueType, + }) + } + return metrics +} + +func getMetricValue(metric *promdto.Metric, valueType prometheus.ValueType) (float64, error) { + if metric == nil { + return 0, nil + } + switch valueType { + case prometheus.CounterValue: + if metric.Counter == nil || metric.Counter.Value == nil { + return 0, errors.New("nil metric counter") + } + return *metric.Counter.Value, nil + case prometheus.GaugeValue: + if metric.Gauge == nil { + return 0, errors.New("nil metric gauge") + } + return *metric.Gauge.Value, nil + case prometheus.UntypedValue: + if metric.Untyped == nil { + return 0, errors.New("nil metric untyped") + } + return *metric.Untyped.Value, nil + default: + return 0, fmt.Errorf("unsupported value type: %s", valueType.ToDTO().String()) + } +} diff --git a/pkg/metric/volume_stat_collector.go b/pkg/metric/volume_stat_collector.go index b26e948181..8eded1fed8 100644 --- a/pkg/metric/volume_stat_collector.go +++ b/pkg/metric/volume_stat_collector.go @@ -18,18 +18,28 @@ type volumeStatCollector struct { const VolumeAttachTimeStat VolumeStatType = 0 +var ( + attachmentCountName = "attachment_count" + attachmentCountFQName = prometheus.BuildFQName(nodeNamespace, volumeSubsystem, attachmentCountName) + attachmentCountHelp = "Volume attachment count." + + attachmentTimeTotalName = "attachment_time_total" + attachmentTimeTotalFQName = prometheus.BuildFQName(nodeNamespace, volumeSubsystem, attachmentTimeTotalName) + attachmentTimeTotalHelp = "Volume attachment time in total." +) + var VolumeStatCollector = volumeStatCollector{ AttachmentCountMetric: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: nodeNamespace, Subsystem: volumeSubsystem, - Name: "attachment_count", - Help: "Volume attachment count.", + Name: attachmentCountName, + Help: attachmentCountHelp, }, volumeStatLabels), AttachmentTimeTotalMetric: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: nodeNamespace, Subsystem: volumeSubsystem, - Name: "attachment_time_total", - Help: "Volume attachment time in total.", + Name: attachmentTimeTotalName, + Help: attachmentTimeTotalHelp, }, volumeStatLabels), } @@ -41,6 +51,12 @@ func GetVolumeStatCollector() (Collector, error) { return &VolumeStatCollector, nil } +func (c *volumeStatCollector) Get() []*Metric { + countMetrics := extractMetricsFromMetricVec(attachmentCountFQName, attachmentCountHelp, c.AttachmentCountMetric, prometheus.CounterValue) + timeMetrics := extractMetricsFromMetricVec(attachmentTimeTotalFQName, attachmentTimeTotalHelp, c.AttachmentTimeTotalMetric, prometheus.CounterValue) + return append(countMetrics, timeMetrics...) +} + func (c *volumeStatCollector) Update(ch chan<- prometheus.Metric) error { c.AttachmentCountMetric.Collect(ch) c.AttachmentTimeTotalMetric.Collect(ch) diff --git a/pkg/nas/nodeserver.go b/pkg/nas/nodeserver.go index 3525497c38..88fb5c0153 100644 --- a/pkg/nas/nodeserver.go +++ b/pkg/nas/nodeserver.go @@ -446,7 +446,7 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis return nil, status.Error(codes.Internal, err.Error()) } if opt.MountProtocol == "efc" { - metricsPathPrefix := getMetricsPathPrefix() + metricsPathPrefix := getMetricsPathPrefix(ns.config.AgentMode) if strings.Contains(opt.Server, ".nas.aliyuncs.com") { fsID := getNASIDFromMapOrServer(req.VolumeContext, opt.Server) if len(fsID) != 0 { @@ -498,11 +498,15 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis return &csi.NodePublishVolumeResponse{}, nil } -func getMetricsPathPrefix() string { - if features.FunctionalMutableFeatureGate.Enabled(features.AlinasMountProxy) { - return "/run/cnfs/efc/" +func getMetricsPathPrefix(agentMode bool) string { + switch { + case agentMode: + return "/var/run/efc" + case features.FunctionalMutableFeatureGate.Enabled(features.AlinasMountProxy): + return "/run/cnfs/efc" + default: + return "/host/var/run/efc/" } - return "/host/var/run/efc/" } func isValidServer(server string) bool { diff --git a/pkg/oss/nodeserver.go b/pkg/oss/nodeserver.go index b8320bb013..2b2520c2e4 100644 --- a/pkg/oss/nodeserver.go +++ b/pkg/oss/nodeserver.go @@ -50,9 +50,11 @@ type nodeServer struct { skipAttach bool } -const ( - // metricsPathPrefix - metricsPathPrefix = "/host/var/run/ossfs/" +var ( + possibleMetricsPathPrefixes = []string{ + "/host/var/run/ossfs/", + "/var/run/ossfs/", + } ) // for cases where fuseType does not affect like UnPublishVolume, @@ -240,6 +242,7 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis // - If mount point exists and token rotation is needed: update token files and return ErrSkipMount // - If mount point doesn't exist: proceed with normal mount + metricsPathPrefix := getMetricsPathPrefixByRuntimeType(runtimeType) // When work as csi-agent, directly mount on the target path. if runtimeType == RuntimeTypeRunD || runtimeType == RuntimeTypeMicroVM { var metricsPath string @@ -313,6 +316,13 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis return &csi.NodePublishVolumeResponse{}, nil } +func getMetricsPathPrefixByRuntimeType(runtimeType RuntimeType) string { + if runtimeType == RuntimeTypeMicroVM { + return possibleMetricsPathPrefixes[1] + } + return possibleMetricsPathPrefixes[0] +} + func validateNodeUnpublishVolumeRequest(req *csi.NodeUnpublishVolumeRequest) error { valid, err := utils.ValidatePath(req.GetTargetPath()) if !valid { @@ -368,7 +378,9 @@ func (ns *nodeServer) NodeUnstageVolume( } // The metricsPath in fuse Pod will be cleaned and not allowed to update the metrics - utils.RemoveMetrics(metricsPathPrefix, req) + if metricsPathPrefix := getExistingMetricsPathPrefix(); metricsPathPrefix != "" { + utils.RemoveMetrics(metricsPathPrefix, req) + } // In the legacy mount process, NodePublishVolume creates ossfs pods in kube-system namespace to mount ossfpm. // We still need to umount the mountpoint in case csi-plugin is upgraded from these versions. @@ -387,6 +399,15 @@ func (ns *nodeServer) NodeUnstageVolume( return &csi.NodeUnstageVolumeResponse{}, nil } +func getExistingMetricsPathPrefix() string { + for _, path := range possibleMetricsPathPrefixes { + if utils.IsFileExisting(path) { + return path + } + } + return "" +} + type publishRequest interface { GetVolumeCapability() *csi.VolumeCapability GetReadonly() bool