From d33b5a391f4a18b3d9c0a194e57cdd3491130099 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" <rjohnson@coreweave.com> Date: Tue, 5 Mar 2024 23:03:28 -0800 Subject: [PATCH] fix: correct smartctl_device_bytes_written & smartctl_device_bytes_read for NVMe The NVMe specification says that the controller is responsible for reporting "Data Units Read" & "Data Units Written" converted as needed for logical block sizes other than 512-bytes. smartmontools already has the correct behavior. What is correct in this case? For now, track what smartmontools does: take the counter, multiply by 512*1000, report the value. We should be clear that it means the drive has read/written at most that many bytes. This has a few impacts: - NVMe devices will now show these metrics, if they did not before. - NVMe devices with blocksize other than 512-bytes may have previously reported inflated metrics, but are now corrected (is this worthy of larger notice in changelogs?) Reference: https://github.com/smartmontools/smartmontools/blob/11415ee0b9d5f4a22ddfb3722fdfb05e72372a03/smartmontools/nvmeprint.cpp#L394-L397 Closes: https://github.com/prometheus-community/smartctl_exporter/issues/122 Signed-off-by: Robin H. Johnson <rjohnson@coreweave.com> --- smartctl.go | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/smartctl.go b/smartctl.go index d308d63..05d867e 100644 --- a/smartctl.go +++ b/smartctl.go @@ -370,34 +370,57 @@ func (smart *SMARTctl) mineNvmeNumErrLogEntries() { ) } +// https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf +// 4.1.4.2 SMART / Health Information (02h) +// The SMART / Health Information log page is as defined in the NVM Express Base Specification. For the +// Data Units Read and Data Units Written fields, when the logical block size is a value other than 512 bytes, +// the controller shall convert the amount of data read to 512 byte units. 
+ +// https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-2.0d-2024.01.11-Ratified.pdf +// Figure 208: SMART / Health Information Log Page +// Bytes 47:32 +// Data Units Read: Contains the number of 512 byte data units the host has read from the +// controller as part of processing a SMART Data Units Read Command; this value does not +// include metadata. This value is reported in thousands (i.e., a value of 1 corresponds to 1,000 +// units of 512 bytes read) and is rounded up (e.g., one indicates that the number of 512 byte +// data units read is from 1 to 1,000, three indicates that the number of 512 byte data units read +// is from 2,001 to 3,000). +// +// A value of 0h in this field indicates that the number of SMART Data Units Read is not reported. +// +// Bytes 63:48 +// +// Data Units Written: Contains the number of 512 byte data units the host has written to the ... +// (the same as Data Units Read) + func (smart *SMARTctl) mineNvmeBytesRead() { - blockSize := smart.json.Get("logical_block_size") data_units_read := smart.json.Get("nvme_smart_health_information_log.data_units_read") - if !blockSize.Exists() || !data_units_read.Exists() { + // 0 => not reported by underlying hardware + if !data_units_read.Exists() || data_units_read.Int() == 0 { return } smart.ch <- prometheus.MustNewConstMetric( metricDeviceBytesRead, prometheus.CounterValue, - // This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up. - // When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units. 
- data_units_read.Float()*1000.0*blockSize.Float(), + // WARNING: Float64 will lose precision when drives reach ~32EiB read/write + // The underlying data_units_written,data_units_read are 128-bit integers + data_units_read.Float()*1000.0*512.0, smart.device.device, ) } func (smart *SMARTctl) mineNvmeBytesWritten() { - blockSize := smart.json.Get("logical_block_size") data_units_written := smart.json.Get("nvme_smart_health_information_log.data_units_written") - if !blockSize.Exists() || !data_units_written.Exists() { + // 0 => not reported by underlying hardware + if !data_units_written.Exists() || data_units_written.Int() == 0 { return } smart.ch <- prometheus.MustNewConstMetric( metricDeviceBytesWritten, prometheus.CounterValue, - // This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up. - // When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units. - data_units_written.Float()*1000.0*blockSize.Float(), + // WARNING: Float64 will lose precision when drives reach ~32EiB read/write + // The underlying data_units_written,data_units_read are 128-bit integers + data_units_written.Float()*1000.0*512.0, smart.device.device, ) }