[smartmontools-support] [PATCH] RFC: libata: Add hwmon support for SMART temperature sensors
Guenter Roeck
linux at roeck-us.net
Fri Aug 10 06:15:42 CEST 2018
On 08/09/2018 03:24 PM, Linus Walleij wrote:
> S.M.A.R.T. temperature sensors have been supported for
> years by userspace tools such as smarttools.
>
> The temperature readout is however also a good fit for
> Linux' hwmon subsystem. By adding a hwmon interface to dig
> out SMART parameter 194, we can expose the drive temperature
> as a standard hwmon sensor.
>
> The idea came about when experimenting with NAS enclosures
> that lack their own on-board sensors but instead piggy-back
> the sensor found in the harddrive, if any, to decide on a
> policy for driving the on-board fan.
>
> The kernel thermal subsystem supports defining a thermal
> policy for the enclosure using the device tree, see e.g.:
> arch/arm/boot/dts/gemini-dlink-dns-313.dts
> but this requires a proper hwmon sensor integrated with
> the kernel.
>
> This is a first attempt at providing a kernel-internal
> hwmon sensor for ATA drives. It is possible to do the
> same for SCSI, NVME etc, but their protocols and
> peculiarities seem to require a per-subsystem implementation.
> They would all end up in the same namespace using the
> SCSI name such as "sd_0:0:0:0".
>
If the implementation for other drive types needs to be different,
how about "ata" as prefix ?
> With this driver, the hard disk temperatur can be read from
temperature
> sysfs:
>
> > cd /sys/class/hwmon/hwmon0/
> > cat temp1_input
> 38
>
Did you try the "sensors" command ?
> This likely means that they can also be handled by
> userspace tools such as lm_sensors in a uniform way
> without need for any special tools such as "hddtemp"
> (which seems dormant) though I haven't tested it.
>
> This driver does not block any simultaneous use of
> other SMART userspace tools, it's a both/and approach,
> not either/or.
>
> Signed-off-by: Linus Walleij <linus.walleij at linaro.org>
> ---
> This is just me having some idea, so I wanted to toss it
> out there in case people think it is useful. If you want
> to kill the idea right now before I get any further with
> it, this is the time to pitch in. You can also say if
> you like it.
>
For my part I like it. Couple of comments below.
> I included the smartmontools mailing list on the review,
> it seemed relevant.
> ---
> drivers/ata/Kconfig | 13 ++
> drivers/ata/Makefile | 1 +
> drivers/ata/libata-hwmon.c | 420 +++++++++++++++++++++++++++++++++++++
> drivers/ata/libata-hwmon.h | 15 ++
> drivers/ata/libata-scsi.c | 2 +
> 5 files changed, 451 insertions(+)
> create mode 100644 drivers/ata/libata-hwmon.c
> create mode 100644 drivers/ata/libata-hwmon.h
>
> diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
> index 2b16e7c8fff3..8349101c7e53 100644
> --- a/drivers/ata/Kconfig
> +++ b/drivers/ata/Kconfig
> @@ -59,6 +59,19 @@ config ATA_ACPI
> You can disable this at kernel boot time by using the
> option libata.noacpi=1
>
> +config ATA_HWMON
> + bool "ATA S.M.A.R.T. HWMON support"
> + depends on HWMON
> + help
> + This options compiles in code to support temperature reading
> + from an ATA device using the S.M.A.R.T. (Self-Monitoring,
> + Analysis and Reporting Technology) support for temperature
> + sensors found in some hard drives. The drive will be probed
> + to figure out if it has a temperature sensor, and if it does
> + the kernel hardware monitor framework will be utilized to
> + interact with the sensor. This work orthogonal to any userspace
> + S.M.A.R.T. access tools.
> +
> config SATA_ZPODD
> bool "SATA Zero Power Optical Disc Drive (ZPODD) support"
> depends on ATA_ACPI && PM
> diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile
> index d21cdd83f7ab..7a22b27c66c0 100644
> --- a/drivers/ata/Makefile
> +++ b/drivers/ata/Makefile
> @@ -126,3 +126,4 @@ libata-$(CONFIG_ATA_SFF) += libata-sff.o
> libata-$(CONFIG_SATA_PMP) += libata-pmp.o
> libata-$(CONFIG_ATA_ACPI) += libata-acpi.o
> libata-$(CONFIG_SATA_ZPODD) += libata-zpodd.o
> +libata-$(CONFIG_ATA_HWMON) += libata-hwmon.o
> diff --git a/drivers/ata/libata-hwmon.c b/drivers/ata/libata-hwmon.c
> new file mode 100644
> index 000000000000..fa1e4e472625
> --- /dev/null
> +++ b/drivers/ata/libata-hwmon.c
> @@ -0,0 +1,420 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Hwmon client for ATA S.M.A.R.T. hard disk drivers
> + * (C) 2018 Linus Walleij
> + *
> + * This code is based on know-how and examples from the
> + * smartmontools by Bruce Allen, Christian Franke et al.
> + * (C) 2002-2018
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/of.h>
> +#include <linux/slab.h>
> +#include <linux/hwmon.h>
> +#include <linux/ata.h>
> +#include <scsi/scsi_device.h>
> +#include <scsi/scsi_cmnd.h>
> +
Alphabetic oder
> +#include "libata-hwmon.h"
> +
> +#define ATA_MAX_SMART_ATTRS 30
> +#define SMART_TEMP_PROP_194 194
> +
> +enum ata_temp_format {
> + ATA_TEMP_FMT_TT_XX_00_00_00_00,
> + ATA_TEMP_FMT_TT_XX_LL_HH_00_00,
> + ATA_TEMP_FMT_TT_LL_HH_00_00_00,
> + ATA_TEMP_FMT_TT_XX_LL_XX_HH_XX,
> + ATA_TEMP_FMT_TT_XX_HH_XX_LL_XX,
> + ATA_TEMP_FMT_TT_XX_LL_HH_CC_CC,
> + ATA_TEMP_FMT_UNKNOWN,
> +};
> +
> +/**
> + * struct ata_hwmon - device instance state
> + * @hwmon_dev: associated hwmon device
> + */
> +struct ata_hwmon {
> + struct device *dev;
> + struct device *hwmon_dev;
> + struct scsi_device *sdev;
> + enum ata_temp_format tfmt;
> +};
> +
> +static umode_t ata_hwmon_is_visible(const void *data,
> + enum hwmon_sensor_types type,
> + u32 attr, int channel)
> +{
> + switch (type) {
> + case hwmon_temp:
> + switch (attr) {
> + case hwmon_temp_input:
> + case hwmon_temp_min:
> + case hwmon_temp_max:
> + return S_IRUGO;
Discouraged nowadays. You'll get a checkpatch warning for that.
> + }
> + break;
> + default:
> + break;
> + }
> + return 0;
> +}
> +
> +static int check_temp_word(u16 word)
> +{
> + if (word <= 0x7f)
> + return 0x11; /* >= 0, signed byte or word */
> + if (word <= 0xff)
> + return 0x01; /* < 0, signed byte */
> + if (word > 0xff80)
> + return 0x10; /* < 0, signed word */
> + return 0x00;
> +}
> +
> +static bool ata_check_temp_range(int t, u8 t1, u8 t2)
> +{
> + int lo = (s8)t1;
> + int hi = (s8)t2;
> +
> + /* This is obviously wrong */
> + if (lo > hi)
> + return false;
> +
> + /*
> + * If -60 <= lo <= t <= hi <= 120 and
> + * and NOT lo == -1 and hi <= 0, then we have valid lo and hi
> + */
> + if (-60 <= lo && lo <= t && t <= hi && hi <= 120
> + && !(lo == -1 && hi <= 0)) {
I really dislike Yoda programming, and double negations.
> + return true;
> + }
> + return false;
This can just return the negated expression, and I am quite sure it can be simplified.
return !(the entire expression);
> +}
> +
> +static int ata_hwmon_detect_tempformat(struct ata_hwmon *ata, u8 *raw)
> +{
> + s8 t;
> + u16 w0, w1, w2;
> + int ctw0;
> +
> + /*
> + * Interpret the RAW temperature data:
> + * raw[0] is the temperature given as signed u8 on all known drives
> + *
> + * Search for possible min/max values
> + * This algorithm is a modified version from the smartmontools.
> + *
> + * [0][1][2][3][4][5] raw[]
> + * [ 0 ] [ 1 ] [ 2 ] word[]
> + * TT xx LL xx HH xx Hitachi/HGST
> + * TT xx HH xx LL xx Kingston SSDs
> + * TT xx LL HH 00 00 Maxtor, Samsung, Seagate, Toshiba
> + * TT LL HH 00 00 00 WDC
> + * TT xx LL HH CC CC WDC, CCCC=over temperature count
> + * (xx = 00/ff, possibly sign extension of lower byte)
> + *
> + * TODO: detect the 10x temperatures found on some Samsung
> + * drives. struct scsi_device contains manufacturer and model
> + * information.
> + */
> + w0 = raw[0] | raw[1] << 16;
> + w1 = raw[2] | raw[3] << 16;
> + w2 = raw[4] | raw[5] << 16;
> + t = (s8)raw[0];
> +
> + /* If this is != 0, then w0 may contain something useful */
> + ctw0 = check_temp_word(w0);
> +
> + /* This checks variants with zero in [4] [5] */
> + if (!w2) {
> + /* TT xx 00 00 00 00 */
> + if (!w1 && ctw0)
> + ata->tfmt = ATA_TEMP_FMT_TT_XX_00_00_00_00;
> + /* TT xx LL HH 00 00 */
> + else if (ctw0 &&
> + ata_check_temp_range(t, raw[2], raw[3]))
> + ata->tfmt = ATA_TEMP_FMT_TT_XX_LL_HH_00_00;
> + /* TT LL HH 00 00 00 */
> + else if (!raw[3] &&
> + ata_check_temp_range(t, raw[1], raw[2]))
> + ata->tfmt = ATA_TEMP_FMT_TT_LL_HH_00_00_00;
> + else
> + return -EINVAL;
> + } else if (ctw0) {
> + /*
> + * TT xx LL xx HH xx
> + * What the expression below does is to check that each word
> + * formed by [0][1], [2][3], and [4][5] is something little-
> + * endian s8 or s16 that could be meaningful.
> + */
> + if ((ctw0 & check_temp_word(w1) & check_temp_word(w2)) != 0x00)
> + if (ata_check_temp_range(t, raw[2], raw[4]))
> + ata->tfmt = ATA_TEMP_FMT_TT_XX_LL_XX_HH_XX;
> + else if (ata_check_temp_range(t, raw[4], raw[2]))
> + ata->tfmt = ATA_TEMP_FMT_TT_XX_HH_XX_LL_XX;
> + else
> + return -EINVAL;
> + /*
> + * TT xx LL HH CC CC
> + * Make sure the CC CC word is at least not negative, and that
> + * the max temperature is something >= 40, then it is probably
> + * the right format.
> + */
> + else if (w2 < 0x7fff) {
> + if (ata_check_temp_range(t, raw[2], raw[3]) &&
> + raw[3] >= 40)
> + ata->tfmt = ATA_TEMP_FMT_TT_XX_LL_HH_CC_CC;
> + else
> + return -EINVAL;
> + } else {
> + return -EINVAL;
> + }
> + } else {
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static void ata_hwmon_convert_temperatures(struct ata_hwmon *ata, u8 *raw,
> + int *t, int *lo, int *hi)
> +{
> + *t = (s8)raw[0];
> +
> + switch (ata->tfmt) {
> + case ATA_TEMP_FMT_TT_XX_00_00_00_00:
> + *lo = 0;
> + *hi = 0;
> + break;
> + case ATA_TEMP_FMT_TT_XX_LL_HH_00_00:
> + *lo = (s8)raw[2];
> + *hi = (s8)raw[3];
> + break;
> + case ATA_TEMP_FMT_TT_LL_HH_00_00_00:
> + *lo = (s8)raw[1];
> + *hi = (s8)raw[2];
> + break;
> + case ATA_TEMP_FMT_TT_XX_LL_XX_HH_XX:
> + *lo = (s8)raw[2];
> + *hi = (s8)raw[4];
> + break;
> + case ATA_TEMP_FMT_TT_XX_HH_XX_LL_XX:
> + *lo = (s8)raw[4];
> + *hi = (s8)raw[2];
> + break;
> + case ATA_TEMP_FMT_TT_XX_LL_HH_CC_CC:
> + *lo = (s8)raw[2];
> + *hi = (s8)raw[3];
> + break;
> + case ATA_TEMP_FMT_UNKNOWN:
> + *lo = 0;
> + *hi = 0;
> + break;
> + }
> +}
> +
> +static int ata_hwmon_read_temp(struct ata_hwmon *ata, int *temp,
> + int *min, int *max)
> +{
> + u8 scsi_cmd[MAX_COMMAND_SIZE];
> + int cmd_result;
> + u8 *argbuf = NULL;
> + struct scsi_sense_hdr sshdr;
> + u8 raw[6];
> + int ret;
> + u8 csum;
> + int i;
> +
> + /* Send ATA command to read SMART values */
> + memset(scsi_cmd, 0, sizeof(scsi_cmd));
> + scsi_cmd[0] = ATA_16;
> + scsi_cmd[1] = (4 << 1); /* PIO Data-in */
> + /*
> + * No off.line or cc, read from dev, block count in sector count
> + * field.
> + */
> + scsi_cmd[2] = 0x0e;
> + scsi_cmd[4] = ATA_SMART_READ_VALUES;
> + scsi_cmd[6] = 1; /* Read 1 sector */
> + scsi_cmd[8] = 0; /* args[1]; */
> + scsi_cmd[10] = ATA_SMART_LBAM_PASS;
> + scsi_cmd[12] = ATA_SMART_LBAH_PASS;
> + scsi_cmd[14] = ATA_CMD_SMART;
> +
> + argbuf = kmalloc(ATA_SECT_SIZE, GFP_KERNEL);
Hmm ... how about a fixed allocation in struct ata_hwmon ? Those repeated allocations
just cause stress on the memory allocator.
> + cmd_result = scsi_execute(ata->sdev, scsi_cmd, DMA_FROM_DEVICE,
> + argbuf, ATA_SECT_SIZE,
> + NULL, &sshdr, (10*HZ), 5, 0, 0, NULL);
(10*HZ) -> 10 * HZ
Does it have to be that long ?
> + if (cmd_result) {
> + dev_err(ata->dev, "error %d reading SMART values from device\n",
> + cmd_result);
Are those error messages valuable ? To me they just clog the kernel log.
> + ret = -EIO;
> + goto freebuf;
> + }
> +
> + /* Checksum the read value table */
> + csum = 0;
> + for (i = 0; i < ATA_SECT_SIZE; i++)
> + csum += argbuf[i];
> + if (csum) {
> + dev_err(ata->dev, "checksum error reading SMART values\n");
> + ret = -EIO;
> + goto freebuf;
> + }
> +
> + /* Loop over SMART attributes */
> + for (i = 0; i < ATA_MAX_SMART_ATTRS; i++) {
> + u8 id;
> + u16 flags;
> + u8 curr;
> + u8 worst;
> + int j;
> +
> + id = argbuf[2 + i * 12];
> + if (!id)
> + continue;
> +
> + flags = argbuf[3 + i * 12] | (argbuf[4 + i * 12] << 16);
> + /* Highest temperature since boot */
> + curr = argbuf[5 + i * 12];
> + /* Highest temperature ever */
> + worst = argbuf[6 + i * 12];
One of those could be reported as temp1_highest (I would suggest the temperature
since boot).
> + for (j = 0; j < 6; j++)
> + raw[j] = argbuf[7 + i * 12 + j];
> + dev_dbg(ata->dev, "ID: %d, FLAGS: %04x, current %d, worst %d, "
> + "RAW %02x %02x %02x %02x %02x %02x\n",
> + id, flags, curr, worst,
> + raw[0], raw[1], raw[2], raw[3], raw[4], raw[5]);
> +
> + if (id == SMART_TEMP_PROP_194)
> + break;
> + }
> + if (i == ATA_MAX_SMART_ATTRS) {
> + ret = -ENOTSUPP;
> + goto freebuf;
> + }
> +
I would suggest to break out the above code ...
> + if (ata->tfmt == ATA_TEMP_FMT_UNKNOWN) {
> + ret = ata_hwmon_detect_tempformat(ata, raw);
> + if (ret) {
> + dev_err(ata->dev,
> + "unable to determine temperature format\n");
This is part of the probe function. I would suggest to just return
the error to avoid noise during boot for systems not supporting it.
Also, returning -EINVAL from the detect function only to replace it with
-ENOTSUPP doesn't really make much sense. Might as well just have the
function return -ENOTSUPP directly (and avoid the static checker
complaint that error messages are replaced).
> + ret = -ENOTSUPP;
> + goto freebuf;
> + }
> + }
> +
... and declare a separate function for the detection, to take this code path
entirely out of this function and call it explicitly from the probe function.
> + ata_hwmon_convert_temperatures(ata, raw, temp, min, max);
> + dev_dbg(ata->dev, "temp = %d, min = %d, max = %d\n",
> + *temp, *min, *max);
> +
> + ret = 0;
> +
> +freebuf:
> + kfree(argbuf);
> + return ret;
> +}
> +
> +static int ata_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
> + u32 attr, int channel, long *val)
> +{
> + struct ata_hwmon *ata = dev_get_drvdata(dev);
> + int temp, min, max;
> + int ret;
> +
> + if (type != hwmon_temp)
> + return -EINVAL;
> +
> + ret = ata_hwmon_read_temp(ata, &temp, &min, &max);
> + if (ret)
> + return ret;
> +
> + switch (attr) {
> + case hwmon_temp_input:
> + *val = temp;
> + break;
> + case hwmon_temp_min:
> + *val = min;
> + break;
> + case hwmon_temp_max:
> + *val = max;
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static const struct hwmon_ops ata_hwmon_ops = {
> + .is_visible = ata_hwmon_is_visible,
> + .read = ata_hwmon_read,
> +};
> +
> +static const u32 ata_hwmon_temp_config[] = {
> + HWMON_T_INPUT | HWMON_T_MIN | HWMON_T_MAX,
> + 0,
> +};
> +
> +static const struct hwmon_channel_info ata_hwmon_temp = {
> + .type = hwmon_temp,
> + .config = ata_hwmon_temp_config,
> +};
> +
> +static const struct hwmon_channel_info *ata_hwmon_info[] = {
> + &ata_hwmon_temp,
> + NULL,
> +};
> +
> +static const struct hwmon_chip_info ata_hwmon_devinfo = {
> + .ops = &ata_hwmon_ops,
> + .info = ata_hwmon_info,
> +};
> +
> +int ata_hwmon_probe(struct scsi_device *sdev)
> +{
> + struct device *dev = &sdev->sdev_gendev;
> + struct ata_hwmon *ata;
> + char *sname;
> + int t;
> + int dummy;
> + int ret;
> +
> + ata = devm_kzalloc(dev, sizeof(*ata), GFP_KERNEL);
> + if (!ata)
> + return -ENOMEM;
> + ata->dev = dev;
> + ata->sdev = sdev;
> +
> + /*
> + * If temperature reading is not supported in the SMART
> + * properties, we just bail out.
> + */
> + ata->tfmt = ATA_TEMP_FMT_UNKNOWN;
> + ret = ata_hwmon_read_temp(ata, &t, &dummy, &dummy);
> + if (ret == -ENOTSUPP)
> + return 0;
> + /* Any other error, return upward */
> + if (ret)
> + return ret;
> + dev_info(dev, "initial temperature %d degrees celsius\n", t);
> +
noise
> + /* Names the hwmon device something like "sd_0:0:0:0" */
> + sname = devm_kasprintf(dev, GFP_KERNEL, "sd_%s", dev_name(dev));
> + if (!sname)
> + return -ENOMEM;
> + ata->hwmon_dev =
> + devm_hwmon_device_register_with_info(dev, sname, ata,
> + &ata_hwmon_devinfo,
> + NULL);
Is hwmon_dev needed in the data structure ?
> + if (IS_ERR(ata->hwmon_dev))
> + return PTR_ERR(ata->hwmon_dev);
> +
> + dev_info(dev, "added hwmon sensor %s\n", sname);
Noise. I would suggest
return IS_ERR_OR_ZERO(hwmon_dev);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(ata_hwmon_probe);
Unless I am missing something, that export should not be needed.
> diff --git a/drivers/ata/libata-hwmon.h b/drivers/ata/libata-hwmon.h
> new file mode 100644
> index 000000000000..df56ba456345
> --- /dev/null
> +++ b/drivers/ata/libata-hwmon.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <scsi/scsi_device.h>
> +
> +#ifdef CONFIG_ATA_HWMON
> +
> +int ata_hwmon_probe(struct scsi_device *sdev);
> +
> +#else
> +
> +static inline int ata_hwmon_probe(struct scsi_device *sdev)
> +{
> + return 0;
> +}
> +
> +#endif
> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> index 55b890d19780..a83075e4d3b3 100644
> --- a/drivers/ata/libata-scsi.c
> +++ b/drivers/ata/libata-scsi.c
> @@ -54,6 +54,7 @@
>
> #include "libata.h"
> #include "libata-transport.h"
> +#include "libata-hwmon.h"
>
> #define ATA_SCSI_RBUF_SIZE 4096
>
> @@ -4594,6 +4595,7 @@ void ata_scsi_scan_host(struct ata_port *ap, int sync)
> if (!IS_ERR(sdev)) {
> dev->sdev = sdev;
> scsi_device_put(sdev);
> + ata_hwmon_probe(sdev);
> } else {
> dev->sdev = NULL;
> }
>
More information about the Smartmontools-support
mailing list