]> Git Repo - linux.git/commitdiff
thermal: core: Allow thermal zones to tell the core to ignore them
authorRafael J. Wysocki <[email protected]>
Wed, 17 Jul 2024 19:45:02 +0000 (21:45 +0200)
committerRafael J. Wysocki <[email protected]>
Thu, 18 Jul 2024 11:35:55 +0000 (13:35 +0200)
The iwlwifi wireless driver registers a thermal zone that is only needed
when the network interface handled by it is up and it wants that thermal
zone to be effectively ignored by the core otherwise.

Before commit a8a261774466 ("thermal: core: Call monitor_thermal_zone()
if zone temperature is invalid") that could be achieved by returning
an error code from the thermal zone's .get_temp() callback because the
core did not really handle errors returned by it almost at all.
However, commit a8a261774466 made the core attempt to recover from the
situation in which the temperature of a thermal zone cannot be
determined due to errors returned by its .get_temp() and is always
invalid from the core's perspective.

That was done because there are thermal zones in which .get_temp()
returns errors to start with due to some difficulties related to the
initialization ordering, but then it will start to produce valid
temperature values at one point.

Unfortunately, the simple approach taken by commit a8a261774466,
which is to poll the thermal zone periodically until its .get_temp()
callback starts to return valid temperature values, is at odds with
the special thermal zone in iwlwifi in which .get_temp() may always
return an error because its network interface may always be down.  If
that happens, every attempt to invoke the thermal zone's .get_temp()
callback resulting in an error causes the thermal core to print a
dev_warn() message to the kernel log which is super-noisy.

To address this problem, make the core handle the case in which
.get_temp() returns 0, but the temperature value returned by it
is not actually valid, in a special way.  Namely, make the core
completely ignore the invalid temperature value coming from
.get_temp() in that case, which requires folding in
update_temperature() into its caller and a few related changes.

On the iwlwifi side, modify iwl_mvm_tzone_get_temp() to return 0
and put THERMAL_TEMP_INVALID into the temperature return memory
location instead of returning an error when the firmware is not
running or it is not of the right type.

Also, to clearly separate the handling of invalid temperature
values from the thermal zone initialization, introduce a special
THERMAL_TEMP_INIT value specifically for the latter purpose.

Fixes: a8a261774466 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid")
Closes: https://lore.kernel.org/linux-pm/[email protected]/
Reported-by: Eric Biggers <[email protected]>
Reported-by: Stefan Lippers-Hollmann <[email protected]>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=201761
Tested-by: Oleksandr Natalenko <[email protected]>
Tested-by: Stefan Lippers-Hollmann <[email protected]>
Cc: 6.10+ <[email protected]> # 6.10+
Signed-off-by: Rafael J. Wysocki <[email protected]>
Link: https://patch.msgid.link/[email protected]
[ rjw: Rebased on top of the current mainline ]
Signed-off-by: Rafael J. Wysocki <[email protected]>
drivers/net/wireless/intel/iwlwifi/mvm/tt.c
drivers/thermal/thermal_core.c
drivers/thermal/thermal_core.h
drivers/thermal/thermal_helpers.c

index ed0796aff7225f113d08c4b5afaf220b55977517..d92470960b38b624cf1fba4fbd2773b3a0f7d630 100644 (file)
@@ -621,8 +621,14 @@ static int iwl_mvm_tzone_get_temp(struct thermal_zone_device *device,
        guard(mvm)(mvm);
 
        if (!iwl_mvm_firmware_running(mvm) ||
-           mvm->fwrt.cur_fw_img != IWL_UCODE_REGULAR)
-               return -ENODATA;
+           mvm->fwrt.cur_fw_img != IWL_UCODE_REGULAR) {
+               /*
+                * Tell the core that there is no valid temperature value to
+                * return, but it need not worry about this.
+                */
+               *temperature = THERMAL_TEMP_INVALID;
+               return 0;
+       }
 
        ret = iwl_mvm_get_temp(mvm, &temp);
        if (ret)
index 8795187fbc526eb18512c7ec78f869fea2a81143..f6e700e48aadb8c0f0ab13e449a30823ce815a13 100644 (file)
@@ -300,8 +300,6 @@ static void monitor_thermal_zone(struct thermal_zone_device *tz)
                thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies);
        else if (tz->polling_delay_jiffies)
                thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies);
-       else if (tz->temperature == THERMAL_TEMP_INVALID)
-               thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
 }
 
 static struct thermal_governor *thermal_get_tz_governor(struct thermal_zone_device *tz)
@@ -382,7 +380,7 @@ static void handle_thermal_trip(struct thermal_zone_device *tz,
        td->threshold = trip->temperature;
 
        if (tz->last_temperature >= old_threshold &&
-           tz->last_temperature != THERMAL_TEMP_INVALID) {
+           tz->last_temperature != THERMAL_TEMP_INIT) {
                /*
                 * Mitigation is under way, so it needs to stop if the zone
                 * temperature falls below the low temperature of the trip.
@@ -417,27 +415,6 @@ static void handle_thermal_trip(struct thermal_zone_device *tz,
        }
 }
 
-static void update_temperature(struct thermal_zone_device *tz)
-{
-       int temp, ret;
-
-       ret = __thermal_zone_get_temp(tz, &temp);
-       if (ret) {
-               if (ret != -EAGAIN)
-                       dev_warn(&tz->device,
-                                "failed to read out thermal zone (%d)\n",
-                                ret);
-               return;
-       }
-
-       tz->last_temperature = tz->temperature;
-       tz->temperature = temp;
-
-       trace_thermal_temperature(tz);
-
-       thermal_genl_sampling_temp(tz->id, temp);
-}
-
 static void thermal_zone_device_check(struct work_struct *work)
 {
        struct thermal_zone_device *tz = container_of(work, struct
@@ -452,7 +429,7 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz)
 
        INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check);
 
-       tz->temperature = THERMAL_TEMP_INVALID;
+       tz->temperature = THERMAL_TEMP_INIT;
        tz->passive = 0;
        tz->prev_low_trip = -INT_MAX;
        tz->prev_high_trip = INT_MAX;
@@ -504,6 +481,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
        struct thermal_trip_desc *td;
        LIST_HEAD(way_down_list);
        LIST_HEAD(way_up_list);
+       int temp, ret;
 
        if (tz->suspended)
                return;
@@ -511,10 +489,29 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
        if (!thermal_zone_device_is_enabled(tz))
                return;
 
-       update_temperature(tz);
+       ret = __thermal_zone_get_temp(tz, &temp);
+       if (ret) {
+               if (ret != -EAGAIN)
+                       dev_info(&tz->device, "Temperature check failed (%d)\n", ret);
 
-       if (tz->temperature == THERMAL_TEMP_INVALID)
+               thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
+               return;
+       } else if (temp <= THERMAL_TEMP_INVALID) {
+               /*
+                * Special case: No valid temperature value is available, but
+                * the zone owner does not want the core to do anything about
+                * it.  Continue regular zone polling if needed, so that this
+                * function can be called again, but skip everything else.
+                */
                goto monitor;
+       }
+
+       tz->last_temperature = tz->temperature;
+       tz->temperature = temp;
+
+       trace_thermal_temperature(tz);
+
+       thermal_genl_sampling_temp(tz->id, temp);
 
        tz->notify_event = event;
 
index 30c0e78859a79d7799a35bd8c730616cd4d7afae..ba8e6fc807cadbc01f557bcdab20dfb0263978a1 100644 (file)
@@ -133,6 +133,9 @@ struct thermal_zone_device {
        struct thermal_trip_desc trips[] __counted_by(num_trips);
 };
 
+/* Initial thermal zone temperature. */
+#define THERMAL_TEMP_INIT      INT_MIN
+
 /*
  * Default delay after a failing thermal zone temperature check before
  * attempting to check it again.
index 81e019493557f00493720e1cee8a25828cac65c2..aedb8369e2aa2dc84455841609388c9738fa4a5f 100644 (file)
@@ -163,6 +163,8 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
        }
 
        ret = __thermal_zone_get_temp(tz, temp);
+       if (!ret && *temp <= THERMAL_TEMP_INVALID)
+               ret = -ENODATA;
 
 unlock:
        mutex_unlock(&tz->lock);
This page took 0.07296 seconds and 4 git commands to generate.