Skip to content

Commit

Permalink
Merge tag 'thermal-6.11-rc1-3' of git://git.kernel.org/pub/scm/linux/…
Browse files Browse the repository at this point in the history
…kernel/git/rafael/linux-pm

Pull thermal control fix from Rafael Wysocki:
 "Prevent the thermal core from flooding the kernel log with useless
  messages if thermal zone temperature can never be determined (or its
  sensor has failed permanently) and make it finally give up and disable
  defective thermal zones (Rafael Wysocki)"

* tag 'thermal-6.11-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
  thermal: core: Back off when polling thermal zones on errors
  thermal: trip: Split thermal_zone_device_set_mode()
  • Loading branch information
torvalds committed Jul 27, 2024
2 parents 7b0acd9 + f7c1b0e commit 1fcaa5d
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 14 deletions.
89 changes: 78 additions & 11 deletions drivers/thermal/thermal_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,44 @@ static int __init thermal_register_governors(void)
return ret;
}

/*
 * Switch @tz over to @mode.
 *
 * When the zone supplies a ->change_mode() callback it is called first; a
 * nonzero result from it is handed back to the caller and tz->mode is left
 * as it was.  Otherwise tz->mode is updated and 0 is returned.
 *
 * This helper does no locking and sends no notifications by itself.
 */
static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz,
					  enum thermal_device_mode mode)
{
	int err = 0;

	if (tz->ops.change_mode)
		err = tz->ops.change_mode(tz, mode);

	/* Record the new mode only if the callback (if any) accepted it. */
	if (!err)
		tz->mode = mode;

	return err;
}

/*
 * Give up on a thermal zone whose temperature cannot be obtained.
 *
 * Logs an error, switches @tz to THERMAL_DEVICE_DISABLED, sends the
 * "zone disabled" notification and, if the zone has a critical trip with a
 * valid temperature, logs one additional critical-level message.
 */
static void thermal_zone_broken_disable(struct thermal_zone_device *tz)
{
	struct thermal_trip_desc *desc;

	dev_err(&tz->device, "Unable to get temperature, disabling!\n");

	/*
	 * Callers only get here for zones that are currently enabled, so the
	 * present mode is not compared against the new one.  The result of
	 * the mode change is not checked either.
	 */
	__thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED);
	thermal_notify_tz_disable(tz);

	/* Complain (once) if a usable critical trip is now unmonitored. */
	for_each_trip_desc(tz, desc) {
		if (desc->trip.type != THERMAL_TRIP_CRITICAL)
			continue;

		if (desc->trip.temperature <= THERMAL_TEMP_INVALID)
			continue;

		dev_crit(&tz->device,
			 "Disabled thermal zone with critical trip point\n");
		return;
	}
}

/*
* Zone update section: main control loop applied to each zone while monitoring
* in polling mode. The monitoring is done using a workqueue.
Expand All @@ -292,6 +330,34 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
cancel_delayed_work(&tz->poll_queue);
}

/*
 * Arrange for another temperature check of @tz after a failed one.
 *
 * @error is the code returned by the failed temperature read.  -EAGAIN is
 * retried after THERMAL_RECHECK_DELAY with no message and without touching
 * the back-off state.  Any other error is retried after the zone's current
 * recheck delay, which is then increased; once it grows beyond
 * THERMAL_MAX_RECHECK_DELAY the zone is disabled.
 */
static void thermal_zone_recheck(struct thermal_zone_device *tz, int error)
{
	if (error == -EAGAIN) {
		thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY);
		return;
	}

	/*
	 * Log only while the delay is still at its initial value, to keep the
	 * noise down.  If repeated attempts keep failing, a further message
	 * is printed when the zone gets disabled.
	 */
	if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY)
		dev_info(&tz->device, "Temperature check failed (%d)\n", error);

	thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies);

	/* Back off: add half of the current delay, but at least one jiffy. */
	tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL);
	if (tz->recheck_delay_jiffies <= THERMAL_MAX_RECHECK_DELAY)
		return;

	thermal_zone_broken_disable(tz);

	/*
	 * Go back to the initial recheck delay so that the zone gets a fresh
	 * chance to recover if user space enables it again.
	 */
	tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
}

static void monitor_thermal_zone(struct thermal_zone_device *tz)
{
if (tz->mode != THERMAL_DEVICE_ENABLED)
Expand Down Expand Up @@ -491,10 +557,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,

ret = __thermal_zone_get_temp(tz, &temp);
if (ret) {
if (ret != -EAGAIN)
dev_info(&tz->device, "Temperature check failed (%d)\n", ret);

thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
thermal_zone_recheck(tz, ret);
return;
} else if (temp <= THERMAL_TEMP_INVALID) {
/*
Expand All @@ -506,6 +569,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
goto monitor;
}

tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;

tz->last_temperature = tz->temperature;
tz->temperature = temp;

Expand Down Expand Up @@ -540,22 +605,23 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
enum thermal_device_mode mode)
{
int ret = 0;
int ret;

mutex_lock(&tz->lock);

/* do nothing if mode isn't changing */
if (mode == tz->mode) {
mutex_unlock(&tz->lock);

return ret;
return 0;
}

if (tz->ops.change_mode)
ret = tz->ops.change_mode(tz, mode);
ret = __thermal_zone_device_set_mode(tz, mode);
if (ret) {
mutex_unlock(&tz->lock);

if (!ret)
tz->mode = mode;
return ret;
}

__thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);

Expand All @@ -566,7 +632,7 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
else
thermal_notify_tz_disable(tz);

return ret;
return 0;
}

int thermal_zone_device_enable(struct thermal_zone_device *tz)
Expand Down Expand Up @@ -1445,6 +1511,7 @@ thermal_zone_device_register_with_trips(const char *type,

thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;

/* sys I/F */
/* Add nodes that are always present via .groups */
Expand Down
10 changes: 7 additions & 3 deletions drivers/thermal/thermal_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ struct thermal_governor {
* @polling_delay_jiffies: number of jiffies to wait between polls when
* checking whether trip points have been crossed (0 for
* interrupt driven systems)
* @recheck_delay_jiffies: delay after a failed attempt to determine the zone
* temperature before trying again
* @temperature: current temperature. This is only for core code,
* drivers should use thermal_zone_get_temp() to get the
* current temperature
Expand Down Expand Up @@ -108,6 +110,7 @@ struct thermal_zone_device {
int num_trips;
unsigned long passive_delay_jiffies;
unsigned long polling_delay_jiffies;
unsigned long recheck_delay_jiffies;
int temperature;
int last_temperature;
int emul_temperature;
Expand Down Expand Up @@ -137,10 +140,11 @@ struct thermal_zone_device {
#define THERMAL_TEMP_INIT INT_MIN

/*
* Default delay after a failing thermal zone temperature check before
* attempting to check it again.
* Default and maximum delay after a failed thermal zone temperature check
* before attempting to check it again (in jiffies).
*/
#define THERMAL_RECHECK_DELAY_MS 250
#define THERMAL_RECHECK_DELAY msecs_to_jiffies(250)
#define THERMAL_MAX_RECHECK_DELAY (120 * HZ)

/* Default Thermal Governor */
#if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)
Expand Down

0 comments on commit 1fcaa5d

Please sign in to comment.