Diffstat (limited to 'drivers/cpuidle')
-rw-r--r-- | drivers/cpuidle/Kconfig            |  23
-rw-r--r-- | drivers/cpuidle/Makefile           |   6
-rw-r--r-- | drivers/cpuidle/coupled.c          | 727
-rw-r--r-- | drivers/cpuidle/cpuidle.c          | 542
-rw-r--r-- | drivers/cpuidle/cpuidle.h          |  65
-rw-r--r-- | drivers/cpuidle/driver.c           |  96
-rw-r--r-- | drivers/cpuidle/governor.c         | 141
-rw-r--r-- | drivers/cpuidle/governors/Makefile |   6
-rw-r--r-- | drivers/cpuidle/governors/ladder.c | 201
-rw-r--r-- | drivers/cpuidle/governors/menu.c   | 438
-rw-r--r-- | drivers/cpuidle/sysfs.c            | 433
11 files changed, 2678 insertions, 0 deletions
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig new file mode 100644 index 00000000..a76b689e --- /dev/null +++ b/drivers/cpuidle/Kconfig @@ -0,0 +1,23 @@ + +config CPU_IDLE + bool "CPU idle PM support" + default y if ACPI || PPC_PSERIES + help + CPU idle is a generic framework for supporting software-controlled + idle processor power management. It includes modular cross-platform + governors that can be swapped during runtime. + + If you're using an ACPI-enabled platform, you should say Y here. + +config CPU_IDLE_GOV_LADDER + bool + depends on CPU_IDLE + default y + +config CPU_IDLE_GOV_MENU + bool + depends on CPU_IDLE && NO_HZ + default y + +config ARCH_NEEDS_CPU_IDLE_COUPLED + def_bool n diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile new file mode 100644 index 00000000..38c8f69f --- /dev/null +++ b/drivers/cpuidle/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for cpuidle. +# + +obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ +obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c new file mode 100644 index 00000000..e95c72e9 --- /dev/null +++ b/drivers/cpuidle/coupled.c @@ -0,0 +1,727 @@ +/* + * coupled.c - helper functions to enter the same idle state on multiple cpus + * + * Copyright (c) 2011 Google, Inc. + * + * Author: Colin Cross <ccross@android.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> + +#include "cpuidle.h" + +/** + * DOC: Coupled cpuidle states + * + * On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the + * cpus cannot be independently powered down, either due to + * sequencing restrictions (on Tegra 2, cpu 0 must be the last to + * power down), or due to HW bugs (on OMAP4460, a cpu powering up + * will corrupt the gic state unless the other cpu runs a work + * around). Each cpu has a power state that it can enter without + * coordinating with the other cpu (usually Wait For Interrupt, or + * WFI), and one or more "coupled" power states that affect blocks + * shared between the cpus (L2 cache, interrupt controller, and + * sometimes the whole SoC). Entering a coupled power state must + * be tightly controlled on both cpus. + * + * This file implements a solution, where each cpu will wait in the + * WFI state until all cpus are ready to enter a coupled state, at + * which point the coupled state function will be called on all + * cpus at approximately the same time. + * + * Once all cpus are ready to enter idle, they are woken by an smp + * cross call. At this point, there is a chance that one of the + * cpus will find work to do, and choose not to enter idle. A + * final pass is needed to guarantee that all cpus will call the + * power state enter function at the same time. 
During this pass, + * each cpu will increment the ready counter, and continue once the + * ready counter matches the number of online coupled cpus. If any + * cpu exits idle, the other cpus will decrement their counter and + * retry. + * + * requested_state stores the deepest coupled idle state each cpu + * is ready for. It is assumed that the states are indexed from + * shallowest (highest power, lowest exit latency) to deepest + * (lowest power, highest exit latency). The requested_state + * variable is not locked. It is only written from the cpu that + * it stores (or by the on/offlining cpu if that cpu is offline), + * and only read after all the cpus are ready for the coupled idle + * state are are no longer updating it. + * + * Three atomic counters are used. alive_count tracks the number + * of cpus in the coupled set that are currently or soon will be + * online. waiting_count tracks the number of cpus that are in + * the waiting loop, in the ready loop, or in the coupled idle state. + * ready_count tracks the number of cpus that are in the ready loop + * or in the coupled idle state. + * + * To use coupled cpuidle states, a cpuidle driver must: + * + * Set struct cpuidle_device.coupled_cpus to the mask of all + * coupled cpus, usually the same as cpu_possible_mask if all cpus + * are part of the same cluster. The coupled_cpus mask must be + * set in the struct cpuidle_device for each cpu. + * + * Set struct cpuidle_device.safe_state to a state that is not a + * coupled state. This is usually WFI. + * + * Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each + * state that affects multiple cpus. + * + * Provide a struct cpuidle_state.enter function for each state + * that affects multiple cpus. This function is guaranteed to be + * called on all cpus at approximately the same time. The driver + * should ensure that the cpus all abort together if any cpu tries + * to abort once the function is called. The function should return + * with interrupts still disabled. + */ + +/** + * struct cpuidle_coupled - data for set of cpus that share a coupled idle state + * @coupled_cpus: mask of cpus that are part of the coupled set + * @requested_state: array of requested states for cpus in the coupled set + * @ready_waiting_counts: combined count of cpus in ready or waiting loops + * @online_count: count of cpus that are online + * @refcnt: reference count of cpuidle devices that are using this struct + * @prevent: flag to prevent coupled idle while a cpu is hotplugging + */ +struct cpuidle_coupled { + cpumask_t coupled_cpus; + int requested_state[NR_CPUS]; + atomic_t ready_waiting_counts; + int online_count; + int refcnt; + int prevent; +}; + +#define WAITING_BITS 16 +#define MAX_WAITING_CPUS (1 << WAITING_BITS) +#define WAITING_MASK (MAX_WAITING_CPUS - 1) +#define READY_MASK (~WAITING_MASK) + +#define CPUIDLE_COUPLED_NOT_IDLE (-1) + +static DEFINE_MUTEX(cpuidle_coupled_lock); +static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb); + +/* + * The cpuidle_coupled_poked_mask mask is used to avoid calling + * __smp_call_function_single with the per cpu call_single_data struct already + * in use. 
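A note for readers following the counter logic: ready_waiting_counts packs both counters into a single atomic_t, with the waiting count in the low WAITING_BITS bits and the ready count in the bits above them, exactly as the helper functions further down decode it. The following standalone C sketch (illustrative only, not part of the patch) shows the same arithmetic using the constants defined above:

#include <stdio.h>

#define WAITING_BITS		16
#define MAX_WAITING_CPUS	(1 << WAITING_BITS)
#define WAITING_MASK		(MAX_WAITING_CPUS - 1)

int main(void)
{
	/* e.g. four coupled cpus in the waiting loop, two of which are also ready */
	int counts = 2 * MAX_WAITING_CPUS + 4;

	printf("ready=%d waiting=%d\n",
	       counts >> WAITING_BITS,	/* ready count, as in cpuidle_coupled_cpus_ready() */
	       counts & WAITING_MASK);	/* waiting count, as in cpuidle_coupled_cpus_waiting() */
	return 0;
}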
This prevents a deadlock where two cpus are waiting for each others + * call_single_data struct to be available + */ +static cpumask_t cpuidle_coupled_poked_mask; + +/** + * cpuidle_coupled_parallel_barrier - synchronize all online coupled cpus + * @dev: cpuidle_device of the calling cpu + * @a: atomic variable to hold the barrier + * + * No caller to this function will return from this function until all online + * cpus in the same coupled group have called this function. Once any caller + * has returned from this function, the barrier is immediately available for + * reuse. + * + * The atomic variable a must be initialized to 0 before any cpu calls + * this function, will be reset to 0 before any cpu returns from this function. + * + * Must only be called from within a coupled idle state handler + * (state.enter when state.flags has CPUIDLE_FLAG_COUPLED set). + * + * Provides full smp barrier semantics before and after calling. + */ +void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a) +{ + int n = dev->coupled->online_count; + + smp_mb__before_atomic_inc(); + atomic_inc(a); + + while (atomic_read(a) < n) + cpu_relax(); + + if (atomic_inc_return(a) == n * 2) { + atomic_set(a, 0); + return; + } + + while (atomic_read(a) > n) + cpu_relax(); +} + +/** + * cpuidle_state_is_coupled - check if a state is part of a coupled set + * @dev: struct cpuidle_device for the current cpu + * @drv: struct cpuidle_driver for the platform + * @state: index of the target state in drv->states + * + * Returns true if the target state is coupled with cpus besides this one + */ +bool cpuidle_state_is_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int state) +{ + return drv->states[state].flags & CPUIDLE_FLAG_COUPLED; +} + +/** + * cpuidle_coupled_set_ready - mark a cpu as ready + * @coupled: the struct coupled that contains the current cpu + */ +static inline void cpuidle_coupled_set_ready(struct cpuidle_coupled *coupled) +{ + atomic_add(MAX_WAITING_CPUS, &coupled->ready_waiting_counts); +} + +/** + * cpuidle_coupled_set_not_ready - mark a cpu as not ready + * @coupled: the struct coupled that contains the current cpu + * + * Decrements the ready counter, unless the ready (and thus the waiting) counter + * is equal to the number of online cpus. Prevents a race where one cpu + * decrements the waiting counter and then re-increments it just before another + * cpu has decremented its ready counter, leading to the ready counter going + * down from the number of online cpus without going through the coupled idle + * state. + * + * Returns 0 if the counter was decremented successfully, -EINVAL if the ready + * counter was equal to the number of online cpus. + */ +static +inline int cpuidle_coupled_set_not_ready(struct cpuidle_coupled *coupled) +{ + int all; + int ret; + + all = coupled->online_count || (coupled->online_count << WAITING_BITS); + ret = atomic_add_unless(&coupled->ready_waiting_counts, + -MAX_WAITING_CPUS, all); + + return ret ? 0 : -EINVAL; +} + +/** + * cpuidle_coupled_no_cpus_ready - check if no cpus in a coupled set are ready + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all of the cpus in a coupled set are out of the ready loop. 
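To see the reusable two-phase counting barrier of cpuidle_coupled_parallel_barrier() above in isolation, here is a userspace sketch (illustrative only; C11 atomics and pthreads stand in for the kernel's atomic_t and per-cpu machinery, and N stands in for online_count):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define N 4				/* stand-in for coupled->online_count */

static atomic_int barrier_val;		/* stand-in for the caller-supplied atomic_t */

static void parallel_barrier(void)
{
	/* phase 1: count up to N, wait until every participant has arrived */
	atomic_fetch_add(&barrier_val, 1);
	while (atomic_load(&barrier_val) < N)
		;			/* cpu_relax() in the kernel version */

	/* phase 2: count up to 2*N; the last arrival resets the barrier */
	if (atomic_fetch_add(&barrier_val, 1) + 1 == 2 * N) {
		atomic_store(&barrier_val, 0);
		return;
	}
	while (atomic_load(&barrier_val) > N)
		;			/* wait for the reset; the barrier is then reusable */
}

static void *worker(void *arg)
{
	parallel_barrier();
	printf("thread %ld passed the barrier\n", (long)arg);
	return NULL;
}

int main(void)
{
	pthread_t t[N];
	long i;

	for (i = 0; i < N; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (i = 0; i < N; i++)
		pthread_join(t[i], NULL);
	return 0;
}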
+ */ +static inline int cpuidle_coupled_no_cpus_ready(struct cpuidle_coupled *coupled) +{ + int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS; + return r == 0; +} + +/** + * cpuidle_coupled_cpus_ready - check if all cpus in a coupled set are ready + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all cpus coupled to this target state are in the ready loop + */ +static inline bool cpuidle_coupled_cpus_ready(struct cpuidle_coupled *coupled) +{ + int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS; + return r == coupled->online_count; +} + +/** + * cpuidle_coupled_cpus_waiting - check if all cpus in a coupled set are waiting + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all cpus coupled to this target state are in the wait loop + */ +static inline bool cpuidle_coupled_cpus_waiting(struct cpuidle_coupled *coupled) +{ + int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK; + return w == coupled->online_count; +} + +/** + * cpuidle_coupled_no_cpus_waiting - check if no cpus in coupled set are waiting + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all of the cpus in a coupled set are out of the waiting loop. + */ +static inline int cpuidle_coupled_no_cpus_waiting(struct cpuidle_coupled *coupled) +{ + int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK; + return w == 0; +} + +/** + * cpuidle_coupled_get_state - determine the deepest idle state + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * + * Returns the deepest idle state that all coupled cpus can enter + */ +static inline int cpuidle_coupled_get_state(struct cpuidle_device *dev, + struct cpuidle_coupled *coupled) +{ + int i; + int state = INT_MAX; + + /* + * Read barrier ensures that read of requested_state is ordered after + * reads of ready_count. Matches the write barriers + * cpuidle_set_state_waiting. + */ + smp_rmb(); + + for_each_cpu_mask(i, coupled->coupled_cpus) + if (cpu_online(i) && coupled->requested_state[i] < state) + state = coupled->requested_state[i]; + + return state; +} + +static void cpuidle_coupled_poked(void *info) +{ + int cpu = (unsigned long)info; + cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask); +} + +/** + * cpuidle_coupled_poke - wake up a cpu that may be waiting + * @cpu: target cpu + * + * Ensures that the target cpu exits it's waiting idle state (if it is in it) + * and will see updates to waiting_count before it re-enters it's waiting idle + * state. + * + * If cpuidle_coupled_poked_mask is already set for the target cpu, that cpu + * either has or will soon have a pending IPI that will wake it out of idle, + * or it is currently processing the IPI and is not in idle. + */ +static void cpuidle_coupled_poke(int cpu) +{ + struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); + + if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poked_mask)) + __smp_call_function_single(cpu, csd, 0); +} + +/** + * cpuidle_coupled_poke_others - wake up all other cpus that may be waiting + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * + * Calls cpuidle_coupled_poke on all other online cpus. 
+ */ +static void cpuidle_coupled_poke_others(int this_cpu, + struct cpuidle_coupled *coupled) +{ + int cpu; + + for_each_cpu_mask(cpu, coupled->coupled_cpus) + if (cpu != this_cpu && cpu_online(cpu)) + cpuidle_coupled_poke(cpu); +} + +/** + * cpuidle_coupled_set_waiting - mark this cpu as in the wait loop + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * @next_state: the index in drv->states of the requested state for this cpu + * + * Updates the requested idle state for the specified cpuidle device, + * poking all coupled cpus out of idle if necessary to let them see the new + * state. + */ +static void cpuidle_coupled_set_waiting(int cpu, + struct cpuidle_coupled *coupled, int next_state) +{ + int w; + + coupled->requested_state[cpu] = next_state; + + /* + * If this is the last cpu to enter the waiting state, poke + * all the other cpus out of their waiting state so they can + * enter a deeper state. This can race with one of the cpus + * exiting the waiting state due to an interrupt and + * decrementing waiting_count, see comment below. + * + * The atomic_inc_return provides a write barrier to order the write + * to requested_state with the later write that increments ready_count. + */ + w = atomic_inc_return(&coupled->ready_waiting_counts) & WAITING_MASK; + if (w == coupled->online_count) + cpuidle_coupled_poke_others(cpu, coupled); +} + +/** + * cpuidle_coupled_set_not_waiting - mark this cpu as leaving the wait loop + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * + * Removes the requested idle state for the specified cpuidle device. + */ +static void cpuidle_coupled_set_not_waiting(int cpu, + struct cpuidle_coupled *coupled) +{ + /* + * Decrementing waiting count can race with incrementing it in + * cpuidle_coupled_set_waiting, but that's OK. Worst case, some + * cpus will increment ready_count and then spin until they + * notice that this cpu has cleared it's requested_state. + */ + atomic_dec(&coupled->ready_waiting_counts); + + coupled->requested_state[cpu] = CPUIDLE_COUPLED_NOT_IDLE; +} + +/** + * cpuidle_coupled_set_done - mark this cpu as leaving the ready loop + * @cpu: the current cpu + * @coupled: the struct coupled that contains the current cpu + * + * Marks this cpu as no longer in the ready and waiting loops. Decrements + * the waiting count first to prevent another cpu looping back in and seeing + * this cpu as waiting just before it exits idle. + */ +static void cpuidle_coupled_set_done(int cpu, struct cpuidle_coupled *coupled) +{ + cpuidle_coupled_set_not_waiting(cpu, coupled); + atomic_sub(MAX_WAITING_CPUS, &coupled->ready_waiting_counts); +} + +/** + * cpuidle_coupled_clear_pokes - spin until the poke interrupt is processed + * @cpu - this cpu + * + * Turns on interrupts and spins until any outstanding poke interrupts have + * been processed and the poke bit has been cleared. + * + * Other interrupts may also be processed while interrupts are enabled, so + * need_resched() must be tested after turning interrupts off again to make sure + * the interrupt didn't schedule work that should take the cpu out of idle. + * + * Returns 0 if need_resched was false, -EINTR if need_resched was true. + */ +static int cpuidle_coupled_clear_pokes(int cpu) +{ + local_irq_enable(); + while (cpumask_test_cpu(cpu, &cpuidle_coupled_poked_mask)) + cpu_relax(); + local_irq_disable(); + + return need_resched() ? 
-EINTR : 0; +} + +/** + * cpuidle_enter_state_coupled - attempt to enter a state with coupled cpus + * @dev: struct cpuidle_device for the current cpu + * @drv: struct cpuidle_driver for the platform + * @next_state: index of the requested state in drv->states + * + * Coordinate with coupled cpus to enter the target state. This is a two + * stage process. In the first stage, the cpus are operating independently, + * and may call into cpuidle_enter_state_coupled at completely different times. + * To save as much power as possible, the first cpus to call this function will + * go to an intermediate state (the cpuidle_device's safe state), and wait for + * all the other cpus to call this function. Once all coupled cpus are idle, + * the second stage will start. Each coupled cpu will spin until all cpus have + * guaranteed that they will call the target_state. + * + * This function must be called with interrupts disabled. It may enable + * interrupts while preparing for idle, and it will always return with + * interrupts enabled. + */ +int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state) +{ + int entered_state = -1; + struct cpuidle_coupled *coupled = dev->coupled; + + if (!coupled) + return -EINVAL; + + while (coupled->prevent) { + if (cpuidle_coupled_clear_pokes(dev->cpu)) { + local_irq_enable(); + return entered_state; + } + entered_state = cpuidle_enter_state(dev, drv, + dev->safe_state_index); + } + + /* Read barrier ensures online_count is read after prevent is cleared */ + smp_rmb(); + + cpuidle_coupled_set_waiting(dev->cpu, coupled, next_state); + +retry: + /* + * Wait for all coupled cpus to be idle, using the deepest state + * allowed for a single cpu. + */ + while (!cpuidle_coupled_cpus_waiting(coupled)) { + if (cpuidle_coupled_clear_pokes(dev->cpu)) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + if (coupled->prevent) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + entered_state = cpuidle_enter_state(dev, drv, + dev->safe_state_index); + } + + if (cpuidle_coupled_clear_pokes(dev->cpu)) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + /* + * All coupled cpus are probably idle. There is a small chance that + * one of the other cpus just became active. Increment the ready count, + * and spin until all coupled cpus have incremented the counter. Once a + * cpu has incremented the ready counter, it cannot abort idle and must + * spin until either all cpus have incremented the ready counter, or + * another cpu leaves idle and decrements the waiting counter. + */ + + cpuidle_coupled_set_ready(coupled); + while (!cpuidle_coupled_cpus_ready(coupled)) { + /* Check if any other cpus bailed out of idle. */ + if (!cpuidle_coupled_cpus_waiting(coupled)) + if (!cpuidle_coupled_set_not_ready(coupled)) + goto retry; + + cpu_relax(); + } + + /* all cpus have acked the coupled state */ + next_state = cpuidle_coupled_get_state(dev, coupled); + + entered_state = cpuidle_enter_state(dev, drv, next_state); + + cpuidle_coupled_set_done(dev->cpu, coupled); + +out: + /* + * Normal cpuidle states are expected to return with irqs enabled. + * That leads to an inefficiency where a cpu receiving an interrupt + * that brings it out of idle will process that interrupt before + * exiting the idle enter function and decrementing ready_count. All + * other cpus will need to spin waiting for the cpu that is processing + * the interrupt. 
If the driver returns with interrupts disabled, + * all other cpus will loop back into the safe idle state instead of + * spinning, saving power. + * + * Calling local_irq_enable here allows coupled states to return with + * interrupts disabled, but won't cause problems for drivers that + * exit with interrupts enabled. + */ + local_irq_enable(); + + /* + * Wait until all coupled cpus have exited idle. There is no risk that + * a cpu exits and re-enters the ready state because this cpu has + * already decremented its waiting_count. + */ + while (!cpuidle_coupled_no_cpus_ready(coupled)) + cpu_relax(); + + return entered_state; +} + +static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled) +{ + cpumask_t cpus; + cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus); + coupled->online_count = cpumask_weight(&cpus); +} + +/** + * cpuidle_coupled_register_device - register a coupled cpuidle device + * @dev: struct cpuidle_device for the current cpu + * + * Called from cpuidle_register_device to handle coupled idle init. Finds the + * cpuidle_coupled struct for this set of coupled cpus, or creates one if none + * exists yet. + */ +int cpuidle_coupled_register_device(struct cpuidle_device *dev) +{ + int cpu; + struct cpuidle_device *other_dev; + struct call_single_data *csd; + struct cpuidle_coupled *coupled; + + if (cpumask_empty(&dev->coupled_cpus)) + return 0; + + for_each_cpu_mask(cpu, dev->coupled_cpus) { + other_dev = per_cpu(cpuidle_devices, cpu); + if (other_dev && other_dev->coupled) { + coupled = other_dev->coupled; + goto have_coupled; + } + } + + /* No existing coupled info found, create a new one */ + coupled = kzalloc(sizeof(struct cpuidle_coupled), GFP_KERNEL); + if (!coupled) + return -ENOMEM; + + coupled->coupled_cpus = dev->coupled_cpus; + +have_coupled: + dev->coupled = coupled; + if (WARN_ON(!cpumask_equal(&dev->coupled_cpus, &coupled->coupled_cpus))) + coupled->prevent++; + + cpuidle_coupled_update_online_cpus(coupled); + + coupled->refcnt++; + + csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu); + csd->func = cpuidle_coupled_poked; + csd->info = (void *)(unsigned long)dev->cpu; + + return 0; +} + +/** + * cpuidle_coupled_unregister_device - unregister a coupled cpuidle device + * @dev: struct cpuidle_device for the current cpu + * + * Called from cpuidle_unregister_device to tear down coupled idle. Removes the + * cpu from the coupled idle set, and frees the cpuidle_coupled_info struct if + * this was the last cpu in the set. + */ +void cpuidle_coupled_unregister_device(struct cpuidle_device *dev) +{ + struct cpuidle_coupled *coupled = dev->coupled; + + if (cpumask_empty(&dev->coupled_cpus)) + return; + + if (--coupled->refcnt) + kfree(coupled); + dev->coupled = NULL; +} + +/** + * cpuidle_coupled_prevent_idle - prevent cpus from entering a coupled state + * @coupled: the struct coupled that contains the cpu that is changing state + * + * Disables coupled cpuidle on a coupled set of cpus. Used to ensure that + * cpu_online_mask doesn't change while cpus are coordinating coupled idle. + */ +static void cpuidle_coupled_prevent_idle(struct cpuidle_coupled *coupled) +{ + int cpu = get_cpu(); + + /* Force all cpus out of the waiting loop. 
*/ + coupled->prevent++; + cpuidle_coupled_poke_others(cpu, coupled); + put_cpu(); + while (!cpuidle_coupled_no_cpus_waiting(coupled)) + cpu_relax(); +} + +/** + * cpuidle_coupled_allow_idle - allows cpus to enter a coupled state + * @coupled: the struct coupled that contains the cpu that is changing state + * + * Enables coupled cpuidle on a coupled set of cpus. Used to ensure that + * cpu_online_mask doesn't change while cpus are coordinating coupled idle. + */ +static void cpuidle_coupled_allow_idle(struct cpuidle_coupled *coupled) +{ + int cpu = get_cpu(); + + /* + * Write barrier ensures readers see the new online_count when they + * see prevent == false. + */ + smp_wmb(); + coupled->prevent--; + /* Force cpus out of the prevent loop. */ + cpuidle_coupled_poke_others(cpu, coupled); + put_cpu(); +} + +/** + * cpuidle_coupled_cpu_notify - notifier called during hotplug transitions + * @nb: notifier block + * @action: hotplug transition + * @hcpu: target cpu number + * + * Called when a cpu is brought on or offline using hotplug. Updates the + * coupled cpu set appropriately + */ +static int cpuidle_coupled_cpu_notify(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + struct cpuidle_device *dev; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + case CPU_ONLINE: + case CPU_DEAD: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + break; + default: + return NOTIFY_OK; + } + + mutex_lock(&cpuidle_lock); + + dev = per_cpu(cpuidle_devices, cpu); + if (!dev->coupled) + goto out; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + cpuidle_coupled_prevent_idle(dev->coupled); + break; + case CPU_ONLINE: + case CPU_DEAD: + cpuidle_coupled_update_online_cpus(dev->coupled); + /* Fall through */ + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + cpuidle_coupled_allow_idle(dev->coupled); + break; + } + +out: + mutex_unlock(&cpuidle_lock); + return NOTIFY_OK; +} + +static struct notifier_block cpuidle_coupled_cpu_notifier = { + .notifier_call = cpuidle_coupled_cpu_notify, +}; + +static int __init cpuidle_coupled_init(void) +{ + return register_cpu_notifier(&cpuidle_coupled_cpu_notifier); +} +core_initcall(cpuidle_coupled_init); diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c new file mode 100644 index 00000000..e81cfda2 --- /dev/null +++ b/drivers/cpuidle/cpuidle.c @@ -0,0 +1,542 @@ +/* + * cpuidle.c - core cpuidle infrastructure + * + * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Shaohua Li <shaohua.li@intel.com> + * Adam Belay <abelay@novell.com> + * + * This code is licenced under the GPL. 
+ */ + +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/notifier.h> +#include <linux/pm_qos.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/ktime.h> +#include <linux/hrtimer.h> +#include <linux/module.h> +#include <trace/events/power.h> + +#include "cpuidle.h" + +DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices); + +DEFINE_MUTEX(cpuidle_lock); +LIST_HEAD(cpuidle_detected_devices); + +static int enabled_devices; +static int off __read_mostly; +static int initialized __read_mostly; + +int cpuidle_disabled(void) +{ + return off; +} +void disable_cpuidle(void) +{ + off = 1; +} + +#if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT) +static void cpuidle_kick_cpus(void) +{ + cpu_idle_wait(); +} +#elif defined(CONFIG_SMP) +# error "Arch needs cpu_idle_wait() equivalent here" +#else /* !CONFIG_ARCH_HAS_CPU_IDLE_WAIT && !CONFIG_SMP */ +static void cpuidle_kick_cpus(void) {} +#endif + +static int __cpuidle_register_device(struct cpuidle_device *dev); + +static inline int cpuidle_enter(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + struct cpuidle_state *target_state = &drv->states[index]; + return target_state->enter(dev, drv, index); +} + +static inline int cpuidle_enter_tk(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + return cpuidle_wrap_enter(dev, drv, index, cpuidle_enter); +} + +typedef int (*cpuidle_enter_t)(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); + +static cpuidle_enter_t cpuidle_enter_ops; + +/** + * cpuidle_play_dead - cpu off-lining + * + * Returns in case of an error or no driver + */ +int cpuidle_play_dead(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_driver(); + int i, dead_state = -1; + int power_usage = -1; + + if (!drv) + return -ENODEV; + + /* Find lowest-power state that supports long-term idle */ + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + if (s->power_usage < power_usage && s->enter_dead) { + power_usage = s->power_usage; + dead_state = i; + } + } + + if (dead_state != -1) + return drv->states[dead_state].enter_dead(dev, dead_state); + + return -ENODEV; +} + +/** + * cpuidle_enter_state - enter the state and update stats + * @dev: cpuidle device for this cpu + * @drv: cpuidle driver for this cpu + * @next_state: index into drv->states of the state to enter + */ +int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, + int next_state) +{ + int entered_state; + + entered_state = cpuidle_enter_ops(dev, drv, next_state); + + if (entered_state >= 0) { + /* Update cpuidle counters */ + /* This can be moved to within driver enter routine + * but that results in multiple copies of same code. 
+ */ + dev->states_usage[entered_state].time += + (unsigned long long)dev->last_residency; + dev->states_usage[entered_state].usage++; + } else { + dev->last_residency = 0; + } + + return entered_state; +} + +/** + * cpuidle_idle_call - the main idle loop + * + * NOTE: no locks or semaphores should be used here + * return non-zero on failure + */ +int cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_driver(); + int next_state, entered_state; + + if (off) + return -ENODEV; + + if (!initialized) + return -ENODEV; + + /* check if the device is ready */ + if (!dev || !dev->enabled) + return -EBUSY; + +#if 0 + /* shows regressions, re-enable for 2.6.29 */ + /* + * run any timers that can be run now, at this point + * before calculating the idle duration etc. + */ + hrtimer_peek_ahead_timers(); +#endif + + /* ask the governor for the next state */ + next_state = cpuidle_curr_governor->select(drv, dev); + if (need_resched()) { + local_irq_enable(); + return 0; + } + + trace_power_start_rcuidle(POWER_CSTATE, next_state, dev->cpu); + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + if (cpuidle_state_is_coupled(dev, drv, next_state)) + entered_state = cpuidle_enter_state_coupled(dev, drv, + next_state); + else + entered_state = cpuidle_enter_state(dev, drv, next_state); + + trace_power_end_rcuidle(dev->cpu); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + + /* give the governor an opportunity to reflect on the outcome */ + if (cpuidle_curr_governor->reflect) + cpuidle_curr_governor->reflect(dev, entered_state); + + return 0; +} + +/** + * cpuidle_install_idle_handler - installs the cpuidle idle loop handler + */ +void cpuidle_install_idle_handler(void) +{ + if (enabled_devices) { + /* Make sure all changes finished before we switch to new idle */ + smp_wmb(); + initialized = 1; + } +} + +/** + * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler + */ +void cpuidle_uninstall_idle_handler(void) +{ + if (enabled_devices) { + initialized = 0; + cpuidle_kick_cpus(); + } +} + +/** + * cpuidle_pause_and_lock - temporarily disables CPUIDLE + */ +void cpuidle_pause_and_lock(void) +{ + mutex_lock(&cpuidle_lock); + cpuidle_uninstall_idle_handler(); +} + +EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock); + +/** + * cpuidle_resume_and_unlock - resumes CPUIDLE operation + */ +void cpuidle_resume_and_unlock(void) +{ + cpuidle_install_idle_handler(); + mutex_unlock(&cpuidle_lock); +} + +EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock); + +/** + * cpuidle_wrap_enter - performs timekeeping and irqen around enter function + * @dev: pointer to a valid cpuidle_device object + * @drv: pointer to a valid cpuidle_driver object + * @index: index of the target cpuidle state. 
+ */ +int cpuidle_wrap_enter(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index, + int (*enter)(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index)) +{ + ktime_t time_start, time_end; + s64 diff; + + time_start = ktime_get(); + + index = enter(dev, drv, index); + + time_end = ktime_get(); + + local_irq_enable(); + + diff = ktime_to_us(ktime_sub(time_end, time_start)); + if (diff > INT_MAX) + diff = INT_MAX; + + dev->last_residency = (int) diff; + + return index; +} + +#ifdef CONFIG_ARCH_HAS_CPU_RELAX +static int poll_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + ktime_t t1, t2; + s64 diff; + + t1 = ktime_get(); + local_irq_enable(); + while (!need_resched()) + cpu_relax(); + + t2 = ktime_get(); + diff = ktime_to_us(ktime_sub(t2, t1)); + if (diff > INT_MAX) + diff = INT_MAX; + + dev->last_residency = (int) diff; + + return index; +} + +static void poll_idle_init(struct cpuidle_driver *drv) +{ + struct cpuidle_state *state = &drv->states[0]; + + snprintf(state->name, CPUIDLE_NAME_LEN, "POLL"); + snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); + state->exit_latency = 0; + state->target_residency = 0; + state->power_usage = -1; + state->flags = 0; + state->enter = poll_idle; + state->disable = 0; +} +#else +static void poll_idle_init(struct cpuidle_driver *drv) {} +#endif /* CONFIG_ARCH_HAS_CPU_RELAX */ + +/** + * cpuidle_enable_device - enables idle PM for a CPU + * @dev: the CPU + * + * This function must be called between cpuidle_pause_and_lock and + * cpuidle_resume_and_unlock when used externally. + */ +int cpuidle_enable_device(struct cpuidle_device *dev) +{ + int ret, i; + struct cpuidle_driver *drv = cpuidle_get_driver(); + + if (dev->enabled) + return 0; + if (!drv || !cpuidle_curr_governor) + return -EIO; + if (!dev->state_count) + dev->state_count = drv->state_count; + + if (dev->registered == 0) { + ret = __cpuidle_register_device(dev); + if (ret) + return ret; + } + + cpuidle_enter_ops = drv->en_core_tk_irqen ? + cpuidle_enter_tk : cpuidle_enter; + + poll_idle_init(drv); + + if ((ret = cpuidle_add_state_sysfs(dev))) + return ret; + + if (cpuidle_curr_governor->enable && + (ret = cpuidle_curr_governor->enable(drv, dev))) + goto fail_sysfs; + + for (i = 0; i < dev->state_count; i++) { + dev->states_usage[i].usage = 0; + dev->states_usage[i].time = 0; + } + dev->last_residency = 0; + + smp_wmb(); + + dev->enabled = 1; + + enabled_devices++; + return 0; + +fail_sysfs: + cpuidle_remove_state_sysfs(dev); + + return ret; +} + +EXPORT_SYMBOL_GPL(cpuidle_enable_device); + +/** + * cpuidle_disable_device - disables idle PM for a CPU + * @dev: the CPU + * + * This function must be called between cpuidle_pause_and_lock and + * cpuidle_resume_and_unlock when used externally. 
+ */ +void cpuidle_disable_device(struct cpuidle_device *dev) +{ + if (!dev->enabled) + return; + if (!cpuidle_get_driver() || !cpuidle_curr_governor) + return; + + dev->enabled = 0; + + if (cpuidle_curr_governor->disable) + cpuidle_curr_governor->disable(cpuidle_get_driver(), dev); + + cpuidle_remove_state_sysfs(dev); + enabled_devices--; +} + +EXPORT_SYMBOL_GPL(cpuidle_disable_device); + +/** + * __cpuidle_register_device - internal register function called before register + * and enable routines + * @dev: the cpu + * + * cpuidle_lock mutex must be held before this is called + */ +static int __cpuidle_register_device(struct cpuidle_device *dev) +{ + int ret; + struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu); + struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver(); + + if (!dev) + return -EINVAL; + if (!try_module_get(cpuidle_driver->owner)) + return -EINVAL; + + init_completion(&dev->kobj_unregister); + + per_cpu(cpuidle_devices, dev->cpu) = dev; + list_add(&dev->device_list, &cpuidle_detected_devices); + ret = cpuidle_add_sysfs(cpu_dev); + if (ret) + goto err_sysfs; + + ret = cpuidle_coupled_register_device(dev); + if (ret) + goto err_coupled; + + dev->registered = 1; + return 0; + +err_coupled: + cpuidle_remove_sysfs(cpu_dev); + wait_for_completion(&dev->kobj_unregister); +err_sysfs: + list_del(&dev->device_list); + per_cpu(cpuidle_devices, dev->cpu) = NULL; + module_put(cpuidle_driver->owner); + return ret; +} + +/** + * cpuidle_register_device - registers a CPU's idle PM feature + * @dev: the cpu + */ +int cpuidle_register_device(struct cpuidle_device *dev) +{ + int ret; + + mutex_lock(&cpuidle_lock); + + if ((ret = __cpuidle_register_device(dev))) { + mutex_unlock(&cpuidle_lock); + return ret; + } + + cpuidle_enable_device(dev); + cpuidle_install_idle_handler(); + + mutex_unlock(&cpuidle_lock); + + return 0; + +} + +EXPORT_SYMBOL_GPL(cpuidle_register_device); + +/** + * cpuidle_unregister_device - unregisters a CPU's idle PM feature + * @dev: the cpu + */ +void cpuidle_unregister_device(struct cpuidle_device *dev) +{ + struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu); + struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver(); + + if (dev->registered == 0) + return; + + cpuidle_pause_and_lock(); + + cpuidle_disable_device(dev); + + cpuidle_remove_sysfs(cpu_dev); + list_del(&dev->device_list); + wait_for_completion(&dev->kobj_unregister); + per_cpu(cpuidle_devices, dev->cpu) = NULL; + + cpuidle_coupled_unregister_device(dev); + + cpuidle_resume_and_unlock(); + + module_put(cpuidle_driver->owner); +} + +EXPORT_SYMBOL_GPL(cpuidle_unregister_device); + +#ifdef CONFIG_SMP + +static void smp_callback(void *v) +{ + /* we already woke the CPU up, nothing more to do */ +} + +/* + * This function gets called when a part of the kernel has a new latency + * requirement. This means we need to get all processors out of their C-state, + * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that + * wakes them all right up. 
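For context on what drives this notifier: any kernel code that needs bounded wakeup latency registers a PM_QOS_CPU_DMA_LATENCY request, and adding, updating or removing such a request runs the notifier chain that cpuidle hooks into just below. A minimal sketch of such a requester, assuming the pm_qos API of this kernel generation (the module and request names here are made up for illustration):

#include <linux/module.h>
#include <linux/pm_qos.h>

/* hypothetical example request; not part of this patch */
static struct pm_qos_request example_latency_req;

static int __init example_latency_init(void)
{
	/* tolerate at most 20 us of wakeup latency while this request exists */
	pm_qos_add_request(&example_latency_req, PM_QOS_CPU_DMA_LATENCY, 20);
	return 0;
}

static void __exit example_latency_exit(void)
{
	pm_qos_remove_request(&example_latency_req);
}

module_init(example_latency_init);
module_exit(example_latency_exit);
MODULE_LICENSE("GPL");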
+ */ +static int cpuidle_latency_notify(struct notifier_block *b, + unsigned long l, void *v) +{ + smp_call_function(smp_callback, NULL, 1); + return NOTIFY_OK; +} + +static struct notifier_block cpuidle_latency_notifier = { + .notifier_call = cpuidle_latency_notify, +}; + +static inline void latency_notifier_init(struct notifier_block *n) +{ + pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, n); +} + +#else /* CONFIG_SMP */ + +#define latency_notifier_init(x) do { } while (0) + +#endif /* CONFIG_SMP */ + +/** + * cpuidle_init - core initializer + */ +static int __init cpuidle_init(void) +{ + int ret; + + if (cpuidle_disabled()) + return -ENODEV; + + ret = cpuidle_add_interface(cpu_subsys.dev_root); + if (ret) + return ret; + + latency_notifier_init(&cpuidle_latency_notifier); + + return 0; +} + +module_param(off, int, 0444); +core_initcall(cpuidle_init); diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h new file mode 100644 index 00000000..76e7f696 --- /dev/null +++ b/drivers/cpuidle/cpuidle.h @@ -0,0 +1,65 @@ +/* + * cpuidle.h - The internal header file + */ + +#ifndef __DRIVER_CPUIDLE_H +#define __DRIVER_CPUIDLE_H + +#include <linux/device.h> + +/* For internal use only */ +extern struct cpuidle_governor *cpuidle_curr_governor; +extern struct list_head cpuidle_governors; +extern struct list_head cpuidle_detected_devices; +extern struct mutex cpuidle_lock; +extern spinlock_t cpuidle_driver_lock; +extern int cpuidle_disabled(void); +extern int cpuidle_enter_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state); + +/* idle loop */ +extern void cpuidle_install_idle_handler(void); +extern void cpuidle_uninstall_idle_handler(void); + +/* governors */ +extern int cpuidle_switch_governor(struct cpuidle_governor *gov); + +/* sysfs */ +extern int cpuidle_add_interface(struct device *dev); +extern void cpuidle_remove_interface(struct device *dev); +extern int cpuidle_add_state_sysfs(struct cpuidle_device *device); +extern void cpuidle_remove_state_sysfs(struct cpuidle_device *device); +extern int cpuidle_add_sysfs(struct device *dev); +extern void cpuidle_remove_sysfs(struct device *dev); + +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED +bool cpuidle_state_is_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int state); +int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state); +int cpuidle_coupled_register_device(struct cpuidle_device *dev); +void cpuidle_coupled_unregister_device(struct cpuidle_device *dev); +#else +static inline bool cpuidle_state_is_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int state) +{ + return false; +} + +static inline int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state) +{ + return -1; +} + +static inline int cpuidle_coupled_register_device(struct cpuidle_device *dev) +{ + return 0; +} + +static inline void cpuidle_coupled_unregister_device(struct cpuidle_device *dev) +{ +} +#endif + +#endif /* __DRIVER_CPUIDLE_H */ diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c new file mode 100644 index 00000000..40cd3f30 --- /dev/null +++ b/drivers/cpuidle/driver.c @@ -0,0 +1,96 @@ +/* + * driver.c - driver support + * + * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Shaohua Li <shaohua.li@intel.com> + * Adam Belay <abelay@novell.com> + * + * This code is licenced under the GPL. 
+ */ + +#include <linux/mutex.h> +#include <linux/module.h> +#include <linux/cpuidle.h> + +#include "cpuidle.h" + +static struct cpuidle_driver *cpuidle_curr_driver; +DEFINE_SPINLOCK(cpuidle_driver_lock); + +static void __cpuidle_register_driver(struct cpuidle_driver *drv) +{ + int i; + /* + * cpuidle driver should set the drv->power_specified bit + * before registering if the driver provides + * power_usage numbers. + * + * If power_specified is not set, + * we fill in power_usage with decreasing values as the + * cpuidle code has an implicit assumption that state Cn + * uses less power than C(n-1). + * + * With CONFIG_ARCH_HAS_CPU_RELAX, C0 is already assigned + * an power value of -1. So we use -2, -3, etc, for other + * c-states. + */ + if (!drv->power_specified) { + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) + drv->states[i].power_usage = -1 - i; + } +} + + +/** + * cpuidle_register_driver - registers a driver + * @drv: the driver + */ +int cpuidle_register_driver(struct cpuidle_driver *drv) +{ + if (!drv || !drv->state_count) + return -EINVAL; + + if (cpuidle_disabled()) + return -ENODEV; + + spin_lock(&cpuidle_driver_lock); + if (cpuidle_curr_driver) { + spin_unlock(&cpuidle_driver_lock); + return -EBUSY; + } + __cpuidle_register_driver(drv); + cpuidle_curr_driver = drv; + spin_unlock(&cpuidle_driver_lock); + + return 0; +} + +EXPORT_SYMBOL_GPL(cpuidle_register_driver); + +/** + * cpuidle_get_driver - return the current driver + */ +struct cpuidle_driver *cpuidle_get_driver(void) +{ + return cpuidle_curr_driver; +} +EXPORT_SYMBOL_GPL(cpuidle_get_driver); + +/** + * cpuidle_unregister_driver - unregisters a driver + * @drv: the driver + */ +void cpuidle_unregister_driver(struct cpuidle_driver *drv) +{ + if (drv != cpuidle_curr_driver) { + WARN(1, "invalid cpuidle_unregister_driver(%s)\n", + drv->name); + return; + } + + spin_lock(&cpuidle_driver_lock); + cpuidle_curr_driver = NULL; + spin_unlock(&cpuidle_driver_lock); +} + +EXPORT_SYMBOL_GPL(cpuidle_unregister_driver); diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c new file mode 100644 index 00000000..ea2f8e7a --- /dev/null +++ b/drivers/cpuidle/governor.c @@ -0,0 +1,141 @@ +/* + * governor.c - governor support + * + * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Shaohua Li <shaohua.li@intel.com> + * Adam Belay <abelay@novell.com> + * + * This code is licenced under the GPL. + */ + +#include <linux/mutex.h> +#include <linux/module.h> +#include <linux/cpuidle.h> + +#include "cpuidle.h" + +LIST_HEAD(cpuidle_governors); +struct cpuidle_governor *cpuidle_curr_governor; + +/** + * __cpuidle_find_governor - finds a governor of the specified name + * @str: the name + * + * Must be called with cpuidle_lock acquired. + */ +static struct cpuidle_governor * __cpuidle_find_governor(const char *str) +{ + struct cpuidle_governor *gov; + + list_for_each_entry(gov, &cpuidle_governors, governor_list) + if (!strnicmp(str, gov->name, CPUIDLE_NAME_LEN)) + return gov; + + return NULL; +} + +/** + * cpuidle_switch_governor - changes the governor + * @gov: the new target governor + * + * NOTE: "gov" can be NULL to specify disabled + * Must be called with cpuidle_lock acquired. 
+ */ +int cpuidle_switch_governor(struct cpuidle_governor *gov) +{ + struct cpuidle_device *dev; + + if (gov == cpuidle_curr_governor) + return 0; + + cpuidle_uninstall_idle_handler(); + + if (cpuidle_curr_governor) { + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_disable_device(dev); + module_put(cpuidle_curr_governor->owner); + } + + cpuidle_curr_governor = gov; + + if (gov) { + if (!try_module_get(cpuidle_curr_governor->owner)) + return -EINVAL; + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_enable_device(dev); + cpuidle_install_idle_handler(); + printk(KERN_INFO "cpuidle: using governor %s\n", gov->name); + } + + return 0; +} + +/** + * cpuidle_register_governor - registers a governor + * @gov: the governor + */ +int cpuidle_register_governor(struct cpuidle_governor *gov) +{ + int ret = -EEXIST; + + if (!gov || !gov->select) + return -EINVAL; + + if (cpuidle_disabled()) + return -ENODEV; + + mutex_lock(&cpuidle_lock); + if (__cpuidle_find_governor(gov->name) == NULL) { + ret = 0; + list_add_tail(&gov->governor_list, &cpuidle_governors); + if (!cpuidle_curr_governor || + cpuidle_curr_governor->rating < gov->rating) + cpuidle_switch_governor(gov); + } + mutex_unlock(&cpuidle_lock); + + return ret; +} + +/** + * cpuidle_replace_governor - find a replacement governor + * @exclude_rating: the rating that will be skipped while looking for + * new governor. + */ +static struct cpuidle_governor *cpuidle_replace_governor(int exclude_rating) +{ + struct cpuidle_governor *gov; + struct cpuidle_governor *ret_gov = NULL; + unsigned int max_rating = 0; + + list_for_each_entry(gov, &cpuidle_governors, governor_list) { + if (gov->rating == exclude_rating) + continue; + if (gov->rating > max_rating) { + max_rating = gov->rating; + ret_gov = gov; + } + } + + return ret_gov; +} + +/** + * cpuidle_unregister_governor - unregisters a governor + * @gov: the governor + */ +void cpuidle_unregister_governor(struct cpuidle_governor *gov) +{ + if (!gov) + return; + + mutex_lock(&cpuidle_lock); + if (gov == cpuidle_curr_governor) { + struct cpuidle_governor *new_gov; + new_gov = cpuidle_replace_governor(gov->rating); + cpuidle_switch_governor(new_gov); + } + list_del(&gov->governor_list); + mutex_unlock(&cpuidle_lock); +} + diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile new file mode 100644 index 00000000..1b512722 --- /dev/null +++ b/drivers/cpuidle/governors/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for cpuidle governors. +# + +obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o +obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c new file mode 100644 index 00000000..b6a09ea8 --- /dev/null +++ b/drivers/cpuidle/governors/ladder.c @@ -0,0 +1,201 @@ +/* + * ladder.c - the residency ladder algorithm + * + * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> + * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de> + * + * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Shaohua Li <shaohua.li@intel.com> + * Adam Belay <abelay@novell.com> + * + * This code is licenced under the GPL. 
+ */ + +#include <linux/kernel.h> +#include <linux/cpuidle.h> +#include <linux/pm_qos.h> +#include <linux/module.h> +#include <linux/jiffies.h> + +#include <asm/io.h> +#include <asm/uaccess.h> + +#define PROMOTION_COUNT 4 +#define DEMOTION_COUNT 1 + +struct ladder_device_state { + struct { + u32 promotion_count; + u32 demotion_count; + u32 promotion_time; + u32 demotion_time; + } threshold; + struct { + int promotion_count; + int demotion_count; + } stats; +}; + +struct ladder_device { + struct ladder_device_state states[CPUIDLE_STATE_MAX]; + int last_state_idx; +}; + +static DEFINE_PER_CPU(struct ladder_device, ladder_devices); + +/** + * ladder_do_selection - prepares private data for a state change + * @ldev: the ladder device + * @old_idx: the current state index + * @new_idx: the new target state index + */ +static inline void ladder_do_selection(struct ladder_device *ldev, + int old_idx, int new_idx) +{ + ldev->states[old_idx].stats.promotion_count = 0; + ldev->states[old_idx].stats.demotion_count = 0; + ldev->last_state_idx = new_idx; +} + +/** + * ladder_select_state - selects the next state to enter + * @drv: cpuidle driver + * @dev: the CPU + */ +static int ladder_select_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + struct ladder_device *ldev = &__get_cpu_var(ladder_devices); + struct ladder_device_state *last_state; + int last_residency, last_idx = ldev->last_state_idx; + int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); + + /* Special case when user has set very strict latency requirement */ + if (unlikely(latency_req == 0)) { + ladder_do_selection(ldev, last_idx, 0); + return 0; + } + + last_state = &ldev->states[last_idx]; + + if (drv->states[last_idx].flags & CPUIDLE_FLAG_TIME_VALID) { + last_residency = cpuidle_get_last_residency(dev) - \ + drv->states[last_idx].exit_latency; + } + else + last_residency = last_state->threshold.promotion_time + 1; + + /* consider promotion */ + if (last_idx < drv->state_count - 1 && + last_residency > last_state->threshold.promotion_time && + drv->states[last_idx + 1].exit_latency <= latency_req) { + last_state->stats.promotion_count++; + last_state->stats.demotion_count = 0; + if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { + ladder_do_selection(ldev, last_idx, last_idx + 1); + return last_idx + 1; + } + } + + /* consider demotion */ + if (last_idx > CPUIDLE_DRIVER_STATE_START && + drv->states[last_idx].exit_latency > latency_req) { + int i; + + for (i = last_idx - 1; i > CPUIDLE_DRIVER_STATE_START; i--) { + if (drv->states[i].exit_latency <= latency_req) + break; + } + ladder_do_selection(ldev, last_idx, i); + return i; + } + + if (last_idx > CPUIDLE_DRIVER_STATE_START && + last_residency < last_state->threshold.demotion_time) { + last_state->stats.demotion_count++; + last_state->stats.promotion_count = 0; + if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { + ladder_do_selection(ldev, last_idx, last_idx - 1); + return last_idx - 1; + } + } + + /* otherwise remain at the current state */ + return last_idx; +} + +/** + * ladder_enable_device - setup for the governor + * @drv: cpuidle driver + * @dev: the CPU + */ +static int ladder_enable_device(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + int i; + struct ladder_device *ldev = &per_cpu(ladder_devices, dev->cpu); + struct ladder_device_state *lstate; + struct cpuidle_state *state; + + ldev->last_state_idx = CPUIDLE_DRIVER_STATE_START; + + for (i = 0; i < drv->state_count; i++) { + 
state = &drv->states[i]; + lstate = &ldev->states[i]; + + lstate->stats.promotion_count = 0; + lstate->stats.demotion_count = 0; + + lstate->threshold.promotion_count = PROMOTION_COUNT; + lstate->threshold.demotion_count = DEMOTION_COUNT; + + if (i < drv->state_count - 1) + lstate->threshold.promotion_time = state->exit_latency; + if (i > 0) + lstate->threshold.demotion_time = state->exit_latency; + } + + return 0; +} + +/** + * ladder_reflect - update the correct last_state_idx + * @dev: the CPU + * @index: the index of actual state entered + */ +static void ladder_reflect(struct cpuidle_device *dev, int index) +{ + struct ladder_device *ldev = &__get_cpu_var(ladder_devices); + if (index > 0) + ldev->last_state_idx = index; +} + +static struct cpuidle_governor ladder_governor = { + .name = "ladder", + .rating = 10, + .enable = ladder_enable_device, + .select = ladder_select_state, + .reflect = ladder_reflect, + .owner = THIS_MODULE, +}; + +/** + * init_ladder - initializes the governor + */ +static int __init init_ladder(void) +{ + return cpuidle_register_governor(&ladder_governor); +} + +/** + * exit_ladder - exits the governor + */ +static void __exit exit_ladder(void) +{ + cpuidle_unregister_governor(&ladder_governor); +} + +MODULE_LICENSE("GPL"); +module_init(init_ladder); +module_exit(exit_ladder); diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c new file mode 100644 index 00000000..746f1e6b --- /dev/null +++ b/drivers/cpuidle/governors/menu.c @@ -0,0 +1,438 @@ +/* + * menu.c - the menu idle governor + * + * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com> + * Copyright (C) 2009 Intel Corporation + * Author: + * Arjan van de Ven <arjan@linux.intel.com> + * + * This code is licenced under the GPL version 2 as described + * in the COPYING file that acompanies the Linux Kernel. + */ + +#include <linux/kernel.h> +#include <linux/cpuidle.h> +#include <linux/pm_qos.h> +#include <linux/time.h> +#include <linux/ktime.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/sched.h> +#include <linux/math64.h> +#include <linux/module.h> + +#define BUCKETS 12 +#define INTERVALS 8 +#define RESOLUTION 1024 +#define DECAY 8 +#define MAX_INTERESTING 50000 +#define STDDEV_THRESH 400 + + +/* + * Concepts and ideas behind the menu governor + * + * For the menu governor, there are 3 decision factors for picking a C + * state: + * 1) Energy break even point + * 2) Performance impact + * 3) Latency tolerance (from pmqos infrastructure) + * These these three factors are treated independently. + * + * Energy break even point + * ----------------------- + * C state entry and exit have an energy cost, and a certain amount of time in + * the C state is required to actually break even on this cost. CPUIDLE + * provides us this duration in the "target_residency" field. So all that we + * need is a good prediction of how long we'll be idle. Like the traditional + * menu governor, we start with the actual known "next timer event" time. + * + * Since there are other source of wakeups (interrupts for example) than + * the next timer event, this estimation is rather optimistic. To get a + * more realistic estimate, a correction factor is applied to the estimate, + * that is based on historic behavior. For example, if in the past the actual + * duration always was 50% of the next timer tick, the correction factor will + * be 0.5. + * + * menu uses a running average for this correction factor, however it uses a + * set of factors, not just a single factor. 
This stems from the realization + * that the ratio is dependent on the order of magnitude of the expected + * duration; if we expect 500 milliseconds of idle time the likelihood of + * getting an interrupt very early is much higher than if we expect 50 micro + * seconds of idle time. A second independent factor that has big impact on + * the actual factor is if there is (disk) IO outstanding or not. + * (as a special twist, we consider every sleep longer than 50 milliseconds + * as perfect; there are no power gains for sleeping longer than this) + * + * For these two reasons we keep an array of 12 independent factors, that gets + * indexed based on the magnitude of the expected duration as well as the + * "is IO outstanding" property. + * + * Repeatable-interval-detector + * ---------------------------- + * There are some cases where "next timer" is a completely unusable predictor: + * Those cases where the interval is fixed, for example due to hardware + * interrupt mitigation, but also due to fixed transfer rate devices such as + * mice. + * For this, we use a different predictor: We track the duration of the last 8 + * intervals and if the stand deviation of these 8 intervals is below a + * threshold value, we use the average of these intervals as prediction. + * + * Limiting Performance Impact + * --------------------------- + * C states, especially those with large exit latencies, can have a real + * noticeable impact on workloads, which is not acceptable for most sysadmins, + * and in addition, less performance has a power price of its own. + * + * As a general rule of thumb, menu assumes that the following heuristic + * holds: + * The busier the system, the less impact of C states is acceptable + * + * This rule-of-thumb is implemented using a performance-multiplier: + * If the exit latency times the performance multiplier is longer than + * the predicted duration, the C state is not considered a candidate + * for selection due to a too high performance impact. So the higher + * this multiplier is, the longer we need to be idle to pick a deep C + * state, and thus the less likely a busy CPU will hit such a deep + * C state. + * + * Two factors are used in determing this multiplier: + * a value of 10 is added for each point of "per cpu load average" we have. + * a value of 5 points is added for each process that is waiting for + * IO on this CPU. + * (these values are experimentally determined) + * + * The load average factor gives a longer term (few seconds) input to the + * decision, while the iowait value gives a cpu local instantanious input. + * The iowait factor may look low, but realize that this is also already + * represented in the system load average. + * + */ + +struct menu_device { + int last_state_idx; + int needs_update; + + unsigned int expected_us; + u64 predicted_us; + unsigned int exit_us; + unsigned int bucket; + u64 correction_factor[BUCKETS]; + u32 intervals[INTERVALS]; + int interval_ptr; +}; + + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +static int get_loadavg(void) +{ + unsigned long this = this_cpu_load(); + + + return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10; +} + +static inline int which_bucket(unsigned int duration) +{ + int bucket = 0; + + /* + * We keep two groups of stats; one with no + * IO pending, one without. 
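A worked example of the correction-factor arithmetic described above may help before reading menu_select() further down: the factor is stored scaled by RESOLUTION * DECAY, so a value of 4096 out of 8192 means that past sleeps averaged roughly half of the next-timer estimate. Illustrative userspace arithmetic (the plain division here stands in for the kernel's div_round64() helper):

#include <stdio.h>
#include <stdint.h>

#define RESOLUTION	1024
#define DECAY		8

/* userspace stand-in for div_round64(); the kernel version avoids a 64-bit division */
static uint64_t div_round64(uint64_t dividend, uint32_t divisor)
{
	return (dividend + divisor / 2) / divisor;
}

int main(void)
{
	uint64_t expected_us = 1000;		/* next timer fires in 1 ms */
	uint64_t correction_factor = 4096;	/* history: we sleep about 50% of that */
	uint64_t predicted_us;

	predicted_us = div_round64(expected_us * correction_factor,
				   RESOLUTION * DECAY);
	printf("predicted_us = %llu\n", (unsigned long long)predicted_us);	/* prints 500 */
	return 0;
}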
+ * This allows us to calculate + * E(duration)|iowait + */ + if (nr_iowait_cpu(smp_processor_id())) + bucket = BUCKETS/2; + + if (duration < 10) + return bucket; + if (duration < 100) + return bucket + 1; + if (duration < 1000) + return bucket + 2; + if (duration < 10000) + return bucket + 3; + if (duration < 100000) + return bucket + 4; + return bucket + 5; +} + +/* + * Return a multiplier for the exit latency that is intended + * to take performance requirements into account. + * The more performance critical we estimate the system + * to be, the higher this multiplier, and thus the higher + * the barrier to go to an expensive C state. + */ +static inline int performance_multiplier(void) +{ + int mult = 1; + + /* for higher loadavg, we are more reluctant */ + + /* + * this doesn't work as intended - it is almost always 0, but can + * sometimes, depending on workload, spike very high into the hundreds + * even when the average cpu load is under 10%. + */ + /* mult += 2 * get_loadavg(); */ + + /* for IO wait tasks (per cpu!) we add 5x each */ + mult += 10 * nr_iowait_cpu(smp_processor_id()); + + return mult; +} + +static DEFINE_PER_CPU(struct menu_device, menu_devices); + +static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); + +/* This implements DIV_ROUND_CLOSEST but avoids 64 bit division */ +static u64 div_round64(u64 dividend, u32 divisor) +{ + return div_u64(dividend + (divisor / 2), divisor); +} + +/* + * Try detecting repeating patterns by keeping track of the last 8 + * intervals, and checking if the standard deviation of that set + * of points is below a threshold. If it is... then use the + * average of these 8 points as the estimated value. + */ +static void detect_repeating_patterns(struct menu_device *data) +{ + int i; + uint64_t avg = 0; + uint64_t stddev = 0; /* contains the square of the std deviation */ + + /* first calculate average and standard deviation of the past */ + for (i = 0; i < INTERVALS; i++) + avg += data->intervals[i]; + avg = avg / INTERVALS; + + /* if the avg is beyond the known next tick, it's worthless */ + if (avg > data->expected_us) + return; + + for (i = 0; i < INTERVALS; i++) + stddev += (data->intervals[i] - avg) * + (data->intervals[i] - avg); + + stddev = stddev / INTERVALS; + + /* + * now.. if stddev is small.. then assume we have a + * repeating pattern and predict we keep doing this. 
+ */ + + if (avg && stddev < STDDEV_THRESH) + data->predicted_us = avg; +} + +/** + * menu_select - selects the next idle state to enter + * @drv: cpuidle driver containing state data + * @dev: the CPU + */ +static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) +{ + struct menu_device *data = &__get_cpu_var(menu_devices); + int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); + int power_usage = -1; + int i; + int multiplier; + struct timespec t; + + if (data->needs_update) { + menu_update(drv, dev); + data->needs_update = 0; + } + + data->last_state_idx = 0; + data->exit_us = 0; + + /* Special case when user has set very strict latency requirement */ + if (unlikely(latency_req == 0)) + return 0; + + /* determine the expected residency time, round up */ + t = ktime_to_timespec(tick_nohz_get_sleep_length()); + data->expected_us = + t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC; + + + data->bucket = which_bucket(data->expected_us); + + multiplier = performance_multiplier(); + + /* + * if the correction factor is 0 (eg first time init or cpu hotplug + * etc), we actually want to start out with a unity factor. + */ + if (data->correction_factor[data->bucket] == 0) + data->correction_factor[data->bucket] = RESOLUTION * DECAY; + + /* Make sure to round up for half microseconds */ + data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket], + RESOLUTION * DECAY); + + detect_repeating_patterns(data); + + /* + * We want to default to C1 (hlt), not to busy polling + * unless the timer is happening really really soon. + */ + if (data->expected_us > 5 && + drv->states[CPUIDLE_DRIVER_STATE_START].disable == 0) + data->last_state_idx = CPUIDLE_DRIVER_STATE_START; + + /* + * Find the idle state with the lowest power while satisfying + * our constraints. + */ + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + if (s->disable) + continue; + if (s->target_residency > data->predicted_us) + continue; + if (s->exit_latency > latency_req) + continue; + if (s->exit_latency * multiplier > data->predicted_us) + continue; + + if (s->power_usage < power_usage) { + power_usage = s->power_usage; + data->last_state_idx = i; + data->exit_us = s->exit_latency; + } + } + + return data->last_state_idx; +} + +/** + * menu_reflect - records that data structures need update + * @dev: the CPU + * @index: the index of actual entered state + * + * NOTE: it's important to be fast here because this operation will add to + * the overall exit latency. + */ +static void menu_reflect(struct cpuidle_device *dev, int index) +{ + struct menu_device *data = &__get_cpu_var(menu_devices); + data->last_state_idx = index; + if (index >= 0) + data->needs_update = 1; +} + +/** + * menu_update - attempts to guess what happened after entry + * @drv: cpuidle driver containing state data + * @dev: the CPU + */ +static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) +{ + struct menu_device *data = &__get_cpu_var(menu_devices); + int last_idx = data->last_state_idx; + unsigned int last_idle_us = cpuidle_get_last_residency(dev); + struct cpuidle_state *target = &drv->states[last_idx]; + unsigned int measured_us; + u64 new_factor; + + /* + * Ugh, this idle state doesn't support residency measurements, so we + * are basically lost in the dark. As a compromise, assume we slept + * for the whole expected time. 
+ */ + if (unlikely(!(target->flags & CPUIDLE_FLAG_TIME_VALID))) + last_idle_us = data->expected_us; + + + measured_us = last_idle_us; + + /* + * We correct for the exit latency; we are assuming here that the + * exit latency happens after the event that we're interested in. + */ + if (measured_us > data->exit_us) + measured_us -= data->exit_us; + + + /* update our correction ratio */ + + new_factor = data->correction_factor[data->bucket] + * (DECAY - 1) / DECAY; + + if (data->expected_us > 0 && measured_us < MAX_INTERESTING) + new_factor += RESOLUTION * measured_us / data->expected_us; + else + /* + * we were idle so long that we count it as a perfect + * prediction + */ + new_factor += RESOLUTION; + + /* + * We don't want 0 as factor; we always want at least + * a tiny bit of estimated time. + */ + if (new_factor == 0) + new_factor = 1; + + data->correction_factor[data->bucket] = new_factor; + + /* update the repeating-pattern data */ + data->intervals[data->interval_ptr++] = last_idle_us; + if (data->interval_ptr >= INTERVALS) + data->interval_ptr = 0; +} + +/** + * menu_enable_device - scans a CPU's states and does setup + * @drv: cpuidle driver + * @dev: the CPU + */ +static int menu_enable_device(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + struct menu_device *data = &per_cpu(menu_devices, dev->cpu); + + memset(data, 0, sizeof(struct menu_device)); + + return 0; +} + +static struct cpuidle_governor menu_governor = { + .name = "menu", + .rating = 20, + .enable = menu_enable_device, + .select = menu_select, + .reflect = menu_reflect, + .owner = THIS_MODULE, +}; + +/** + * init_menu - initializes the governor + */ +static int __init init_menu(void) +{ + return cpuidle_register_governor(&menu_governor); +} + +/** + * exit_menu - exits the governor + */ +static void __exit exit_menu(void) +{ + cpuidle_unregister_governor(&menu_governor); +} + +MODULE_LICENSE("GPL"); +module_init(init_menu); +module_exit(exit_menu); diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c new file mode 100644 index 00000000..88032b4d --- /dev/null +++ b/drivers/cpuidle/sysfs.c @@ -0,0 +1,433 @@ +/* + * sysfs.c - sysfs support + * + * (C) 2006-2007 Shaohua Li <shaohua.li@intel.com> + * + * This code is licenced under the GPL. 
+ */ + +#include <linux/kernel.h> +#include <linux/cpuidle.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/capability.h> + +#include "cpuidle.h" + +static unsigned int sysfs_switch; +static int __init cpuidle_sysfs_setup(char *unused) +{ + sysfs_switch = 1; + return 1; +} +__setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup); + +static ssize_t show_available_governors(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t i = 0; + struct cpuidle_governor *tmp; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_governors, governor_list) { + if (i >= (ssize_t) ((PAGE_SIZE/sizeof(char)) - CPUIDLE_NAME_LEN - 2)) + goto out; + i += scnprintf(&buf[i], CPUIDLE_NAME_LEN, "%s ", tmp->name); + } + +out: + i+= sprintf(&buf[i], "\n"); + mutex_unlock(&cpuidle_lock); + return i; +} + +static ssize_t show_current_driver(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret; + struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver(); + + spin_lock(&cpuidle_driver_lock); + if (cpuidle_driver) + ret = sprintf(buf, "%s\n", cpuidle_driver->name); + else + ret = sprintf(buf, "none\n"); + spin_unlock(&cpuidle_driver_lock); + + return ret; +} + +static ssize_t show_current_governor(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret; + + mutex_lock(&cpuidle_lock); + if (cpuidle_curr_governor) + ret = sprintf(buf, "%s\n", cpuidle_curr_governor->name); + else + ret = sprintf(buf, "none\n"); + mutex_unlock(&cpuidle_lock); + + return ret; +} + +static ssize_t store_current_governor(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char gov_name[CPUIDLE_NAME_LEN]; + int ret = -EINVAL; + size_t len = count; + struct cpuidle_governor *gov; + + if (!len || len >= sizeof(gov_name)) + return -EINVAL; + + memcpy(gov_name, buf, len); + gov_name[len] = '\0'; + if (gov_name[len - 1] == '\n') + gov_name[--len] = '\0'; + + mutex_lock(&cpuidle_lock); + + list_for_each_entry(gov, &cpuidle_governors, governor_list) { + if (strlen(gov->name) == len && !strcmp(gov->name, gov_name)) { + ret = cpuidle_switch_governor(gov); + break; + } + } + + mutex_unlock(&cpuidle_lock); + + if (ret) + return ret; + else + return count; +} + +static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL); +static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL); + +static struct attribute *cpuidle_default_attrs[] = { + &dev_attr_current_driver.attr, + &dev_attr_current_governor_ro.attr, + NULL +}; + +static DEVICE_ATTR(available_governors, 0444, show_available_governors, NULL); +static DEVICE_ATTR(current_governor, 0644, show_current_governor, + store_current_governor); + +static struct attribute *cpuidle_switch_attrs[] = { + &dev_attr_available_governors.attr, + &dev_attr_current_driver.attr, + &dev_attr_current_governor.attr, + NULL +}; + +static struct attribute_group cpuidle_attr_group = { + .attrs = cpuidle_default_attrs, + .name = "cpuidle", +}; + +/** + * cpuidle_add_interface - add CPU global sysfs attributes + */ +int cpuidle_add_interface(struct device *dev) +{ + if (sysfs_switch) + cpuidle_attr_group.attrs = cpuidle_switch_attrs; + + return sysfs_create_group(&dev->kobj, &cpuidle_attr_group); +} + +/** + * cpuidle_remove_interface - remove CPU global sysfs attributes + */ +void cpuidle_remove_interface(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &cpuidle_attr_group); +} + +struct cpuidle_attr { + struct attribute 
attr; + ssize_t (*show)(struct cpuidle_device *, char *); + ssize_t (*store)(struct cpuidle_device *, const char *, size_t count); +}; + +#define define_one_ro(_name, show) \ + static struct cpuidle_attr attr_##_name = __ATTR(_name, 0444, show, NULL) +#define define_one_rw(_name, show, store) \ + static struct cpuidle_attr attr_##_name = __ATTR(_name, 0644, show, store) + +#define kobj_to_cpuidledev(k) container_of(k, struct cpuidle_device, kobj) +#define attr_to_cpuidleattr(a) container_of(a, struct cpuidle_attr, attr) +static ssize_t cpuidle_show(struct kobject * kobj, struct attribute * attr ,char * buf) +{ + int ret = -EIO; + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + struct cpuidle_attr * cattr = attr_to_cpuidleattr(attr); + + if (cattr->show) { + mutex_lock(&cpuidle_lock); + ret = cattr->show(dev, buf); + mutex_unlock(&cpuidle_lock); + } + return ret; +} + +static ssize_t cpuidle_store(struct kobject * kobj, struct attribute * attr, + const char * buf, size_t count) +{ + int ret = -EIO; + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + struct cpuidle_attr * cattr = attr_to_cpuidleattr(attr); + + if (cattr->store) { + mutex_lock(&cpuidle_lock); + ret = cattr->store(dev, buf, count); + mutex_unlock(&cpuidle_lock); + } + return ret; +} + +static const struct sysfs_ops cpuidle_sysfs_ops = { + .show = cpuidle_show, + .store = cpuidle_store, +}; + +static void cpuidle_sysfs_release(struct kobject *kobj) +{ + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + + complete(&dev->kobj_unregister); +} + +static struct kobj_type ktype_cpuidle = { + .sysfs_ops = &cpuidle_sysfs_ops, + .release = cpuidle_sysfs_release, +}; + +struct cpuidle_state_attr { + struct attribute attr; + ssize_t (*show)(struct cpuidle_state *, \ + struct cpuidle_state_usage *, char *); + ssize_t (*store)(struct cpuidle_state *, const char *, size_t); +}; + +#define define_one_state_ro(_name, show) \ +static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0444, show, NULL) + +#define define_one_state_rw(_name, show, store) \ +static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0644, show, store) + +#define define_show_state_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, char *buf) \ +{ \ + return sprintf(buf, "%u\n", state->_name);\ +} + +#define define_store_state_function(_name) \ +static ssize_t store_state_##_name(struct cpuidle_state *state, \ + const char *buf, size_t size) \ +{ \ + long value; \ + int err; \ + if (!capable(CAP_SYS_ADMIN)) \ + return -EPERM; \ + err = kstrtol(buf, 0, &value); \ + if (err) \ + return err; \ + if (value) \ + state->disable = 1; \ + else \ + state->disable = 0; \ + return size; \ +} + +#define define_show_state_ull_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, char *buf) \ +{ \ + return sprintf(buf, "%llu\n", state_usage->_name);\ +} + +#define define_show_state_str_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, char *buf) \ +{ \ + if (state->_name[0] == '\0')\ + return sprintf(buf, "<null>\n");\ + return sprintf(buf, "%s\n", state->_name);\ +} + +define_show_state_function(exit_latency) +define_show_state_function(power_usage) +define_show_state_ull_function(usage) +define_show_state_ull_function(time) +define_show_state_str_function(name) +define_show_state_str_function(desc) 
+define_show_state_function(disable)
+define_store_state_function(disable)
+
+define_one_state_ro(name, show_state_name);
+define_one_state_ro(desc, show_state_desc);
+define_one_state_ro(latency, show_state_exit_latency);
+define_one_state_ro(power, show_state_power_usage);
+define_one_state_ro(usage, show_state_usage);
+define_one_state_ro(time, show_state_time);
+define_one_state_rw(disable, show_state_disable, store_state_disable);
+
+static struct attribute *cpuidle_state_default_attrs[] = {
+	&attr_name.attr,
+	&attr_desc.attr,
+	&attr_latency.attr,
+	&attr_power.attr,
+	&attr_usage.attr,
+	&attr_time.attr,
+	&attr_disable.attr,
+	NULL
+};
+
+#define kobj_to_state_obj(k) container_of(k, struct cpuidle_state_kobj, kobj)
+#define kobj_to_state(k) (kobj_to_state_obj(k)->state)
+#define kobj_to_state_usage(k) (kobj_to_state_obj(k)->state_usage)
+#define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr)
+static ssize_t cpuidle_state_show(struct kobject *kobj,
+	struct attribute *attr, char *buf)
+{
+	int ret = -EIO;
+	struct cpuidle_state *state = kobj_to_state(kobj);
+	struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
+	struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
+
+	if (cattr->show)
+		ret = cattr->show(state, state_usage, buf);
+
+	return ret;
+}
+
+static ssize_t cpuidle_state_store(struct kobject *kobj,
+	struct attribute *attr, const char *buf, size_t size)
+{
+	int ret = -EIO;
+	struct cpuidle_state *state = kobj_to_state(kobj);
+	struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
+
+	if (cattr->store)
+		ret = cattr->store(state, buf, size);
+
+	return ret;
+}
+
+static const struct sysfs_ops cpuidle_state_sysfs_ops = {
+	.show = cpuidle_state_show,
+	.store = cpuidle_state_store,
+};
+
+static void cpuidle_state_sysfs_release(struct kobject *kobj)
+{
+	struct cpuidle_state_kobj *state_obj = kobj_to_state_obj(kobj);
+
+	complete(&state_obj->kobj_unregister);
+}
+
+static struct kobj_type ktype_state_cpuidle = {
+	.sysfs_ops = &cpuidle_state_sysfs_ops,
+	.default_attrs = cpuidle_state_default_attrs,
+	.release = cpuidle_state_sysfs_release,
+};
+
+static inline void cpuidle_free_state_kobj(struct cpuidle_device *device, int i)
+{
+	kobject_put(&device->kobjs[i]->kobj);
+	wait_for_completion(&device->kobjs[i]->kobj_unregister);
+	kfree(device->kobjs[i]);
+	device->kobjs[i] = NULL;
+}
+
+/**
+ * cpuidle_add_state_sysfs - adds per-state sysfs attributes
+ * @device: the target device
+ */
+int cpuidle_add_state_sysfs(struct cpuidle_device *device)
+{
+	int i, ret = -ENOMEM;
+	struct cpuidle_state_kobj *kobj;
+	struct cpuidle_driver *drv = cpuidle_get_driver();
+
+	/* state statistics */
+	for (i = 0; i < device->state_count; i++) {
+		kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);
+		if (!kobj)
+			goto error_state;
+		kobj->state = &drv->states[i];
+		kobj->state_usage = &device->states_usage[i];
+		init_completion(&kobj->kobj_unregister);
+
+		ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle, &device->kobj,
+					   "state%d", i);
+		if (ret) {
+			kfree(kobj);
+			goto error_state;
+		}
+		kobject_uevent(&kobj->kobj, KOBJ_ADD);
+		device->kobjs[i] = kobj;
+	}
+
+	return 0;
+
+error_state:
+	for (i = i - 1; i >= 0; i--)
+		cpuidle_free_state_kobj(device, i);
+	return ret;
+}
+
+/**
+ * cpuidle_remove_state_sysfs - removes per-state sysfs attributes
+ * @device: the target device
+ */
+void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
+{
+	int i;
+
+	for (i = 0; i < device->state_count; i++)
+		cpuidle_free_state_kobj(device, i);
+}
+
+/**
+ * cpuidle_add_sysfs - creates a sysfs instance for the target device
+ * @cpu_dev: the target device
+ */
+int cpuidle_add_sysfs(struct device *cpu_dev)
+{
+	int cpu = cpu_dev->id;
+	struct cpuidle_device *dev;
+	int error;
+
+	dev = per_cpu(cpuidle_devices, cpu);
+	error = kobject_init_and_add(&dev->kobj, &ktype_cpuidle, &cpu_dev->kobj,
+				     "cpuidle");
+	if (!error)
+		kobject_uevent(&dev->kobj, KOBJ_ADD);
+	return error;
+}
+
+/**
+ * cpuidle_remove_sysfs - deletes a sysfs instance on the target device
+ * @cpu_dev: the target device
+ */
+void cpuidle_remove_sysfs(struct device *cpu_dev)
+{
+	int cpu = cpu_dev->id;
+	struct cpuidle_device *dev;
+
+	dev = per_cpu(cpuidle_devices, cpu);
+	kobject_put(&dev->kobj);
+}
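
A note on the fixed-point arithmetic used by menu_select() and menu_update() above: each bucket's correction factor stores a ratio scaled by RESOLUTION * DECAY (so 8192 means "the next-timer estimate is usually right"), and every update keeps (DECAY - 1)/DECAY of the old value before mixing in the newly measured/expected ratio. The following standalone userspace sketch mirrors that bookkeeping; it is illustrative only, with made-up helper names rather than the kernel's data structures.

#include <stdint.h>
#include <stdio.h>

#define BUCKETS		12
#define RESOLUTION	1024
#define DECAY		8
#define MAX_INTERESTING	50000

/* Pick a bucket from the expected sleep length (us) and the iowait flag,
 * mirroring which_bucket(): six magnitude ranges, doubled for pending IO. */
static int pick_bucket(unsigned int duration_us, int io_pending)
{
	int bucket = io_pending ? BUCKETS / 2 : 0;

	if (duration_us < 10)
		return bucket;
	if (duration_us < 100)
		return bucket + 1;
	if (duration_us < 1000)
		return bucket + 2;
	if (duration_us < 10000)
		return bucket + 3;
	if (duration_us < 100000)
		return bucket + 4;
	return bucket + 5;
}

/* Scale the next-timer estimate by the bucket's correction factor; unity is
 * RESOLUTION * DECAY, and the +divisor/2 rounds to the nearest microsecond. */
static uint64_t predict(uint64_t expected_us, uint64_t factor)
{
	uint64_t divisor = RESOLUTION * DECAY;

	return (expected_us * factor + divisor / 2) / divisor;
}

/* Age the factor by (DECAY-1)/DECAY and add the new measured/expected ratio. */
static uint64_t update_factor(uint64_t factor, uint64_t measured_us,
			      uint64_t expected_us)
{
	uint64_t new_factor = factor * (DECAY - 1) / DECAY;

	if (expected_us > 0 && measured_us < MAX_INTERESTING)
		new_factor += RESOLUTION * measured_us / expected_us;
	else
		new_factor += RESOLUTION;	/* very long sleeps count as perfect */

	return new_factor ? new_factor : 1;	/* never let the factor hit zero */
}

int main(void)
{
	uint64_t factor = RESOLUTION * DECAY;	/* unity, as on first use */
	int bucket = pick_bucket(500, 0);

	/* The timer says 500 us away, but we keep waking up after ~200 us. */
	for (int i = 0; i < 16; i++)
		factor = update_factor(factor, 200, 500);

	printf("bucket %d, predicted %llu us for a 500 us timer\n",
	       bucket, (unsigned long long)predict(500, factor));
	return 0;
}

With these inputs the factor settles near 8 * RESOLUTION * 200 / 500, so a 500 us timer estimate is reported as roughly 200 us of expected idle time, which is exactly the behaviour the comment block in menu.c describes.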
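
detect_repeating_patterns() above actually works on the variance rather than the standard deviation itself: the accumulated value is the mean of the squared deviations, so comparing it against STDDEV_THRESH (400) accepts patterns whose spread is roughly below 20 us. A self-contained sketch of the same test, again with assumed names:

#include <stdint.h>
#include <stdio.h>

#define INTERVALS	8
#define STDDEV_THRESH	400	/* compared against the variance (stddev squared) */

/* If the last INTERVALS observed idle lengths cluster tightly and their
 * average does not exceed the next-timer estimate, use that average as the
 * prediction. Returns 1 and writes *predicted_us when the pattern is taken. */
static int repeating_pattern(const uint32_t intervals[INTERVALS],
			     uint64_t expected_us, uint64_t *predicted_us)
{
	uint64_t avg = 0, variance = 0;
	int i;

	for (i = 0; i < INTERVALS; i++)
		avg += intervals[i];
	avg /= INTERVALS;

	if (avg == 0 || avg > expected_us)
		return 0;	/* beyond the known next tick, worthless */

	for (i = 0; i < INTERVALS; i++) {
		int64_t diff = (int64_t)intervals[i] - (int64_t)avg;

		variance += (uint64_t)(diff * diff);
	}
	variance /= INTERVALS;

	if (variance >= STDDEV_THRESH)
		return 0;

	*predicted_us = avg;
	return 1;
}

int main(void)
{
	/* e.g. a device interrupting every ~2 ms while the timer is 10 ms out */
	uint32_t samples[INTERVALS] = { 1990, 2010, 2000, 1995, 2005, 2001, 1999, 2000 };
	uint64_t predicted;

	if (repeating_pattern(samples, 10000, &predicted))
		printf("repeating interval detected: ~%llu us\n",
		       (unsigned long long)predicted);
	return 0;
}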
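
Once a predicted idle length is in hand, menu_select() keeps the lowest-power state that survives four filters: not disabled, target residency within the prediction, exit latency within the PM QoS limit, and exit latency times the performance multiplier within the prediction. A compact sketch of that filter over an assumed state table (not the kernel's cpuidle structures):

#include <limits.h>
#include <stdio.h>

struct state {
	const char *name;
	unsigned int exit_latency;	/* us */
	unsigned int target_residency;	/* us */
	unsigned int power;		/* lower means less power drawn */
	int disabled;
};

/* Return the index of the deepest usable state, or 0 as the fallback. */
static int pick_state(const struct state *s, int count,
		      unsigned long long predicted_us,
		      unsigned int latency_req, int multiplier)
{
	int best = 0;
	unsigned int best_power = UINT_MAX;

	for (int i = 1; i < count; i++) {
		if (s[i].disabled)
			continue;
		if (s[i].target_residency > predicted_us)
			continue;
		if (s[i].exit_latency > latency_req)
			continue;
		if ((unsigned long long)s[i].exit_latency * multiplier > predicted_us)
			continue;
		if (s[i].power < best_power) {
			best_power = s[i].power;
			best = i;
		}
	}
	return best;
}

int main(void)
{
	struct state states[] = {
		{ "WFI", 1,   1,    1000, 0 },
		{ "C2",  50,  200,  500,  0 },
		{ "C3",  300, 1500, 100,  0 },
	};

	/* 2 ms predicted idle, 1 ms PM QoS limit, busy CPU (multiplier 20). */
	printf("picked %s\n", states[pick_state(states, 3, 2000, 1000, 20)].name);
	return 0;
}

In this example C3 would fit the raw prediction, but the multiplier from iowait pressure vetoes it and C2 is chosen, which is the "busier system, shallower C state" rule of thumb in action.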
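
The per-state attributes registered by cpuidle_add_state_sysfs() end up under each CPU's cpuidle kobject, which on a typical system is /sys/devices/system/cpu/cpuN/cpuidle/stateM/ (the path is inferred from the kobject names used here, not spelled out in the patch). A minimal userspace reader, purely illustrative:

#include <stdio.h>

/* Print one attribute of one idle state; returns 0 on success. */
static int show_attr(int cpu, int state, const char *attr)
{
	char path[128], line[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/%s",
		 cpu, state, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fgets(line, sizeof(line), f))
		printf("state%d %s: %s", state, attr, line);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Walk cpu0's states until one is missing. */
	for (int s = 0; show_attr(0, s, "name") == 0; s++) {
		show_attr(0, s, "latency");
		show_attr(0, s, "usage");
	}
	return 0;
}

The "disable" attribute defined above is the only writable per-state file, and store_state_disable() requires CAP_SYS_ADMIN, so turning a state off needs root; the CPU-global current_governor file likewise only appears when the cpuidle_sysfs_switch boot parameter is given.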