summaryrefslogtreecommitdiff
path: root/kernel/time
diff options
context:
space:
mode:
authorSrikant Patnaik2015-01-11 12:28:04 +0530
committerSrikant Patnaik2015-01-11 12:28:04 +0530
commit871480933a1c28f8a9fed4c4d34d06c439a7a422 (patch)
tree8718f573808810c2a1e8cb8fb6ac469093ca2784 /kernel/time
parent9d40ac5867b9aefe0722bc1f110b965ff294d30d (diff)
downloadFOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.tar.gz
FOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.tar.bz2
FOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.zip
Moved, renamed, and deleted files
The original directory structure was scattered and unorganized. Changes are basically to make it look like kernel structure.
Diffstat (limited to 'kernel/time')
-rw-r--r--kernel/time/Kconfig35
-rw-r--r--kernel/time/Makefile9
-rw-r--r--kernel/time/alarmtimer.c850
-rw-r--r--kernel/time/clockevents.c441
-rw-r--r--kernel/time/clocksource.c959
-rw-r--r--kernel/time/jiffies.c97
-rw-r--r--kernel/time/ntp.c965
-rw-r--r--kernel/time/posix-clock.c446
-rw-r--r--kernel/time/tick-broadcast.c629
-rw-r--r--kernel/time/tick-common.c419
-rw-r--r--kernel/time/tick-internal.h144
-rw-r--r--kernel/time/tick-oneshot.c116
-rw-r--r--kernel/time/tick-sched.c912
-rw-r--r--kernel/time/timecompare.c193
-rw-r--r--kernel/time/timeconv.c127
-rw-r--r--kernel/time/timekeeping.c1280
-rw-r--r--kernel/time/timer_list.c301
-rw-r--r--kernel/time/timer_stats.c425
18 files changed, 8348 insertions, 0 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 00000000..a20dc8a3
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,35 @@
+#
+# Timer subsystem related configuration options
+#
+
+# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
+# only related to the tick functionality. Oneshot clockevent devices
+# are supported independent of this.
+config TICK_ONESHOT
+ bool
+
+config NO_HZ
+ bool "Tickless System (Dynamic Ticks)"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ select TICK_ONESHOT
+ help
+ This option enables a tickless system: timer interrupts will
+ only trigger on an as-needed basis both when the system is
+ busy and when the system is idle.
+
+config HIGH_RES_TIMERS
+ bool "High Resolution Timer Support"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ select TICK_ONESHOT
+ help
+ This option enables high resolution timer support. If your
+ hardware is not capable then this option only increases
+ the size of the kernel image.
+
+config GENERIC_CLOCKEVENTS_BUILD
+ bool
+ default y
+ depends on GENERIC_CLOCKEVENTS
+
+config GENERIC_CLOCKEVENTS_MIN_ADJUST
+ bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 00000000..e2fd74b8
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1,9 @@
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
+obj-y += timeconv.o posix-clock.o alarmtimer.o
+
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
+obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 00000000..efc5cf7d
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,850 @@
+/*
+ * Alarmtimer interface
+ *
+ * This interface provides a timer which is similar to hrtimers,
+ * but triggers a RTC alarm if the box is suspended.
+ *
+ * This interface is influenced by the Android RTC Alarm timer
+ * interface.
+ *
+ * Copyright (C) 2010 IBM Corperation
+ *
+ * Author: John Stultz <john.stultz@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/time.h>
+#include <linux/hrtimer.h>
+#include <linux/timerqueue.h>
+#include <linux/rtc.h>
+#include <linux/alarmtimer.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/posix-timers.h>
+#include <linux/workqueue.h>
+#include <linux/freezer.h>
+
+/**
+ * struct alarm_base - Alarm timer bases
+ * @lock: Lock for synchronized access to the base
+ * @timerqueue: Timerqueue head managing the list of events
+ * @timer: hrtimer used to schedule events while running
+ * @gettime: Function to read the time correlating to the base
+ * @base_clockid: clockid for the base
+ *
+ * One base exists per alarm type; alarm_bases[] is indexed by the
+ * alarm's enum alarmtimer_type value.
+ */
+static struct alarm_base {
+ spinlock_t lock;
+ struct timerqueue_head timerqueue;
+ struct hrtimer timer;
+ ktime_t (*gettime)(void);
+ clockid_t base_clockid;
+} alarm_bases[ALARM_NUMTYPE];
+
+/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
+static ktime_t freezer_delta;
+static DEFINE_SPINLOCK(freezer_delta_lock);
+
+/* wakeup source handed to __pm_wakeup_event() from the suspend path */
+static struct wakeup_source *ws;
+
+#ifdef CONFIG_RTC_CLASS
+/* rtc timer and device for setting alarm wakeups at suspend */
+static struct rtc_timer rtctimer;
+static struct rtc_device *rtcdev;
+static DEFINE_SPINLOCK(rtcdev_lock);
+
+/**
+ * alarmtimer_get_rtcdev - Return selected rtcdevice
+ *
+ * This function returns the rtc device to use for wakealarms, or
+ * NULL if none has been selected yet. The pointer is sampled under
+ * rtcdev_lock; the selection itself is made in
+ * alarmtimer_rtc_add_device().
+ */
+struct rtc_device *alarmtimer_get_rtcdev(void)
+{
+ unsigned long flags;
+ struct rtc_device *ret;
+
+ spin_lock_irqsave(&rtcdev_lock, flags);
+ ret = rtcdev;
+ spin_unlock_irqrestore(&rtcdev_lock, flags);
+
+ return ret;
+}
+
+
+/**
+ * alarmtimer_rtc_add_device - add_dev hook of the rtc class interface
+ * @dev: device being offered; converted with to_rtc_device()
+ * @class_intf: class interface the call came through (unused)
+ *
+ * Selects @dev as the wakealarm rtc when none has been selected yet,
+ * the device provides a set_alarm operation and device_may_wakeup()
+ * holds for its parent. A device reference is taken on selection.
+ *
+ * Returns 0 if the device was selected, -EBUSY if another rtc was
+ * already selected, and -ENODEV if the device cannot be used.
+ */
+static int alarmtimer_rtc_add_device(struct device *dev,
+				struct class_interface *class_intf)
+{
+	unsigned long flags;
+	struct rtc_device *rtc = to_rtc_device(dev);
+	int ret = -EBUSY;
+
+	/* Lockless fast path; the decision is re-made under rtcdev_lock */
+	if (rtcdev)
+		return -EBUSY;
+
+	if (!rtc->ops->set_alarm)
+		return -ENODEV;
+	if (!device_may_wakeup(rtc->dev.parent))
+		return -ENODEV;
+
+	spin_lock_irqsave(&rtcdev_lock, flags);
+	if (!rtcdev) {
+		rtcdev = rtc;
+		/* hold a reference so it doesn't go away */
+		get_device(dev);
+		ret = 0;
+	}
+	/* else: lost the race to another device, report -EBUSY */
+	spin_unlock_irqrestore(&rtcdev_lock, flags);
+	return ret;
+}
+
+/* Set up rtctimer via rtc_timer_init(), passing NULL for its other two arguments */
+static inline void alarmtimer_rtc_timer_init(void)
+{
+ rtc_timer_init(&rtctimer, NULL, NULL);
+}
+
+/* Class interface whose add_dev hook points at alarmtimer_rtc_add_device() */
+static struct class_interface alarmtimer_rtc_interface = {
+ .add_dev = &alarmtimer_rtc_add_device,
+};
+
+/*
+ * Bind the interface to rtc_class and register it.
+ * Returns the result of class_interface_register().
+ */
+static int alarmtimer_rtc_interface_setup(void)
+{
+ alarmtimer_rtc_interface.class = rtc_class;
+ return class_interface_register(&alarmtimer_rtc_interface);
+}
+/* Unregister the class interface registered by alarmtimer_rtc_interface_setup() */
+static void alarmtimer_rtc_interface_remove(void)
+{
+ class_interface_unregister(&alarmtimer_rtc_interface);
+}
+#else
+/* !CONFIG_RTC_CLASS: no rtc device is ever available, so these are stubs */
+struct rtc_device *alarmtimer_get_rtcdev(void)
+{
+ return NULL;
+}
+#define rtcdev (NULL)
+static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
+static inline void alarmtimer_rtc_interface_remove(void) { }
+static inline void alarmtimer_rtc_timer_init(void) { }
+#endif
+
+/**
+ * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
+ * @base: pointer to the base where the timer is being run
+ * @alarm: pointer to alarm being enqueued.
+ *
+ * Adds alarm to an alarm_base timerqueue and, if the alarm is now the
+ * node returned by timerqueue_getnext(), restarts the base hrtimer
+ * with the alarm's expiry.
+ *
+ * Must hold base->lock when calling.
+ */
+static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
+{
+ timerqueue_add(&base->timerqueue, &alarm->node);
+ alarm->state |= ALARMTIMER_STATE_ENQUEUED;
+
+ /* Reprogram only when the new alarm became the next one to expire */
+ if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
+ hrtimer_try_to_cancel(&base->timer);
+ hrtimer_start(&base->timer, alarm->node.expires,
+ HRTIMER_MODE_ABS);
+ }
+}
+
+/**
+ * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
+ * @base: pointer to the base where the timer is running
+ * @alarm: pointer to alarm being removed
+ *
+ * Removes alarm from an alarm_base timerqueue and, if it was the next
+ * node to expire, restarts the base hrtimer for the new next node
+ * (when one exists). Does nothing if the alarm is not enqueued.
+ *
+ * Must hold base->lock when calling.
+ */
+static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
+{
+ /* Sampled before the deletion so we can tell whether @alarm was next */
+ struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
+
+ if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
+ return;
+
+ timerqueue_del(&base->timerqueue, &alarm->node);
+ alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
+
+ if (next == &alarm->node) {
+ hrtimer_try_to_cancel(&base->timer);
+ next = timerqueue_getnext(&base->timerqueue);
+ if (!next)
+ return;
+ hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
+ }
+}
+
+
+/**
+ * alarmtimer_fired - Handles alarm hrtimer being fired.
+ * @timer: pointer to hrtimer being run
+ *
+ * When an alarm timer fires, this runs through the timerqueue to
+ * see which alarms expired, and runs those. If there are more alarm
+ * timers queued for the future, we set the hrtimer to fire when
+ * the next future alarm timer expires.
+ *
+ * Returns HRTIMER_RESTART if the hrtimer was given a new expiry,
+ * HRTIMER_NORESTART if the timerqueue is empty.
+ */
+static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
+{
+	struct alarm_base *base = container_of(timer, struct alarm_base, timer);
+	struct timerqueue_node *next;
+	unsigned long flags;
+	ktime_t now;
+	int ret = HRTIMER_NORESTART;
+
+	spin_lock_irqsave(&base->lock, flags);
+	now = base->gettime();
+	while ((next = timerqueue_getnext(&base->timerqueue))) {
+		struct alarm *alarm;
+		ktime_t expired = next->expires;
+		/*
+		 * Reset for every alarm: an alarm without a callback must
+		 * not inherit the restart verdict of the previous one.
+		 */
+		int restart = ALARMTIMER_NORESTART;
+
+		if (expired.tv64 > now.tv64)
+			break;
+
+		alarm = container_of(next, struct alarm, node);
+
+		timerqueue_del(&base->timerqueue, &alarm->node);
+		alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
+
+		/* The callback runs with base->lock dropped */
+		alarm->state |= ALARMTIMER_STATE_CALLBACK;
+		spin_unlock_irqrestore(&base->lock, flags);
+		if (alarm->function)
+			restart = alarm->function(alarm, now);
+		spin_lock_irqsave(&base->lock, flags);
+		alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
+
+		if (restart != ALARMTIMER_NORESTART) {
+			timerqueue_add(&base->timerqueue, &alarm->node);
+			alarm->state |= ALARMTIMER_STATE_ENQUEUED;
+		}
+	}
+
+	if (next) {
+		hrtimer_set_expires(&base->timer, next->expires);
+		ret = HRTIMER_RESTART;
+	}
+	spin_unlock_irqrestore(&base->lock, flags);
+
+	return ret;
+
+}
+
+#ifdef CONFIG_RTC_CLASS
+/**
+ * alarmtimer_suspend - Suspend time callback
+ * @dev: unused
+ *
+ * When we are going into suspend, we look through the bases
+ * to see which is the soonest timer to expire. We then
+ * set an rtc timer to fire that far into the future, which
+ * will wake us from suspend.
+ *
+ * Returns 0 when there is nothing to program or the rtc timer was
+ * started, -EBUSY when the soonest expiry is less than two seconds
+ * away, or the negative result of rtc_timer_start().
+ */
+static int alarmtimer_suspend(struct device *dev)
+{
+	struct rtc_time tm;
+	ktime_t min, now;
+	unsigned long flags;
+	struct rtc_device *rtc;
+	int i;
+	int ret;
+
+	/* Consume the delta recorded by alarmtimer_freezerset() */
+	spin_lock_irqsave(&freezer_delta_lock, flags);
+	min = freezer_delta;
+	freezer_delta = ktime_set(0, 0);
+	spin_unlock_irqrestore(&freezer_delta_lock, flags);
+
+	rtc = alarmtimer_get_rtcdev();
+	/* If we have no rtcdev, just return */
+	if (!rtc)
+		return 0;
+
+	/* Find the soonest timer to expire*/
+	for (i = 0; i < ALARM_NUMTYPE; i++) {
+		struct alarm_base *base = &alarm_bases[i];
+		struct timerqueue_node *next;
+		ktime_t expires = ktime_set(0, 0);
+		ktime_t delta;
+
+		/*
+		 * Copy the expiry while base->lock is held: once the lock
+		 * is dropped the node may be dequeued, so it must not be
+		 * dereferenced afterwards.
+		 */
+		spin_lock_irqsave(&base->lock, flags);
+		next = timerqueue_getnext(&base->timerqueue);
+		if (next)
+			expires = next->expires;
+		spin_unlock_irqrestore(&base->lock, flags);
+		if (!next)
+			continue;
+		delta = ktime_sub(expires, base->gettime());
+		if (!min.tv64 || (delta.tv64 < min.tv64))
+			min = delta;
+	}
+	if (min.tv64 == 0)
+		return 0;
+
+	/* Too close to suspend safely: hold off suspend and bail out */
+	if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
+		__pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+		return -EBUSY;
+	}
+
+	/* Setup an rtc timer to fire that far in the future */
+	rtc_timer_cancel(rtc, &rtctimer);
+	rtc_read_time(rtc, &tm);
+	now = rtc_tm_to_ktime(tm);
+	now = ktime_add(now, min);
+
+	/* Set alarm, if in the past reject suspend briefly to handle */
+	ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+	if (ret < 0)
+		__pm_wakeup_event(ws, 1 * MSEC_PER_SEC);
+	return ret;
+}
+
+/* Defined outside this file; invoked from the resume callback below */
+extern void devalarm_triggered2(void);
+
+/**
+ * alarmtimer_resume - Resume time callback
+ * @dev: unused
+ *
+ * Calls devalarm_triggered2() and always returns 0.
+ */
+static int alarmtimer_resume(struct device *dev)
+{
+ devalarm_triggered2();
+ return 0;
+}
+
+#else
+/* Without CONFIG_RTC_CLASS no wakealarm can be programmed: nothing to do */
+static int alarmtimer_suspend(struct device *dev)
+{
+	return 0;
+}
+
+/*
+ * alarmtimer_pm_ops references alarmtimer_resume unconditionally, so a
+ * no-op must exist in this configuration as well or the file fails to
+ * build.
+ */
+static int alarmtimer_resume(struct device *dev)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Record the distance between @absexp and the current time of @type's
+ * base in freezer_delta. An existing non-zero freezer_delta is only
+ * replaced by a smaller delta. alarmtimer_suspend() reads and resets
+ * the value.
+ */
+static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
+{
+ ktime_t delta;
+ unsigned long flags;
+ struct alarm_base *base = &alarm_bases[type];
+
+ delta = ktime_sub(absexp, base->gettime());
+
+ spin_lock_irqsave(&freezer_delta_lock, flags);
+ if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
+ freezer_delta = delta;
+ spin_unlock_irqrestore(&freezer_delta_lock, flags);
+}
+
+
+/**
+ * alarm_init - Prepare an alarm structure for use
+ * @alarm: alarm to set up
+ * @type: alarm type, later used to pick the alarm base
+ * @function: callback invoked when the alarm fires
+ *
+ * Leaves the alarm in ALARMTIMER_STATE_INACTIVE with a freshly
+ * initialized timerqueue node.
+ */
+void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
+		enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+{
+	alarm->state = ALARMTIMER_STATE_INACTIVE;
+	alarm->type = type;
+	alarm->function = function;
+	timerqueue_init(&alarm->node);
+}
+
+/**
+ * alarm_start - Sets an alarm to fire
+ * @alarm: ptr to alarm to set
+ * @start: time to run the alarm
+ *
+ * Takes the lock of the base selected by alarm->type. An alarm that
+ * alarmtimer_active() reports as active is removed first, then it is
+ * enqueued with @start as its expiry.
+ */
+void alarm_start(struct alarm *alarm, ktime_t start)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ unsigned long flags;
+
+ spin_lock_irqsave(&base->lock, flags);
+ if (alarmtimer_active(alarm))
+ alarmtimer_remove(base, alarm);
+ alarm->node.expires = start;
+ alarmtimer_enqueue(base, alarm);
+ spin_unlock_irqrestore(&base->lock, flags);
+}
+
+/**
+ * alarm_try_to_cancel - Attempt to cancel an alarm timer once
+ * @alarm: alarm to cancel
+ *
+ * Runs under the lock of the alarm's base.
+ *
+ * Returns -1 if the alarm's callback is currently running, 1 if the
+ * alarm was queued and has been removed, and 0 if it was not queued.
+ */
+int alarm_try_to_cancel(struct alarm *alarm)
+{
+	struct alarm_base *base = &alarm_bases[alarm->type];
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&base->lock, flags);
+	if (alarmtimer_callback_running(alarm)) {
+		ret = -1;
+	} else if (alarmtimer_is_queued(alarm)) {
+		alarmtimer_remove(base, alarm);
+		ret = 1;
+	} else {
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&base->lock, flags);
+
+	return ret;
+}
+
+
+/**
+ * alarm_cancel - Cancel an alarm timer, waiting out a running callback
+ * @alarm: alarm to cancel
+ *
+ * Retries alarm_try_to_cancel() with cpu_relax() in between for as
+ * long as it reports a running callback.
+ *
+ * Returns 1 if the timer was canceled, 0 if it was not active.
+ */
+int alarm_cancel(struct alarm *alarm)
+{
+	int ret;
+
+	while ((ret = alarm_try_to_cancel(alarm)) < 0)
+		cpu_relax();
+
+	return ret;
+}
+
+
+/**
+ * alarm_forward - Advance an alarm's expiry in steps of @interval
+ * @alarm: alarm whose node.expires is advanced
+ * @now: time the current expiry is compared against
+ * @interval: step size
+ *
+ * Returns 0 and leaves the alarm untouched if its expiry is still
+ * after @now; otherwise returns the number of intervals that were
+ * added to the expiry.
+ */
+u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
+{
+ u64 overrun = 1;
+ ktime_t delta;
+
+ delta = ktime_sub(now, alarm->node.expires);
+
+ if (delta.tv64 < 0)
+ return 0;
+
+ /* More than one whole interval behind: skip them in one step */
+ if (unlikely(delta.tv64 >= interval.tv64)) {
+ s64 incr = ktime_to_ns(interval);
+
+ overrun = ktime_divns(delta, incr);
+
+ alarm->node.expires = ktime_add_ns(alarm->node.expires,
+ incr*overrun);
+
+ if (alarm->node.expires.tv64 > now.tv64)
+ return overrun;
+ /*
+ * This (and the ktime_add() below) is the
+ * correction for exact:
+ */
+ overrun++;
+ }
+
+ alarm->node.expires = ktime_add(alarm->node.expires, interval);
+ return overrun;
+}
+
+
+
+
+/**
+ * clock2alarm - map a posix clockid onto its alarm type
+ * @clockid: clockid to translate
+ *
+ * Returns ALARM_REALTIME or ALARM_BOOTTIME for the matching _ALARM
+ * clockid and -1 for any other value.
+ */
+static enum alarmtimer_type clock2alarm(clockid_t clockid)
+{
+	switch (clockid) {
+	case CLOCK_REALTIME_ALARM:
+		return ALARM_REALTIME;
+	case CLOCK_BOOTTIME_ALARM:
+		return ALARM_BOOTTIME;
+	default:
+		return -1;
+	}
+}
+
+/**
+ * alarm_handle_timer - Callback for posix timers
+ * @alarm: alarm that fired
+ * @now: time handed in by the caller, forwarded to alarm_forward()
+ *
+ * Posix timer callback for expired alarm timers.
+ *
+ * Returns ALARMTIMER_RESTART for timers with a non-zero interval
+ * (after forwarding their expiry), ALARMTIMER_NORESTART otherwise.
+ */
+static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
+ ktime_t now)
+{
+ struct k_itimer *ptr = container_of(alarm, struct k_itimer,
+ it.alarm.alarmtimer);
+ /* A non-zero result from posix_timer_event() is counted as an overrun */
+ if (posix_timer_event(ptr, 0) != 0)
+ ptr->it_overrun++;
+
+ /* Re-add periodic timers */
+ if (ptr->it.alarm.interval.tv64) {
+ ptr->it_overrun += alarm_forward(alarm, now,
+ ptr->it.alarm.interval);
+ return ALARMTIMER_RESTART;
+ }
+ return ALARMTIMER_NORESTART;
+}
+
+/**
+ * alarm_clock_getres - posix getres interface
+ * @which_clock: clockid
+ * @tp: timespec to fill
+ *
+ * Reports the resolution of the clock underlying the alarm base via
+ * hrtimer_get_res(), or -ENOTSUPP when no rtc device is available.
+ */
+static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
+{
+	clockid_t baseid;
+
+	baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
+
+	if (alarmtimer_get_rtcdev())
+		return hrtimer_get_res(baseid, tp);
+
+	return -ENOTSUPP;
+}
+
+/**
+ * alarm_clock_get - posix clock_get interface
+ * @which_clock: clockid
+ * @tp: timespec to fill.
+ *
+ * Stores the current time of the matching alarm base in @tp.
+ * Returns 0 on success, -ENOTSUPP when no rtc device is available.
+ */
+static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	enum alarmtimer_type type = clock2alarm(which_clock);
+
+	if (!alarmtimer_get_rtcdev())
+		return -ENOTSUPP;
+
+	*tp = ktime_to_timespec(alarm_bases[type].gettime());
+
+	return 0;
+}
+
+/**
+ * alarm_timer_create - posix timer_create interface
+ * @new_timer: k_itimer pointer to manage
+ *
+ * Initializes the alarm embedded in the k_itimer structure with
+ * alarm_handle_timer() as its callback.
+ *
+ * Returns 0 on success, -ENOTSUPP when no rtc device is available and
+ * -EPERM when the caller lacks CAP_WAKE_ALARM.
+ */
+static int alarm_timer_create(struct k_itimer *new_timer)
+{
+	enum alarmtimer_type type;
+
+	if (!alarmtimer_get_rtcdev())
+		return -ENOTSUPP;
+
+	if (!capable(CAP_WAKE_ALARM))
+		return -EPERM;
+
+	type = clock2alarm(new_timer->it_clock);
+	alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
+	return 0;
+}
+
+/**
+ * alarm_timer_get - posix timer_get interface
+ * @timr: k_itimer pointer to read from
+ * @cur_setting: itimerspec data to fill
+ *
+ * Zeroes @cur_setting, then fills in the timer's interval and the
+ * expiry currently stored in its alarm node.
+ */
+static void alarm_timer_get(struct k_itimer *timr,
+				struct itimerspec *cur_setting)
+{
+	struct alarm *alarm = &timr->it.alarm.alarmtimer;
+
+	memset(cur_setting, 0, sizeof(*cur_setting));
+
+	cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
+	cur_setting->it_value = ktime_to_timespec(alarm->node.expires);
+}
+
+/**
+ * alarm_timer_del - posix timer_del interface
+ * @timr: k_itimer pointer to be deleted
+ *
+ * Cancels any programmed alarms for the given timer.
+ *
+ * Returns 0 on success, -ENOTSUPP when no rtc device is available and
+ * TIMER_RETRY when the alarm's callback is currently running.
+ */
+static int alarm_timer_del(struct k_itimer *timr)
+{
+	/* Use the locked accessor like the other posix hooks in this file */
+	if (!alarmtimer_get_rtcdev())
+		return -ENOTSUPP;
+
+	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
+		return TIMER_RETRY;
+
+	return 0;
+}
+
+/**
+ * alarm_timer_set - posix timer_set interface
+ * @timr: k_itimer pointer to be (re)programmed
+ * @flags: timer flags
+ * @new_setting: itimerspec to be used
+ * @old_setting: itimerspec being replaced, may be NULL
+ *
+ * Sets the timer to new_setting, and starts the timer.
+ *
+ * Returns 0 on success, -ENOTSUPP when no rtc device is available and
+ * TIMER_RETRY when the alarm's callback is currently running.
+ */
+static int alarm_timer_set(struct k_itimer *timr, int flags,
+				struct itimerspec *new_setting,
+				struct itimerspec *old_setting)
+{
+	/* Use the locked accessor like the other posix hooks in this file */
+	if (!alarmtimer_get_rtcdev())
+		return -ENOTSUPP;
+
+	if (old_setting)
+		alarm_timer_get(timr, old_setting);
+
+	/* If the timer was already set, cancel it */
+	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
+		return TIMER_RETRY;
+
+	/* start the timer */
+	timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
+	alarm_start(&timr->it.alarm.alarmtimer,
+			timespec_to_ktime(new_setting->it_value));
+	return 0;
+}
+
+/**
+ * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
+ * @alarm: ptr to alarm that fired
+ * @now: unused
+ *
+ * Wakes up the task that set the alarmtimer. The task pointer is taken
+ * from alarm->data, which is cleared so that alarmtimer_do_nsleep() can
+ * tell the alarm has fired. Always returns ALARMTIMER_NORESTART.
+ */
+static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
+ ktime_t now)
+{
+ struct task_struct *task = (struct task_struct *)alarm->data;
+
+ alarm->data = NULL;
+ if (task)
+ wake_up_process(task);
+ return ALARMTIMER_NORESTART;
+}
+
+/**
+ * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
+ * @alarm: ptr to alarmtimer
+ * @absexp: absolute expiration time
+ *
+ * Sets the alarm timer and sleeps until it is fired or interrupted.
+ *
+ * Returns 1 if alarm->data was cleared (done by
+ * alarmtimer_nsleep_wakeup() when the alarm fires), 0 if the loop was
+ * left with alarm->data still set, i.e. because a signal is pending.
+ */
+static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
+{
+ /* Non-NULL data marks "not fired yet"; the wakeup callback clears it */
+ alarm->data = (void *)current;
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ alarm_start(alarm, absexp);
+ if (likely(alarm->data))
+ schedule();
+
+ alarm_cancel(alarm);
+ } while (alarm->data && !signal_pending(current));
+
+ __set_current_state(TASK_RUNNING);
+
+ return (alarm->data == NULL);
+}
+
+
+/**
+ * update_rmtp - Update remaining timespec value
+ * @exp: expiration time
+ * @type: timer type
+ * @rmtp: user pointer to remaining timespec value
+ *
+ * Helper function that fills in rmtp value with time between
+ * now and the exp value
+ *
+ * Returns 0 if @exp has already been reached (nothing is written),
+ * -EFAULT if copying to @rmtp fails, and 1 if the remaining time was
+ * written.
+ */
+static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
+ struct timespec __user *rmtp)
+{
+ struct timespec rmt;
+ ktime_t rem;
+
+ rem = ktime_sub(exp, alarm_bases[type].gettime());
+
+ if (rem.tv64 <= 0)
+ return 0;
+ rmt = ktime_to_timespec(rem);
+
+ if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+ return -EFAULT;
+
+ return 1;
+
+}
+
+/**
+ * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
+ * @restart: ptr to restart block
+ *
+ * Handles restarted clock_nanosleep calls. The alarm type, expiry and
+ * user rmtp pointer are taken from restart->nanosleep.
+ *
+ * Returns 0 if the sleep completed or no time remains, -EFAULT if
+ * writing the remaining time fails, and -ERESTART_RESTARTBLOCK if the
+ * sleep was interrupted with time remaining.
+ */
+static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
+{
+ enum alarmtimer_type type = restart->nanosleep.clockid;
+ ktime_t exp;
+ struct timespec __user *rmtp;
+ struct alarm alarm;
+ int ret = 0;
+
+ exp.tv64 = restart->nanosleep.expires;
+ alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+
+ /* Non-zero means the alarm fired: the sleep is complete */
+ if (alarmtimer_do_nsleep(&alarm, exp))
+ goto out;
+
+ if (freezing(current))
+ alarmtimer_freezerset(exp, type);
+
+ rmtp = restart->nanosleep.rmtp;
+ if (rmtp) {
+ ret = update_rmtp(exp, type, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+
+ /* The other values in restart are already filled in */
+ ret = -ERESTART_RESTARTBLOCK;
+out:
+ return ret;
+}
+
+/**
+ * alarm_timer_nsleep - alarmtimer nanosleep
+ * @which_clock: clockid
+ * @flags: determines abstime or relative
+ * @tsreq: requested sleep time (abs or rel)
+ * @rmtp: remaining sleep time saved
+ *
+ * Handles clock_nanosleep calls against _ALARM clockids
+ *
+ * Returns 0 if the sleep completed or no time remains, -ENOTSUPP when
+ * no rtc device is available, -EPERM without CAP_WAKE_ALARM,
+ * -ERESTARTNOHAND for an interrupted absolute sleep, -EFAULT if
+ * writing @rmtp fails, and -ERESTART_RESTARTBLOCK for an interrupted
+ * relative sleep (after the restart block has been filled in).
+ */
+static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *tsreq, struct timespec __user *rmtp)
+{
+ enum alarmtimer_type type = clock2alarm(which_clock);
+ struct alarm alarm;
+ ktime_t exp;
+ int ret = 0;
+ struct restart_block *restart;
+
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
+ if (!capable(CAP_WAKE_ALARM))
+ return -EPERM;
+
+ alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+
+ exp = timespec_to_ktime(*tsreq);
+ /* Convert (if necessary) to absolute time */
+ if (flags != TIMER_ABSTIME) {
+ ktime_t now = alarm_bases[type].gettime();
+ exp = ktime_add(now, exp);
+ }
+
+ /* Non-zero means the alarm fired: the sleep is complete */
+ if (alarmtimer_do_nsleep(&alarm, exp))
+ goto out;
+
+ if (freezing(current))
+ alarmtimer_freezerset(exp, type);
+
+ /* abs timers don't set remaining time or restart */
+ if (flags == TIMER_ABSTIME) {
+ ret = -ERESTARTNOHAND;
+ goto out;
+ }
+
+ if (rmtp) {
+ ret = update_rmtp(exp, type, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+ /* Stash what alarm_timer_nsleep_restart() needs to resume the sleep */
+ restart = &current_thread_info()->restart_block;
+ restart->fn = alarm_timer_nsleep_restart;
+ restart->nanosleep.clockid = type;
+ restart->nanosleep.expires = exp.tv64;
+ restart->nanosleep.rmtp = rmtp;
+ ret = -ERESTART_RESTARTBLOCK;
+
+out:
+ return ret;
+}
+
+
+/*
+ * Suspend hook structures: both callbacks are referenced regardless of
+ * CONFIG_RTC_CLASS.
+ */
+static const struct dev_pm_ops alarmtimer_pm_ops = {
+ .suspend = alarmtimer_suspend,
+ .resume = alarmtimer_resume,
+};
+
+/* Platform driver carrying the pm ops; its name matches the device registered in alarmtimer_init() */
+static struct platform_driver alarmtimer_driver = {
+ .driver = {
+ .name = "alarmtimer",
+ .pm = &alarmtimer_pm_ops,
+ }
+};
+
+/**
+ * alarmtimer_init - Initialize alarm timer code
+ *
+ * This function initializes the alarm bases and registers
+ * the posix clock ids. It also registers the rtc class interface, the
+ * "alarmtimer" platform driver and device, and the wakeup source.
+ *
+ * Returns 0 on success or the error of the registration step that
+ * failed; the driver and class interface are unwound on failure.
+ */
+static int __init alarmtimer_init(void)
+{
+ struct platform_device *pdev;
+ int error = 0;
+ int i;
+ /* The same k_clock is registered for both _ALARM clockids below */
+ struct k_clock alarm_clock = {
+ .clock_getres = alarm_clock_getres,
+ .clock_get = alarm_clock_get,
+ .timer_create = alarm_timer_create,
+ .timer_set = alarm_timer_set,
+ .timer_del = alarm_timer_del,
+ .timer_get = alarm_timer_get,
+ .nsleep = alarm_timer_nsleep,
+ };
+
+ alarmtimer_rtc_timer_init();
+
+ posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
+ posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
+
+ /* Initialize alarm bases */
+ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
+ alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
+ alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
+ for (i = 0; i < ALARM_NUMTYPE; i++) {
+ timerqueue_init_head(&alarm_bases[i].timerqueue);
+ spin_lock_init(&alarm_bases[i].lock);
+ hrtimer_init(&alarm_bases[i].timer,
+ alarm_bases[i].base_clockid,
+ HRTIMER_MODE_ABS);
+ alarm_bases[i].timer.function = alarmtimer_fired;
+ }
+
+ error = alarmtimer_rtc_interface_setup();
+ if (error)
+ return error;
+
+ error = platform_driver_register(&alarmtimer_driver);
+ if (error)
+ goto out_if;
+
+ pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
+ if (IS_ERR(pdev)) {
+ error = PTR_ERR(pdev);
+ goto out_drv;
+ }
+ /* NOTE(review): the result of wakeup_source_register() is not checked here */
+ ws = wakeup_source_register("alarmtimer");
+ return 0;
+
+out_drv:
+ platform_driver_unregister(&alarmtimer_driver);
+out_if:
+ alarmtimer_rtc_interface_remove();
+ return error;
+}
+device_initcall(alarmtimer_init);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 00000000..9cd928f7
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,441 @@
+/*
+ * linux/kernel/time/clockevents.c
+ *
+ * This file contains functions which manage clock event devices.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+
+#include "tick-internal.h"
+
+/*
+ * The registered clock event devices. Entries found on
+ * clockevents_released are moved onto clockevent_devices by
+ * clockevents_notify_released().
+ */
+static LIST_HEAD(clockevent_devices);
+static LIST_HEAD(clockevents_released);
+
+/* Notification for clock events */
+static RAW_NOTIFIER_HEAD(clockevents_chain);
+
+/* Protection for the above */
+static DEFINE_RAW_SPINLOCK(clockevents_lock);
+
+/**
+ * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * @latch: value to convert
+ * @evt: pointer to clock event device descriptor
+ *
+ * Math helper, returns latch value converted to nanoseconds (bound checked):
+ * the result is clamped to the range [1000, KTIME_MAX]. A zero
+ * evt->mult is replaced by 1 with a warning before dividing.
+ */
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+{
+ u64 clc = (u64) latch << evt->shift;
+
+ /* Avoid dividing by zero below */
+ if (unlikely(!evt->mult)) {
+ evt->mult = 1;
+ WARN_ON(1);
+ }
+
+ do_div(clc, evt->mult);
+ if (clc < 1000)
+ clc = 1000;
+ if (clc > KTIME_MAX)
+ clc = KTIME_MAX;
+
+ return clc;
+}
+EXPORT_SYMBOL_GPL(clockevent_delta2ns);
+
+/**
+ * clockevents_set_mode - switch a clock event device to a new mode
+ * @dev: device to modify
+ * @mode: mode to switch to
+ *
+ * Does nothing when the device is already in @mode.
+ *
+ * Must be called with interrupts disabled !
+ */
+void clockevents_set_mode(struct clock_event_device *dev,
+				 enum clock_event_mode mode)
+{
+	if (dev->mode == mode)
+		return;
+
+	dev->set_mode(mode, dev);
+	dev->mode = mode;
+
+	if (mode != CLOCK_EVT_MODE_ONESHOT)
+		return;
+
+	/*
+	 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
+	 * on it, so fix it up and emit a warning:
+	 */
+	if (unlikely(!dev->mult)) {
+		dev->mult = 1;
+		WARN_ON(1);
+	}
+}
+
+/**
+ * clockevents_shutdown - shutdown the device and clear next_event
+ * @dev: device to shutdown
+ *
+ * Switches @dev to CLOCK_EVT_MODE_SHUTDOWN and sets its next_event to
+ * KTIME_MAX.
+ */
+void clockevents_shutdown(struct clock_event_device *dev)
+{
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ dev->next_event.tv64 = KTIME_MAX;
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
+
+/* Limit min_delta to a jiffie, expressed in nanoseconds */
+#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
+
+/**
+ * clockevents_increase_min_delta - raise minimum delta of a clock event device
+ * @dev: device to increase the minimum delta
+ *
+ * Raises min_delta_ns to 5000 if it was below that, otherwise grows it
+ * by half of its current value; the result is capped at
+ * MIN_DELTA_LIMIT.
+ *
+ * Returns 0 on success, -ETIME when the minimum delta reached the limit.
+ */
+static int clockevents_increase_min_delta(struct clock_event_device *dev)
+{
+ /* Nothing to do if we already reached the limit */
+ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
+ printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
+ dev->next_event.tv64 = KTIME_MAX;
+ return -ETIME;
+ }
+
+ if (dev->min_delta_ns < 5000)
+ dev->min_delta_ns = 5000;
+ else
+ dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+ if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+ dev->min_delta_ns = MIN_DELTA_LIMIT;
+
+ printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
+ dev->name ? dev->name : "?",
+ (unsigned long long) dev->min_delta_ns);
+ return 0;
+}
+
+/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev: device to program
+ *
+ * Variant built with CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST: after three
+ * failed attempts with the current min_delta_ns the minimum delta is
+ * increased and programming is retried.
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+ unsigned long long clc;
+ int64_t delta;
+ int i;
+
+ for (i = 0;;) {
+ delta = dev->min_delta_ns;
+ dev->next_event = ktime_add_ns(ktime_get(), delta);
+
+ /* A shut down device is not programmed; next_event is still updated */
+ if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ return 0;
+
+ dev->retries++;
+ /* Convert nanoseconds to device ticks */
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ if (dev->set_next_event((unsigned long) clc, dev) == 0)
+ return 0;
+
+ if (++i > 2) {
+ /*
+ * We tried 3 times to program the device with the
+ * given min_delta_ns. Try to increase the minimum
+ * delta, if that fails as well get out of here.
+ */
+ if (clockevents_increase_min_delta(dev))
+ return -ETIME;
+ i = 0;
+ }
+ }
+}
+
+#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+
+/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev: device to program
+ *
+ * Variant built without CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST: a single
+ * programming attempt is made.
+ *
+ * Returns 0 if the device is shut down, otherwise the result of the
+ * device's set_next_event() callback.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+ unsigned long long clc;
+ int64_t delta;
+
+ delta = dev->min_delta_ns;
+ dev->next_event = ktime_add_ns(ktime_get(), delta);
+
+ /* A shut down device is not programmed; next_event is still updated */
+ if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ return 0;
+
+ dev->retries++;
+ /* Convert nanoseconds to device ticks */
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ return dev->set_next_event((unsigned long) clc, dev);
+}
+
+#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+
+/**
+ * clockevents_program_event - Reprogram the clock event device.
+ * @dev: device to program
+ * @expires: absolute expiry time (monotonic clock)
+ * @force: program minimum delay if expires can not be set
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
+ * A negative @expires is rejected with a one-time warning and -ETIME.
+ */
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+ bool force)
+{
+ unsigned long long clc;
+ int64_t delta;
+ int rc;
+
+ if (unlikely(expires.tv64 < 0)) {
+ WARN_ON_ONCE(1);
+ return -ETIME;
+ }
+
+ dev->next_event = expires;
+
+ /* A shut down device only records the expiry */
+ if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ return 0;
+
+ /* Shortcut for clockevent devices that can deal with ktime. */
+ if (dev->features & CLOCK_EVT_FEAT_KTIME)
+ return dev->set_next_ktime(expires, dev);
+
+ delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
+ if (delta <= 0)
+ return force ? clockevents_program_min_delta(dev) : -ETIME;
+
+ /* Clamp the delta to the range the device supports */
+ delta = min(delta, (int64_t) dev->max_delta_ns);
+ delta = max(delta, (int64_t) dev->min_delta_ns);
+
+ /* Convert nanoseconds to device ticks */
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ rc = dev->set_next_event((unsigned long) clc, dev);
+
+ return (rc && force) ? clockevents_program_min_delta(dev) : rc;
+}
+
+/**
+ * clockevents_register_notifier - register a clock events change listener
+ * @nb: notifier block to add to clockevents_chain
+ *
+ * Returns the result of raw_notifier_chain_register(), which is called
+ * with clockevents_lock held.
+ */
+int clockevents_register_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&clockevents_lock, flags);
+ ret = raw_notifier_chain_register(&clockevents_chain, nb);
+ raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Notify about a clock event change. Called with clockevents_lock
+ * held. @reason and @dev are passed unchanged to every notifier on
+ * clockevents_chain.
+ */
+static void clockevents_do_notify(unsigned long reason, void *dev)
+{
+ raw_notifier_call_chain(&clockevents_chain, reason, dev);
+}
+
+/*
+ * Called after a notify add to make devices available which were
+ * released from the notifier call. Each device on clockevents_released
+ * is moved to clockevent_devices and announced with
+ * CLOCK_EVT_NOTIFY_ADD until the released list is empty.
+ */
+static void clockevents_notify_released(void)
+{
+ struct clock_event_device *dev;
+
+ while (!list_empty(&clockevents_released)) {
+ dev = list_entry(clockevents_released.next,
+ struct clock_event_device, list);
+ list_del(&dev->list);
+ list_add(&dev->list, &clockevent_devices);
+ clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+ }
+}
+
+/**
+ * clockevents_register_device - register a clock event device
+ * @dev:	device to register; must be in CLOCK_EVT_MODE_UNUSED
+ *
+ * Adds @dev to the list of clock event devices and runs the
+ * CLOCK_EVT_NOTIFY_ADD notification for it and for any devices waiting
+ * on the released list.
+ */
+void clockevents_register_device(struct clock_event_device *dev)
+{
+	unsigned long irqflags;
+
+	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+
+	/* No cpumask supplied: use the current CPU, warn if more are possible. */
+	if (!dev->cpumask) {
+		WARN_ON(num_possible_cpus() > 1);
+		dev->cpumask = cpumask_of(smp_processor_id());
+	}
+
+	raw_spin_lock_irqsave(&clockevents_lock, irqflags);
+
+	list_add(&dev->list, &clockevent_devices);
+	clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+	clockevents_notify_released();
+
+	raw_spin_unlock_irqrestore(&clockevents_lock, irqflags);
+}
+EXPORT_SYMBOL_GPL(clockevents_register_device);
+
+/*
+ * Derive mult/shift and the nanosecond delta limits of a oneshot
+ * capable device from its frequency and tick limits. Devices without
+ * CLOCK_EVT_FEAT_ONESHOT are left untouched.
+ */
+static void clockevents_config(struct clock_event_device *dev,
+			       u32 freq)
+{
+	u64 max_sec;
+
+	if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+		return;
+
+	/*
+	 * Maximum number of seconds we can sleep. At least one second is
+	 * always used, and hardware which can program more than 32bit
+	 * ticks is limited to 10 minutes so the conversion values stay
+	 * reasonable.
+	 */
+	max_sec = dev->max_delta_ticks;
+	do_div(max_sec, freq);
+	if (max_sec == 0)
+		max_sec = 1;
+	else if (max_sec > 600 && dev->max_delta_ticks > UINT_MAX)
+		max_sec = 600;
+
+	clockevents_calc_mult_shift(dev, freq, max_sec);
+
+	/* The ns limits depend on the mult/shift pair computed above. */
+	dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
+	dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
+}
+
+/**
+ * clockevents_config_and_register - Configure and register a clock event device
+ * @dev: device to register
+ * @freq: The clock frequency
+ * @min_delta: The minimum clock ticks to program in oneshot mode
+ * @max_delta: The maximum clock ticks to program in oneshot mode
+ *
+ * min/max_delta can be 0 for devices which do not support oneshot mode.
+ *
+ * Stores the tick limits in @dev, runs clockevents_config() and then
+ * clockevents_register_device().
+ */
+void clockevents_config_and_register(struct clock_event_device *dev,
+ u32 freq, unsigned long min_delta,
+ unsigned long max_delta)
+{
+ /* The tick limits must be in place before clockevents_config() reads them. */
+ dev->min_delta_ticks = min_delta;
+ dev->max_delta_ticks = max_delta;
+ clockevents_config(dev, freq);
+ clockevents_register_device(dev);
+}
+
+/**
+ * clockevents_update_freq - Update frequency and reprogram a clock event device.
+ * @dev:	device to modify
+ * @freq:	new device frequency
+ *
+ * Reconfigure and reprogram a clock event device in oneshot
+ * mode. Must be called on the cpu for which the device delivers per
+ * cpu timer events with interrupts disabled! Returns 0 on success,
+ * -ETIME when the event is in the past.
+ */
+int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
+{
+	clockevents_config(dev, freq);
+
+	/* Only a device in oneshot mode is reprogrammed with its stored expiry. */
+	if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+		return clockevents_program_event(dev, dev->next_event, false);
+
+	return 0;
+}
+
+/*
+ * Noop handler when we shut down an event device.
+ *
+ * @dev: ignored; the function body is intentionally empty.
+ */
+void clockevents_handle_noop(struct clock_event_device *dev)
+{
+}
+
+/**
+ * clockevents_exchange_device - release and request clock devices
+ * @old:	device to release (can be NULL)
+ * @new:	device to request (can be NULL)
+ *
+ * Called from the notifier chain. clockevents_lock is held already
+ */
+void clockevents_exchange_device(struct clock_event_device *old,
+				 struct clock_event_device *new)
+{
+	unsigned long irqflags;
+
+	local_irq_save(irqflags);
+
+	/*
+	 * A released device is switched to CLOCK_EVT_MODE_UNUSED and
+	 * parked on the released list; the notify add happens later.
+	 */
+	if (old) {
+		clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+		list_move(&old->list, &clockevents_released);
+	}
+
+	/* A requested device must be unused; it is shut down before use. */
+	if (new) {
+		BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+		clockevents_shutdown(new);
+	}
+
+	local_irq_restore(irqflags);
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+/**
+ * clockevents_notify - notification about relevant events
+ * @reason:	event code passed on to the notifier chain
+ * @arg:	event argument; read as a pointer to the CPU number (int)
+ *		when @reason is CLOCK_EVT_NOTIFY_CPU_DEAD
+ */
+void clockevents_notify(unsigned long reason, void *arg)
+{
+	struct clock_event_device *ced, *next;
+	unsigned long irqflags;
+	int dead_cpu;
+
+	raw_spin_lock_irqsave(&clockevents_lock, irqflags);
+	clockevents_do_notify(reason, arg);
+
+	/* Only CPU_DEAD needs cleanup beyond the notifier call. */
+	if (reason != CLOCK_EVT_NOTIFY_CPU_DEAD)
+		goto unlock;
+
+	/*
+	 * Unregister the clock event devices which were
+	 * released from the users in the notify chain.
+	 */
+	list_for_each_entry_safe(ced, next, &clockevents_released, list) {
+		list_del(&ced->list);
+	}
+
+	/*
+	 * Now check whether the CPU has left unused per cpu devices:
+	 * devices bound to exactly this CPU that are not the broadcast
+	 * device are dropped from the device list.
+	 */
+	dead_cpu = *((int *)arg);
+	list_for_each_entry_safe(ced, next, &clockevent_devices, list) {
+		if (!cpumask_test_cpu(dead_cpu, ced->cpumask))
+			continue;
+		if (cpumask_weight(ced->cpumask) != 1)
+			continue;
+		if (tick_is_broadcast_device(ced))
+			continue;
+
+		BUG_ON(ced->mode != CLOCK_EVT_MODE_UNUSED);
+		list_del(&ced->list);
+	}
+
+unlock:
+	raw_spin_unlock_irqrestore(&clockevents_lock, irqflags);
+}
+EXPORT_SYMBOL_GPL(clockevents_notify);
+#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 00000000..3a7d0fde
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,959 @@
+/*
+ * linux/kernel/time/clocksource.c
+ *
+ * This file contains the functions which manage clocksource drivers.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * TODO WishList:
+ * o Allow clocksource drivers to be unregistered
+ */
+
+#include <linux/device.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
+#include <linux/kthread.h>
+
+/*
+ * timecounter_init - set up a time counter
+ * @tc:		time counter to initialize
+ * @cc:		cycle counter backing @tc
+ * @start_tstamp:	initial nanosecond value stored in @tc
+ *
+ * The current reading of @cc becomes the reference cycle value.
+ */
+void timecounter_init(struct timecounter *tc,
+		      const struct cyclecounter *cc,
+		      u64 start_tstamp)
+{
+	tc->nsec = start_tstamp;
+	tc->cc = cc;
+	tc->cycle_last = cc->read(cc);
+}
+EXPORT_SYMBOL_GPL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc:	Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+	const struct cyclecounter *cc = tc->cc;
+	cycle_t now, elapsed;
+	u64 elapsed_ns;
+
+	now = cc->read(cc);
+
+	/* Masking makes the subtraction work across a counter wrap. */
+	elapsed = (now - tc->cycle_last) & cc->mask;
+	elapsed_ns = cyclecounter_cyc2ns(cc, elapsed);
+
+	/* The next call measures from this reading. */
+	tc->cycle_last = now;
+
+	return elapsed_ns;
+}
+
+/*
+ * timecounter_read - return the current nanosecond value of a time counter
+ * @tc:	Pointer to time counter
+ *
+ * Advances tc->nsec by the nanoseconds elapsed since the previous read
+ * and returns the updated value.
+ */
+u64 timecounter_read(struct timecounter *tc)
+{
+	u64 elapsed_ns = timecounter_read_delta(tc);
+
+	tc->nsec += elapsed_ns;
+
+	return tc->nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_read);
+
+/*
+ * timecounter_cyc2time - convert a cycle counter value to nanoseconds
+ * @tc:		Pointer to time counter
+ * @cycle_tstamp:	cycle counter value to convert
+ *
+ * The result is relative to tc->nsec / tc->cycle_last. A time stamp
+ * that would lie more than half the counter range in the future is
+ * treated as an old time stamp instead.
+ */
+u64 timecounter_cyc2time(struct timecounter *tc,
+			 cycle_t cycle_tstamp)
+{
+	const struct cyclecounter *cc = tc->cc;
+	u64 ahead = (cycle_tstamp - tc->cycle_last) & cc->mask;
+	u64 behind;
+
+	/* Within half the range: the stamp is newer than cycle_last. */
+	if (ahead <= cc->mask / 2)
+		return cyclecounter_cyc2ns(cc, ahead) + tc->nsec;
+
+	/* Too far in the future: interpret it as lying in the past. */
+	behind = (tc->cycle_last - cycle_tstamp) & cc->mask;
+
+	return tc->nsec - cyclecounter_cyc2ns(cc, behind);
+}
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
+
+/**
+ * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
+ * @mult:	pointer to mult variable
+ * @shift:	pointer to shift variable
+ * @from:	frequency to convert from
+ * @to:		frequency to convert to
+ * @maxsec:	guaranteed runtime conversion range in seconds
+ *
+ * The function evaluates the shift/mult pair for the scaled math
+ * operations of clocksources and clockevents.
+ *
+ * @to and @from are frequency values in HZ. For clock sources @to is
+ * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
+ * event @to is the counter frequency and @from is NSEC_PER_SEC.
+ *
+ * The @maxsec conversion range argument controls the time frame in
+ * seconds which must be covered by the runtime conversion with the
+ * calculated mult and shift factors. This guarantees that no 64bit
+ * overflow happens when the input value of the conversion is
+ * multiplied with the calculated mult factor. Larger ranges may
+ * reduce the conversion accuracy by choosing smaller mult and shift
+ * factors.
+ */
+void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
+{
+	u32 candidate_shift;
+	u32 max_mult_bits = 32;
+	u64 scaled;
+
+	/*
+	 * Every bit that maxsec * from needs above 32 bits takes one bit
+	 * away from the room available for the multiplier.
+	 */
+	for (scaled = ((u64)maxsec * from) >> 32; scaled; scaled >>= 1)
+		max_mult_bits--;
+
+	/*
+	 * Walk the shift down from 32 and stop at the first (largest)
+	 * one whose rounded multiplier fits into max_mult_bits.
+	 */
+	for (candidate_shift = 32; candidate_shift > 0; candidate_shift--) {
+		scaled = (u64) to << candidate_shift;
+		scaled += from / 2;
+		do_div(scaled, from);
+		if ((scaled >> max_mult_bits) == 0)
+			break;
+	}
+
+	*mult = scaled;
+	*shift = candidate_shift;
+}
+
+/*[Clocksource internal variables]---------
+ * curr_clocksource:
+ * currently selected clocksource.
+ * clocksource_list:
+ * linked list with the registered clocksources
+ * clocksource_mutex:
+ * protects manipulations to curr_clocksource and the clocksource_list
+ * override_name:
+ * Name of the user-specified clocksource.
+ * finished_booting:
+ * set to 1 by clocksource_done_booting(); clocksource_select() returns
+ * without doing anything while it is still 0.
+ */
+static struct clocksource *curr_clocksource;
+static LIST_HEAD(clocksource_list);
+static DEFINE_MUTEX(clocksource_mutex);
+static char override_name[32];
+static int finished_booting;
+
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+/* Work handler that starts the watchdog kthread; defined below. */
+static void clocksource_watchdog_work(struct work_struct *work);
+
+/* Clocksources being checked by the watchdog, linked through wd_list. */
+static LIST_HEAD(watchdog_list);
+/* Clocksource the watched ones are compared against (NULL if none). */
+static struct clocksource *watchdog;
+/* Timer that runs clocksource_watchdog() every WATCHDOG_INTERVAL. */
+static struct timer_list watchdog_timer;
+static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
+/* Taken around accesses to the watchdog list and state in this file. */
+static DEFINE_SPINLOCK(watchdog_lock);
+/* Non-zero while watchdog_timer is armed. */
+static int watchdog_running;
+/* Non-zero requests that the stored watchdog readings be re-sampled. */
+static atomic_t watchdog_reset_pending;
+
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
+/*
+ * Interval: 0.5sec Threshold: 0.0625s
+ *
+ * WATCHDOG_INTERVAL is in jiffies, WATCHDOG_THRESHOLD in nanoseconds.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
+
+/*
+ * Work callback: start a "kwatchdog" kernel thread that runs
+ * clocksource_watchdog_kthread(). The result of kthread_run() is not
+ * checked.
+ */
+static void clocksource_watchdog_work(struct work_struct *work)
+{
+ /*
+ * If kthread_run fails the next watchdog scan over the
+ * watchdog_list will find the unstable clock again.
+ */
+ kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+
+/*
+ * Flag @cs as unstable: CLOCK_SOURCE_VALID_FOR_HRES and
+ * CLOCK_SOURCE_WATCHDOG are cleared, CLOCK_SOURCE_UNSTABLE is set.
+ * Once booting has finished the watchdog work is scheduled as well.
+ */
+static void __clocksource_unstable(struct clocksource *cs)
+{
+	cs->flags = (cs->flags &
+		     ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG)) |
+		    CLOCK_SOURCE_UNSTABLE;
+
+	if (!finished_booting)
+		return;
+
+	schedule_work(&watchdog_work);
+}
+
+/*
+ * Log a warning naming @cs and the measured @delta (in ns), then mark
+ * the clocksource unstable via __clocksource_unstable().
+ */
+static void clocksource_unstable(struct clocksource *cs, int64_t delta)
+{
+ printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
+ cs->name, delta);
+ __clocksource_unstable(cs);
+}
+
+/**
+ * clocksource_mark_unstable - mark clocksource unstable via watchdog
+ * @cs:		clocksource to be marked unstable
+ *
+ * This function is called instead of clocksource_change_rating from
+ * cpu hotplug code to avoid a deadlock between the clocksource mutex
+ * and the cpu hotplug mutex. It defers the update of the clocksource
+ * to the watchdog thread.
+ */
+void clocksource_mark_unstable(struct clocksource *cs)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&watchdog_lock, irqflags);
+
+	/* Nothing to do if it is flagged unstable already. */
+	if (cs->flags & CLOCK_SOURCE_UNSTABLE)
+		goto unlock;
+
+	/* Put it on the watchdog list unless it is linked into a list already. */
+	if (list_empty(&cs->wd_list))
+		list_add(&cs->wd_list, &watchdog_list);
+	__clocksource_unstable(cs);
+
+unlock:
+	spin_unlock_irqrestore(&watchdog_lock, irqflags);
+}
+
+/*
+ * clocksource_watchdog - periodic watchdog timer callback
+ * @data:	unused
+ *
+ * Walks watchdog_list and compares, for every watched clocksource, the
+ * nanoseconds elapsed since the previous run with the nanoseconds
+ * elapsed on the watchdog clocksource. A clocksource that deviates by
+ * more than WATCHDOG_THRESHOLD is marked unstable. Afterwards the timer
+ * is re-armed on the next online CPU.
+ */
+static void clocksource_watchdog(unsigned long data)
+{
+	struct clocksource *cs;
+	cycle_t csnow, wdnow;
+	int64_t wd_nsec, cs_nsec, delta;
+	int next_cpu, reset_pending;
+
+	spin_lock(&watchdog_lock);
+	if (!watchdog_running)
+		goto out;
+
+	/* Sampled once; decides below whether the pending count is dropped. */
+	reset_pending = atomic_read(&watchdog_reset_pending);
+
+	list_for_each_entry(cs, &watchdog_list, wd_list) {
+
+		/* Clocksource already marked unstable? */
+		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+			if (finished_booting)
+				schedule_work(&watchdog_work);
+			continue;
+		}
+
+		/* Read both counters back to back with interrupts off. */
+		local_irq_disable();
+		csnow = cs->read(cs);
+		wdnow = watchdog->read(watchdog);
+		local_irq_enable();
+
+		/* Clocksource initialized ? */
+		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
+		    atomic_read(&watchdog_reset_pending)) {
+			cs->flags |= CLOCK_SOURCE_WATCHDOG;
+			cs->wd_last = wdnow;
+			cs->cs_last = csnow;
+			continue;
+		}
+
+		wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
+					     watchdog->mult, watchdog->shift);
+
+		cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
+					     cs->mask, cs->mult, cs->shift);
+		cs->cs_last = csnow;
+		cs->wd_last = wdnow;
+
+		/* A reset was requested meanwhile: skip the comparison. */
+		if (atomic_read(&watchdog_reset_pending))
+			continue;
+
+		/*
+		 * Check the deviation from the watchdog clocksource. The
+		 * difference is a 64-bit nanosecond value; it is compared
+		 * against both bounds explicitly instead of going through
+		 * abs(), which is not meant for 64-bit types and can
+		 * truncate the value where long is 32 bits wide.
+		 */
+		delta = cs_nsec - wd_nsec;
+		if (delta > WATCHDOG_THRESHOLD || delta < -WATCHDOG_THRESHOLD) {
+			clocksource_unstable(cs, delta);
+			continue;
+		}
+
+		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+			/*
+			 * We just marked the clocksource as highres-capable,
+			 * notify the rest of the system as well so that we
+			 * transition into high-res mode:
+			 */
+			tick_clock_notify();
+		}
+	}
+
+	/*
+	 * We only clear the watchdog_reset_pending, when we did a
+	 * full cycle through all clocksources.
+	 */
+	if (reset_pending)
+		atomic_dec(&watchdog_reset_pending);
+
+	/*
+	 * Cycle through CPUs to check if the CPUs stay synchronized
+	 * to each other.
+	 */
+	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(cpu_online_mask);
+	watchdog_timer.expires += WATCHDOG_INTERVAL;
+	add_timer_on(&watchdog_timer, next_cpu);
+out:
+	spin_unlock(&watchdog_lock);
+}
+
+/*
+ * Arm the watchdog timer on the first online CPU. Does nothing when
+ * the timer is already running, no watchdog clocksource is set or
+ * nothing is on the watchdog list.
+ */
+static inline void clocksource_start_watchdog(void)
+{
+	if (watchdog_running)
+		return;
+	if (!watchdog || list_empty(&watchdog_list))
+		return;
+
+	init_timer(&watchdog_timer);
+	watchdog_timer.function = clocksource_watchdog;
+	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+
+	watchdog_running = 1;
+}
+
+/*
+ * Delete the watchdog timer. Does nothing when the timer is not
+ * running, or when there is still a watchdog clocksource with entries
+ * on the watchdog list.
+ */
+static inline void clocksource_stop_watchdog(void)
+{
+	if (!watchdog_running)
+		return;
+	if (watchdog && !list_empty(&watchdog_list))
+		return;
+
+	del_timer(&watchdog_timer);
+	watchdog_running = 0;
+}
+
+/*
+ * Clear CLOCK_SOURCE_WATCHDOG on every entry of watchdog_list, so the
+ * next watchdog run stores fresh readings for it instead of comparing.
+ */
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+/*
+ * Request a watchdog reset by incrementing watchdog_reset_pending;
+ * clocksource_watchdog() re-samples its stored readings while the
+ * counter is non-zero.
+ */
+static void clocksource_resume_watchdog(void)
+{
+ atomic_inc(&watchdog_reset_pending);
+}
+
+/*
+ * Hook a newly registered clocksource into the watchdog machinery.
+ * A clocksource with CLOCK_SOURCE_MUST_VERIFY is added to the list of
+ * watched clocksources; any other one is a watchdog candidate and
+ * replaces the current watchdog if none is set or its rating is higher.
+ */
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&watchdog_lock, irqflags);
+
+	if (!(cs->flags & CLOCK_SOURCE_MUST_VERIFY)) {
+		/* cs is a watchdog. */
+		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+
+		/* Pick the best watchdog and reset the watchdog cycles. */
+		if (!watchdog || cs->rating > watchdog->rating) {
+			watchdog = cs;
+			clocksource_reset_watchdog();
+		}
+	} else {
+		/* cs is a clocksource to be watched. */
+		list_add(&cs->wd_list, &watchdog_list);
+		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+	}
+
+	/* Check if the watchdog timer needs to be started. */
+	clocksource_start_watchdog();
+
+	spin_unlock_irqrestore(&watchdog_lock, irqflags);
+}
+
+/*
+ * Remove a clocksource from the watchdog machinery. A watched
+ * clocksource is unlinked from the watchdog list; if @cs is the
+ * current watchdog, the best rated remaining candidate from
+ * clocksource_list takes over (possibly none).
+ */
+static void clocksource_dequeue_watchdog(struct clocksource *cs)
+{
+	struct clocksource *candidate;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&watchdog_lock, irqflags);
+
+	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+		/* cs is a watched clocksource. */
+		list_del_init(&cs->wd_list);
+	} else if (cs == watchdog) {
+		/* Reset watchdog cycles */
+		clocksource_reset_watchdog();
+
+		/* Current watchdog is removed. Find an alternative. */
+		watchdog = NULL;
+		list_for_each_entry(candidate, &clocksource_list, list) {
+			if (candidate == cs)
+				continue;
+			if (candidate->flags & CLOCK_SOURCE_MUST_VERIFY)
+				continue;
+			if (!watchdog || candidate->rating > watchdog->rating)
+				watchdog = candidate;
+		}
+	}
+
+	cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+
+	/* Check if the watchdog timer needs to be stopped. */
+	clocksource_stop_watchdog();
+
+	spin_unlock_irqrestore(&watchdog_lock, irqflags);
+}
+
+/*
+ * Collect every clocksource on the watchdog list that is flagged
+ * CLOCK_SOURCE_UNSTABLE and change its rating to 0. Always returns 0.
+ * @data is unused.
+ */
+static int clocksource_watchdog_kthread(void *data)
+{
+	struct clocksource *cs, *next;
+	unsigned long irqflags;
+	LIST_HEAD(unstable);
+
+	mutex_lock(&clocksource_mutex);
+
+	/* Move the unstable entries to a private list under watchdog_lock. */
+	spin_lock_irqsave(&watchdog_lock, irqflags);
+	list_for_each_entry_safe(cs, next, &watchdog_list, wd_list) {
+		if (!(cs->flags & CLOCK_SOURCE_UNSTABLE))
+			continue;
+		list_move(&cs->wd_list, &unstable);
+	}
+	/* Check if the watchdog timer needs to be stopped. */
+	clocksource_stop_watchdog();
+	spin_unlock_irqrestore(&watchdog_lock, irqflags);
+
+	/* Needs to be done outside of watchdog lock */
+	list_for_each_entry_safe(cs, next, &unstable, wd_list) {
+		list_del_init(&cs->wd_list);
+		__clocksource_change_rating(cs, 0);
+	}
+
+	mutex_unlock(&clocksource_mutex);
+
+	return 0;
+}
+
+#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+/*
+ * Variants used when CONFIG_CLOCKSOURCE_WATCHDOG is not set: a
+ * continuous clocksource is marked valid for high resolution mode right
+ * away, the remaining watchdog hooks do nothing.
+ */
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
+{
+ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+}
+
+static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
+static inline void clocksource_resume_watchdog(void) { }
+static inline int clocksource_watchdog_kthread(void *data) { return 0; }
+
+#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+/**
+ * clocksource_suspend - suspend the clocksource(s)
+ *
+ * Calls the suspend callback of every registered clocksource that has
+ * one, walking clocksource_list in reverse order.
+ */
+void clocksource_suspend(void)
+{
+	struct clocksource *cs;
+
+	list_for_each_entry_reverse(cs, &clocksource_list, list) {
+		if (!cs->suspend)
+			continue;
+		cs->suspend(cs);
+	}
+}
+
+/**
+ * clocksource_resume - resume the clocksource(s)
+ *
+ * Calls the resume callback of every registered clocksource that has
+ * one, then requests a watchdog reset.
+ */
+void clocksource_resume(void)
+{
+	struct clocksource *cs;
+
+	list_for_each_entry(cs, &clocksource_list, list) {
+		if (!cs->resume)
+			continue;
+		cs->resume(cs);
+	}
+
+	clocksource_resume_watchdog();
+}
+
+/**
+ * clocksource_touch_watchdog - Update watchdog
+ *
+ * Update the watchdog after exception contexts such as kgdb so as not
+ * to incorrectly trip the watchdog. This might fail when the kernel
+ * was stopped in code which holds watchdog_lock.
+ *
+ * Implemented as a call to clocksource_resume_watchdog().
+ */
+void clocksource_touch_watchdog(void)
+{
+ clocksource_resume_watchdog();
+}
+
+/**
+ * clocksource_max_adjustment - Returns max adjustment amount
+ * @cs:	Pointer to clocksource
+ *
+ * The result is 11% of cs->mult, truncated to 32 bits.
+ */
+static u32 clocksource_max_adjustment(struct clocksource *cs)
+{
+	/* We won't try to correct for more than 11% adjustments (110,000 ppm). */
+	u64 max_adj = (u64)cs->mult * 11;
+
+	do_div(max_adj, 100);
+
+	return (u32)max_adj;
+}
+
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:	Pointer to clocksource
+ *
+ * Returns the deferment limit in nanoseconds, with a 12.5% safety
+ * margin already subtracted.
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+	u64 cycle_limit, ns_limit;
+
+	/*
+	 * Largest cycle count that can go through the cyc2ns conversion
+	 * without overflowing a 64-bit signed result:
+	 *     max_cycles < (2^63) / (cs->mult + cs->maxadj)
+	 *     max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
+	 *     max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
+	 * One is added to the result of the log2 to account for any
+	 * rounding errors, so the inequality is satisfied and no
+	 * overflow will occur.
+	 */
+	cycle_limit = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
+
+	/*
+	 * The clocksource can not be deferred for more cycles than its
+	 * mask holds either. The conversion uses mult - maxadj to make
+	 * sure we don't sleep for too long if there's a large negative
+	 * adjustment.
+	 */
+	cycle_limit = min_t(u64, cycle_limit, (u64) cs->mask);
+	ns_limit = clocksource_cyc2ns(cycle_limit, cs->mult - cs->maxadj,
+				      cs->shift);
+
+	/*
+	 * To ensure that the clocksource does not wrap whilst we are
+	 * idle, keep a margin of 12.5%. That margin is used because it
+	 * can be computed with a shift, versus say 10% which would
+	 * require division.
+	 */
+	ns_limit -= ns_limit >> 3;
+
+	return ns_limit;
+}
+
+#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
+
+/**
+ * clocksource_select - Select the best clocksource available
+ *
+ * Private function. Must hold clocksource_mutex when called.
+ *
+ * Select the clocksource with the best rating, or the clocksource,
+ * which is selected by userspace override.
+ */
+static void clocksource_select(void)
+{
+	struct clocksource *best, *cs;
+
+	if (!finished_booting || list_empty(&clocksource_list))
+		return;
+
+	/* First clocksource on the list has the best rating. */
+	best = list_first_entry(&clocksource_list, struct clocksource, list);
+
+	/* Look for the override clocksource; only the first match counts. */
+	list_for_each_entry(cs, &clocksource_list, list) {
+		if (strcmp(cs->name, override_name) == 0) {
+			/*
+			 * Don't switch to a non-highres capable
+			 * clocksource if the tick code is in oneshot
+			 * mode (highres or nohz).
+			 */
+			if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) ||
+			    !tick_oneshot_mode_active()) {
+				/* Override clocksource can be used. */
+				best = cs;
+			} else {
+				/* Override clocksource cannot be used. */
+				printk(KERN_WARNING "Override clocksource %s is not "
+				       "HRT compatible. Cannot switch while in "
+				       "HRT/NOHZ mode\n", cs->name);
+				override_name[0] = 0;
+			}
+			break;
+		}
+	}
+
+	if (curr_clocksource == best)
+		return;
+
+	/*
+	 * The informational "Switching to clocksource" printk is
+	 * disabled in this tree; the switch itself is silent.
+	 */
+	curr_clocksource = best;
+	timekeeping_notify(curr_clocksource);
+}
+
+#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
+
+/* CONFIG_ARCH_USES_GETTIMEOFFSET is set: clocksource selection is a no-op. */
+static inline void clocksource_select(void) { }
+
+#endif
+
+/*
+ * clocksource_done_booting - Called near the end of core bootup
+ *
+ * Hack to avoid lots of clocksource churn at boot time.
+ * We use fs_initcall because we want this to start before
+ * device_initcall but after subsys_initcall.
+ *
+ * Always returns 0.
+ */
+static int __init clocksource_done_booting(void)
+{
+ /* Start out with the default clock as the current clocksource. */
+ mutex_lock(&clocksource_mutex);
+ curr_clocksource = clocksource_default_clock();
+ mutex_unlock(&clocksource_mutex);
+
+ /* From here on clocksource_select() is allowed to do its work. */
+ finished_booting = 1;
+
+ /*
+ * Run the watchdog first to eliminate unstable clock sources
+ */
+ clocksource_watchdog_kthread(NULL);
+
+ mutex_lock(&clocksource_mutex);
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+ return 0;
+}
+fs_initcall(clocksource_done_booting);
+
+/*
+ * Enqueue the clocksource sorted by rating: @cs is linked in after the
+ * last list entry whose rating is greater than or equal to its own, or
+ * at the head of the list if there is no such entry.
+ */
+static void clocksource_enqueue(struct clocksource *cs)
+{
+	struct list_head *insert_after = &clocksource_list;
+	struct clocksource *pos;
+
+	list_for_each_entry(pos, &clocksource_list, list) {
+		/* Keep track of the place, where to insert */
+		if (pos->rating >= cs->rating)
+			insert_after = &pos->list;
+	}
+
+	list_add(&cs->list, insert_after);
+}
+
+/**
+ * __clocksource_updatefreq_scale - Used to update a clocksource with a new freq
+ * @cs:		clocksource to be registered
+ * @scale:	Scale factor multiplied against freq to get clocksource hz
+ * @freq:	clocksource frequency (cycles per second) divided by scale
+ *
+ * This should only be called from the clocksource->enable() method.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
+ */
+void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+	u64 max_sec;
+
+	/*
+	 * Calc the maximum number of seconds which we can run before
+	 * wrapping around. For clocksources which have a mask > 32bit
+	 * we need to limit the max sleep time to have a good
+	 * conversion precision. 10 minutes is still a reasonable
+	 * amount. That results in a shift value of 24 for a
+	 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
+	 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
+	 * margin as we do in clocksource_max_deferment()
+	 */
+	max_sec = cs->mask - (cs->mask >> 3);
+	do_div(max_sec, freq);
+	do_div(max_sec, scale);
+	if (max_sec == 0)
+		max_sec = 1;
+	else if (max_sec > 600 && cs->mask > UINT_MAX)
+		max_sec = 600;
+
+	clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+			       NSEC_PER_SEC / scale, max_sec * scale);
+
+	/*
+	 * Since mult may be adjusted by ntp, add a safety margin: for
+	 * clocksources that have large mults, halve mult (and drop one
+	 * shift bit) until neither mult + maxadj nor mult - maxadj
+	 * wraps around.
+	 */
+	for (;;) {
+		cs->maxadj = clocksource_max_adjustment(cs);
+		if (cs->mult + cs->maxadj >= cs->mult &&
+		    cs->mult - cs->maxadj <= cs->mult)
+			break;
+		cs->mult >>= 1;
+		cs->shift--;
+	}
+
+	cs->max_idle_ns = clocksource_max_deferment(cs);
+}
+EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+
+/**
+ * __clocksource_register_scale - Used to install new clocksources
+ * @cs: clocksource to be registered
+ * @scale: Scale factor multiplied against freq to get clocksource hz
+ * @freq: clocksource frequency (cycles per second) divided by scale
+ *
+ * Always returns zero in this implementation.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * clocksource_register_hz() or clocksource_register_khz helper functions.
+ */
+int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+
+ /* Initialize mult/shift and max_idle_ns */
+ __clocksource_updatefreq_scale(cs, scale, freq);
+
+ /* Add clocksource to the clocksource list */
+ mutex_lock(&clocksource_mutex);
+ clocksource_enqueue(cs);
+ clocksource_enqueue_watchdog(cs);
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__clocksource_register_scale);
+
+
+/**
+ * clocksource_register - Used to install new clocksources
+ * @cs: clocksource to be registered
+ *
+ * Uses the mult/shift values already present in @cs. Always returns
+ * zero in this implementation.
+ */
+int clocksource_register(struct clocksource *cs)
+{
+ /* calculate max adjustment for given mult/shift */
+ cs->maxadj = clocksource_max_adjustment(cs);
+ /* Warn once if mult + maxadj wraps around. */
+ WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+ "Clocksource %s might overflow on 11%% adjustment\n",
+ cs->name);
+
+ /* calculate max idle time permitted for this clocksource */
+ cs->max_idle_ns = clocksource_max_deferment(cs);
+
+ /* Add the clocksource to the list and re-run the selection. */
+ mutex_lock(&clocksource_mutex);
+ clocksource_enqueue(cs);
+ clocksource_enqueue_watchdog(cs);
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(clocksource_register);
+
+/*
+ * Set a new rating on @cs: the clocksource is unlinked, re-inserted at
+ * the position matching @rating, and the selection is re-run. The
+ * callers in this file hold clocksource_mutex.
+ */
+static void __clocksource_change_rating(struct clocksource *cs, int rating)
+{
+ list_del(&cs->list);
+ cs->rating = rating;
+ clocksource_enqueue(cs);
+ clocksource_select();
+}
+
+/**
+ * clocksource_change_rating - Change the rating of a registered clocksource
+ * @cs: clocksource to be changed
+ * @rating: new rating
+ *
+ * Takes clocksource_mutex around __clocksource_change_rating().
+ */
+void clocksource_change_rating(struct clocksource *cs, int rating)
+{
+ mutex_lock(&clocksource_mutex);
+ __clocksource_change_rating(cs, rating);
+ mutex_unlock(&clocksource_mutex);
+}
+EXPORT_SYMBOL(clocksource_change_rating);
+
+/**
+ * clocksource_unregister - remove a registered clocksource
+ * @cs: clocksource to be unregistered
+ *
+ * Removes @cs from the watchdog handling and from the clocksource
+ * list, then re-runs the clocksource selection.
+ */
+void clocksource_unregister(struct clocksource *cs)
+{
+ mutex_lock(&clocksource_mutex);
+ clocksource_dequeue_watchdog(cs);
+ list_del(&cs->list);
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+}
+EXPORT_SYMBOL(clocksource_unregister);
+
+#ifdef CONFIG_SYSFS
+/**
+ * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * @dev:	unused
+ * @attr:	unused
+ * @buf:	char buffer to be filled with the clocksource name
+ *
+ * Provides sysfs interface for listing current clocksource. Returns the
+ * value of snprintf() for the name followed by a newline.
+ */
+static ssize_t
+sysfs_show_current_clocksources(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	ssize_t len;
+
+	/* curr_clocksource is read under clocksource_mutex. */
+	mutex_lock(&clocksource_mutex);
+	len = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
+	mutex_unlock(&clocksource_mutex);
+
+	return len;
+}
+
+/**
+ * sysfs_override_clocksource - interface for manually overriding clocksource
+ * @dev:	unused
+ * @attr:	unused
+ * @buf:	name of override clocksource
+ * @count:	length of buffer
+ *
+ * Takes input from sysfs interface for manually overriding the default
+ * clocksource selection. A single trailing newline is stripped; writing
+ * just a newline clears the override.
+ *
+ * Returns @count on success, -EINVAL for an empty write or a name that
+ * does not fit into override_name.
+ */
+static ssize_t sysfs_override_clocksource(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	size_t ret = count;
+
+	/*
+	 * Strings from a sysfs write are not 0 terminated! Reject empty
+	 * writes (buf[count - 1] below would read before the buffer) and
+	 * names that leave no room for the terminator.
+	 */
+	if (!count || count >= sizeof(override_name))
+		return -EINVAL;
+
+	/* Strip off a trailing \n: */
+	if (buf[count - 1] == '\n')
+		count--;
+
+	mutex_lock(&clocksource_mutex);
+
+	if (count > 0)
+		memcpy(override_name, buf, count);
+	override_name[count] = 0;
+	clocksource_select();
+
+	mutex_unlock(&clocksource_mutex);
+
+	return ret;
+}
+
+/**
+ * sysfs_show_available_clocksources - sysfs interface for listing clocksource
+ * @dev:	unused
+ * @attr:	unused
+ * @buf:	char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing registered clocksources. The
+ * names are separated by blanks and the list ends with a newline.
+ */
+static ssize_t
+sysfs_show_available_clocksources(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct clocksource *cs;
+	ssize_t len = 0;
+
+	mutex_lock(&clocksource_mutex);
+	list_for_each_entry(cs, &clocksource_list, list) {
+		/*
+		 * Don't show non-HRES clocksource if the tick code is
+		 * in one shot mode (highres=on or nohz=on)
+		 */
+		if (tick_oneshot_mode_active() &&
+		    !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+			continue;
+
+		/* The size handed to snprintf() never goes below zero. */
+		len += snprintf(buf + len,
+				max((ssize_t)PAGE_SIZE - len, (ssize_t)0),
+				"%s ", cs->name);
+	}
+	mutex_unlock(&clocksource_mutex);
+
+	len += snprintf(buf + len,
+			max((ssize_t)PAGE_SIZE - len, (ssize_t)0), "\n");
+
+	return len;
+}
+
+/*
+ * Sysfs setup bits:
+ *
+ * current_clocksource is readable and writable (mode 0644),
+ * available_clocksource is read-only (mode 0444). Both are attached to
+ * device_clocksource by init_clocksource_sysfs().
+ */
+static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
+ sysfs_override_clocksource);
+
+static DEVICE_ATTR(available_clocksource, 0444,
+ sysfs_show_available_clocksources, NULL);
+
+/* Subsystem ("bus") the clocksource device is registered on. */
+static struct bus_type clocksource_subsys = {
+ .name = "clocksource",
+ .dev_name = "clocksource",
+};
+
+/* The single device (id 0) that carries the two attributes above. */
+static struct device device_clocksource = {
+ .id = 0,
+ .bus = &clocksource_subsys,
+};
+
+/*
+ * Register the clocksource subsystem, its device and the two sysfs
+ * attribute files. Stops at the first failing step and returns its
+ * error code; returns 0 when everything succeeded.
+ */
+static int __init init_clocksource_sysfs(void)
+{
+	int error;
+
+	error = subsys_system_register(&clocksource_subsys, NULL);
+	if (error)
+		return error;
+
+	error = device_register(&device_clocksource);
+	if (error)
+		return error;
+
+	error = device_create_file(&device_clocksource,
+				   &dev_attr_current_clocksource);
+	if (error)
+		return error;
+
+	return device_create_file(&device_clocksource,
+				  &dev_attr_available_clocksource);
+}
+
+device_initcall(init_clocksource_sysfs);
+#endif /* CONFIG_SYSFS */
+
+/**
+ * boot_override_clocksource - boot clock override
+ * @str:	override name
+ *
+ * Takes a clocksource= boot argument and uses it
+ * as the clocksource override name. Always returns 1.
+ */
+static int __init boot_override_clocksource(char *str)
+{
+	mutex_lock(&clocksource_mutex);
+	/* strlcpy() truncates names that do not fit into override_name. */
+	if (str != NULL)
+		strlcpy(override_name, str, sizeof(override_name));
+	mutex_unlock(&clocksource_mutex);
+
+	return 1;
+}
+
+__setup("clocksource=", boot_override_clocksource);
+
+/**
+ * boot_override_clock - Compatibility layer for deprecated boot option
+ * @str:	override name
+ *
+ * DEPRECATED! Takes a clock= boot argument and uses it
+ * as the clocksource override name. The old name "pmtmr" is mapped to
+ * "acpi_pm". Returns the result of boot_override_clocksource().
+ */
+static int __init boot_override_clock(char *str)
+{
+	/* Both messages carry an explicit log level, like the other warnings in this file. */
+	if (!strcmp(str, "pmtmr")) {
+		printk(KERN_WARNING "Warning: clock=pmtmr is deprecated. "
+			"Use clocksource=acpi_pm.\n");
+		return boot_override_clocksource("acpi_pm");
+	}
+	printk(KERN_WARNING "Warning! clock= boot option is deprecated. "
+		"Use clocksource=xyz\n");
+	return boot_override_clocksource(str);
+}
+
+__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 00000000..a470154e
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,97 @@
+/***********************************************************************
+* linux/kernel/time/jiffies.c
+*
+* This file contains the jiffies based clocksource.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+************************************************************************/
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "tick-internal.h"
+
+/* The Jiffies based clocksource is the lowest common
+ * denominator clock source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accurately tick at the
+ * requested HZ value. It is also not recommended
+ * for "tick-less" systems.
+ */
+#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
+
+/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
+ * conversion, the .shift value could be zero. However
+ * this would make NTP adjustments impossible as they are
+ * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
+ * shift both the numerator and denominator the same
+ * amount, and give ntp adjustments in units of 1/2^8
+ *
+ * The value 8 is somewhat carefully chosen, as anything
+ * larger can result in overflows. NSEC_PER_JIFFY grows as
+ * HZ shrinks, so values greater than 8 overflow 32bits when
+ * HZ=100.
+ */
+#define JIFFIES_SHIFT 8
+
+/* ->read() hook of the jiffies clocksource: report the jiffies counter. */
+static cycle_t jiffies_read(struct clocksource *cs)
+{
+	cycle_t now = (cycle_t) jiffies;
+
+	return now;
+}
+
+/*
+ * The jiffies clocksource. It is registered by init_jiffies_clocksource()
+ * and handed out by clocksource_default_clock().
+ */
+struct clocksource clocksource_jiffies = {
+ .name = "jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .read = jiffies_read,
+ .mask = 0xffffffff, /*32bits*/
+ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+};
+
+#if (BITS_PER_LONG < 64)
+/*
+ * Return a snapshot of jiffies_64. The read is repeated until
+ * read_seqretry() reports that no xtime_lock writer interfered.
+ */
+u64 get_jiffies_64(void)
+{
+	unsigned long seq;
+	u64 snapshot;
+
+	for (;;) {
+		seq = read_seqbegin(&xtime_lock);
+		snapshot = jiffies_64;
+		if (!read_seqretry(&xtime_lock, seq))
+			break;
+	}
+
+	return snapshot;
+}
+EXPORT_SYMBOL(get_jiffies_64);
+#endif
+
+EXPORT_SYMBOL(jiffies);
+
+/*
+ * Register clocksource_jiffies at core_initcall time; returns the
+ * result of clocksource_register().
+ */
+static int __init init_jiffies_clocksource(void)
+{
+ return clocksource_register(&clocksource_jiffies);
+}
+
+core_initcall(init_jiffies_clocksource);
+
+/*
+ * Default clocksource: the jiffies clocksource. The definition is
+ * __weak, so another definition of this symbol can replace it.
+ */
+struct clocksource * __init __weak clocksource_default_clock(void)
+{
+ return &clocksource_jiffies;
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
new file mode 100644
index 00000000..e8c86717
--- /dev/null
+++ b/kernel/time/ntp.c
@@ -0,0 +1,965 @@
+/*
+ * NTP state machine interfaces and logic.
+ *
+ * This code was mainly moved from kernel/timer.c and kernel/time.c
+ * Please see those files for relevant copyright info and historical
+ * changelogs.
+ */
+#include <linux/capability.h>
+#include <linux/clocksource.h>
+#include <linux/workqueue.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/math64.h>
+#include <linux/timex.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
+
+/*
+ * NTP timekeeping variables:
+ */
+
+/* Spinlock taken around accesses to the NTP state in this file. */
+DEFINE_SPINLOCK(ntp_lock);
+
+
+/* USER_HZ period (usecs): */
+unsigned long tick_usec = TICK_USEC;
+
+/* ACTHZ period (nsecs): */
+unsigned long tick_nsec;
+
+/*
+ * tick_length_base is recomputed by ntp_update_frequency(); tick_length
+ * is reset to it in second_overflow() and then adjusted by the pending
+ * offset and adjtime() corrections. Both are scaled by NTP_SCALE_SHIFT.
+ */
+static u64 tick_length;
+static u64 tick_length_base;
+
+/* Largest adjtime() correction second_overflow() applies in one call. */
+#define MAX_TICKADJ 500LL /* usecs */
+#define MAX_TICKADJ_SCALED \
+ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
+
+/*
+ * phase-lock loop variables
+ */
+
+/*
+ * clock synchronization status
+ *
+ * (TIME_ERROR prevents overwriting the CMOS clock)
+ */
+static int time_state = TIME_OK;
+
+/* clock status bits: */
+static int time_status = STA_UNSYNC;
+
+/* TAI offset (secs): */
+static long time_tai;
+
+/* time adjustment (nsecs): */
+static s64 time_offset;
+
+/* pll time constant: */
+static long time_constant = 2;
+
+/* maximum error (usecs): */
+static long time_maxerror = NTP_PHASE_LIMIT;
+
+/* estimated error (usecs): */
+static long time_esterror = NTP_PHASE_LIMIT;
+
+/* frequency offset (scaled nsecs/secs): */
+static s64 time_freq;
+
+/* time at last adjustment (secs): */
+static long time_reftime;
+
+/* outstanding adjtime() correction, consumed by second_overflow(): */
+static long time_adjust;
+
+/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
+static s64 ntp_tick_adj;
+
+#ifdef CONFIG_NTP_PPS
+
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available. They establish the engineering parameters of the clock
+ * discipline loop when controlled by the PPS signal.
+ */
+#define PPS_VALID 10 /* PPS signal watchdog max (s) */
+#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
+#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
+#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
+#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
+ increase pps_shift or consecutive bad
+ intervals to decrease it */
+#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
+
+static int pps_valid; /* signal watchdog counter */
+static long pps_tf[3]; /* phase median filter */
+static long pps_jitter; /* current jitter (ns) */
+static struct timespec pps_fbase; /* beginning of the last freq interval */
+static int pps_shift; /* current interval duration (s) (shift) */
+static int pps_intcnt; /* interval counter */
+static s64 pps_freq; /* frequency offset (scaled ns/s) */
+static long pps_stabil; /* current stability (scaled ns/s) */
+
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt; /* calibration intervals */
+static long pps_jitcnt; /* jitter limit exceeded */
+static long pps_stbcnt; /* stability limit exceeded */
+static long pps_errcnt; /* calibration errors */
+
+
+/*
+ * Portion of the remaining offset to apply during the next second.
+ * When both STA_PPSTIME and STA_PPSSIGNAL are set the PPS kernel
+ * consumer compensates the whole phase error at once; otherwise the
+ * offset is reduced by SHIFT_PLL + time_constant bits.
+ */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+	int pps_active = (time_status & STA_PPSTIME) &&
+			 (time_status & STA_PPSSIGNAL);
+
+	if (pps_active)
+		return offset;
+
+	return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+/*
+ * Restart PPS frequency calibration: zero the interval counter and
+ * fall back to the shortest interval, since the PPS calibration
+ * interval may end surprisingly early.
+ */
+static inline void pps_reset_freq_interval(void)
+{
+	pps_intcnt = 0;
+	pps_shift = PPS_INTMIN;
+}
+
+/**
+ * pps_clear - Clears the PPS state variables
+ *
+ * Resets the calibration interval, the phase filter samples, the
+ * frequency interval base and the PPS frequency offset.
+ *
+ * Must be called while holding a write on the ntp_lock
+ */
+static inline void pps_clear(void)
+{
+	unsigned int i;
+
+	pps_reset_freq_interval();
+
+	for (i = 0; i < sizeof(pps_tf) / sizeof(pps_tf[0]); i++)
+		pps_tf[i] = 0;
+
+	pps_fbase.tv_nsec = 0;
+	pps_fbase.tv_sec = 0;
+	pps_freq = 0;
+}
+
+/*
+ * Count down the PPS watchdog: one more second has passed since the
+ * last PPS signal. Once the counter has run out, drop the PPS status
+ * bits and clear the PPS state to mark the signal as missing.
+ *
+ * Must be called while holding a write on the ntp_lock
+ */
+static inline void pps_dec_valid(void)
+{
+	if (pps_valid > 0) {
+		pps_valid--;
+		return;
+	}
+
+	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+			 STA_PPSWANDER | STA_PPSERROR);
+	pps_clear();
+}
+
+/* Overwrite the PPS frequency offset with @freq. */
+static inline void pps_set_freq(s64 freq)
+{
+ pps_freq = freq;
+}
+
+/*
+ * is_error_status - tell whether a set of STA_* bits describes an error
+ * @status: status word to examine
+ *
+ * Evaluates @status itself rather than the global time_status, matching
+ * the !CONFIG_NTP_PPS variant of this helper.
+ *
+ * Returns non-zero if @status indicates an error condition.
+ */
+static inline int is_error_status(int status)
+{
+	return (status & (STA_UNSYNC|STA_CLOCKERR))
+		/* PPS signal lost when either PPS time or
+		 * PPS frequency synchronization requested
+		 */
+		|| ((status & (STA_PPSFREQ|STA_PPSTIME))
+			&& !(status & STA_PPSSIGNAL))
+		/* PPS jitter exceeded when
+		 * PPS time synchronization requested */
+		|| ((status & (STA_PPSTIME|STA_PPSJITTER))
+			== (STA_PPSTIME|STA_PPSJITTER))
+		/* PPS wander exceeded or calibration error when
+		 * PPS frequency synchronization requested
+		 */
+		|| ((status & STA_PPSFREQ)
+			&& (status & (STA_PPSWANDER|STA_PPSERROR)));
+}
+
+/*
+ * Copy the PPS frequency, jitter, interval shift, stability and the
+ * four quality counters into @txc. Jitter is converted from nsec to
+ * usec unless STA_NANO is set.
+ */
+static inline void pps_fill_timex(struct timex *txc)
+{
+	long jitter = pps_jitter;
+
+	if (!(time_status & STA_NANO))
+		jitter /= NSEC_PER_USEC;
+
+	txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
+				   PPM_SCALE_INV, NTP_SCALE_SHIFT);
+	txc->jitter = jitter;
+	txc->shift = pps_shift;
+	txc->stabil = pps_stabil;
+
+	/* signal quality counters */
+	txc->calcnt = pps_calcnt;
+	txc->jitcnt = pps_jitcnt;
+	txc->stbcnt = pps_stbcnt;
+	txc->errcnt = pps_errcnt;
+}
+
+#else /* !CONFIG_NTP_PPS */
+
+/*
+ * Without PPS support the offset is always reduced by
+ * SHIFT_PLL + time_constant bits.
+ */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+ return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+/* No PPS state exists in this configuration, so these are no-ops. */
+static inline void pps_reset_freq_interval(void) {}
+static inline void pps_clear(void) {}
+static inline void pps_dec_valid(void) {}
+static inline void pps_set_freq(s64 freq) {}
+
+/* Only STA_UNSYNC and STA_CLOCKERR in @status count as errors here. */
+static inline int is_error_status(int status)
+{
+ return status & (STA_UNSYNC|STA_CLOCKERR);
+}
+
+/* Report all PPS fields of @txc as zero. */
+static inline void pps_fill_timex(struct timex *txc)
+{
+ /* PPS is not implemented, so these are zero */
+ txc->ppsfreq = 0;
+ txc->jitter = 0;
+ txc->shift = 0;
+ txc->stabil = 0;
+ txc->jitcnt = 0;
+ txc->calcnt = 0;
+ txc->errcnt = 0;
+ txc->stbcnt = 0;
+}
+
+#endif /* CONFIG_NTP_PPS */
+
+
+/**
+ * ntp_synced - Returns 1 if the NTP status is not UNSYNC
+ *
+ * Returns 0 while STA_UNSYNC is set in time_status.
+ */
+static inline int ntp_synced(void)
+{
+	return (time_status & STA_UNSYNC) == 0;
+}
+
+
+/*
+ * NTP methods:
+ */
+
+/*
+ * Update (tick_length, tick_length_base, tick_nsec), based
+ * on (tick_usec, ntp_tick_adj, time_freq):
+ *
+ * The length of one second is built in NTP_SCALE_SHIFT fixed point,
+ * then divided by HZ for tick_nsec and by NTP_INTERVAL_FREQ for the
+ * new tick_length_base.
+ */
+static void ntp_update_frequency(void)
+{
+ u64 second_length;
+ u64 new_base;
+
+ /* nominal second in nsec, shifted into NTP_SCALE_SHIFT fixed point */
+ second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
+ << NTP_SCALE_SHIFT;
+
+ /* both corrections are signed and already scaled */
+ second_length += ntp_tick_adj;
+ second_length += time_freq;
+
+ tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
+ new_base = div_u64(second_length, NTP_INTERVAL_FREQ);
+
+ /*
+ * Don't wait for the next second_overflow, apply
+ * the change to the tick length immediately:
+ */
+ tick_length += new_base - tick_length_base;
+ tick_length_base = new_base;
+}
+
+/*
+ * FLL contribution to the frequency adjustment.
+ *
+ * STA_MODE is cleared first. No FLL term is produced (0 is returned)
+ * when the update interval @secs is below MINSEC, or when STA_FLL is
+ * not set and @secs does not exceed MAXSEC. Otherwise STA_MODE is set
+ * and the scaled offset divided by @secs is returned.
+ */
+static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
+{
+	time_status &= ~STA_MODE;
+
+	if (secs < MINSEC ||
+	    (!(time_status & STA_FLL) && secs <= MAXSEC))
+		return 0;
+
+	time_status |= STA_MODE;
+
+	return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
+}
+
+/*
+ * ntp_update_offset - apply a new phase offset to the PLL/FLL state
+ * @offset: requested offset, in nsec when STA_NANO is set, else in usec
+ *
+ * Does nothing unless STA_PLL is set. Otherwise the offset is converted
+ * to nsec, clamped to +/-MAXPHASE, and used to update time_freq
+ * (clamped to +/-MAXFREQ_SCALED), time_reftime and time_offset.
+ */
+static void ntp_update_offset(long offset)
+{
+	s64 freq_adj;
+	s64 offset64;
+	long secs;
+
+	if (!(time_status & STA_PLL))
+		return;
+
+	if (!(time_status & STA_NANO)) {
+		/*
+		 * Limit the value to one second before converting to
+		 * nsec, so the multiplication cannot overflow a 32-bit
+		 * long. The MAXPHASE clamp below is tighter, so values
+		 * that did not overflow before give the same result.
+		 */
+		offset = min(offset, (long)USEC_PER_SEC);
+		offset = max(offset, -(long)USEC_PER_SEC);
+		offset *= NSEC_PER_USEC;
+	}
+
+	/*
+	 * Scale the phase adjustment and
+	 * clamp to the operating range.
+	 */
+	offset = min(offset, MAXPHASE);
+	offset = max(offset, -MAXPHASE);
+
+	/*
+	 * Select how the frequency is to be controlled
+	 * and in which mode (PLL or FLL).
+	 */
+	secs = get_seconds() - time_reftime;
+	if (unlikely(time_status & STA_FREQHOLD))
+		secs = 0;
+
+	time_reftime = get_seconds();
+
+	offset64 = offset;
+	freq_adj = ntp_update_offset_fll(offset64, secs);
+
+	/*
+	 * Clamp update interval to reduce PLL gain with low
+	 * sampling rate (e.g. intermittent network connection)
+	 * to avoid instability.
+	 */
+	if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
+		secs = 1 << (SHIFT_PLL + 1 + time_constant);
+
+	freq_adj += (offset64 * secs) <<
+			(NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
+
+	freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
+
+	time_freq = max(freq_adj, -MAXFREQ_SCALED);
+
+	time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
+}
+
+/**
+ * ntp_clear - Clears the NTP state variables
+ *
+ * Takes ntp_lock, cancels any pending adjtime() and phase offset,
+ * marks the clock unsynchronized, resets the error estimates,
+ * recomputes the tick length and clears the PPS state.
+ */
+void ntp_clear(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ntp_lock, flags);
+
+	/* stop active adjtime() and drop the pending phase offset */
+	time_adjust = 0;
+	time_offset = 0;
+
+	time_status |= STA_UNSYNC;
+	time_esterror = NTP_PHASE_LIMIT;
+	time_maxerror = NTP_PHASE_LIMIT;
+
+	/* recompute the base tick length, then restart from it */
+	ntp_update_frequency();
+	tick_length = tick_length_base;
+
+	/* Clear PPS state variables */
+	pps_clear();
+
+	spin_unlock_irqrestore(&ntp_lock, flags);
+}
+
+
+/*
+ * ntp_tick_length - read the current tick length
+ *
+ * Returns a copy of tick_length taken while holding ntp_lock. The
+ * local has the same unsigned type as tick_length and the return
+ * value, so no sign conversion happens on the way out.
+ */
+u64 ntp_tick_length(void)
+{
+	unsigned long flags;
+	u64 ret;
+
+	spin_lock_irqsave(&ntp_lock, flags);
+	ret = tick_length;
+	spin_unlock_irqrestore(&ntp_lock, flags);
+	return ret;
+}
+
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ * Also handles leap second processing, and returns leap offset
+ *
+ * @secs is tested against multiples of 86400 to find the end of the
+ * day. The return value is -1 when a leap second is inserted, 1 when
+ * one is deleted and 0 otherwise. The whole body runs under ntp_lock.
+ */
+int second_overflow(unsigned long secs)
+{
+ s64 delta;
+ int leap = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ntp_lock, flags);
+
+ /*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second.
+ */
+ switch (time_state) {
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+ case TIME_INS:
+ if (secs % 86400 == 0) {
+ leap = -1;
+ time_state = TIME_OOP;
+ time_tai++;
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
+ }
+ break;
+ case TIME_DEL:
+ if ((secs + 1) % 86400 == 0) {
+ leap = 1;
+ time_tai--;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
+ }
+ break;
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ break;
+ }
+
+
+ /* Bump the maxerror field */
+ time_maxerror += MAXFREQ / NSEC_PER_USEC;
+ if (time_maxerror > NTP_PHASE_LIMIT) {
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_status |= STA_UNSYNC;
+ }
+
+ /* Compute the phase adjustment for the next second */
+ tick_length = tick_length_base;
+
+ /* move one chunk of the pending offset into this second's tick length */
+ delta = ntp_offset_chunk(time_offset);
+ time_offset -= delta;
+ tick_length += delta;
+
+ /* Check PPS signal */
+ pps_dec_valid();
+
+ if (!time_adjust)
+ goto out;
+
+ /* adjtime() corrections are applied at most MAX_TICKADJ per call */
+ if (time_adjust > MAX_TICKADJ) {
+ time_adjust -= MAX_TICKADJ;
+ tick_length += MAX_TICKADJ_SCALED;
+ goto out;
+ }
+
+ if (time_adjust < -MAX_TICKADJ) {
+ time_adjust += MAX_TICKADJ;
+ tick_length -= MAX_TICKADJ_SCALED;
+ goto out;
+ }
+
+ /* the remainder fits into one step: apply it and finish the adjtime() */
+ tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
+ << NTP_SCALE_SHIFT;
+ time_adjust = 0;
+
+
+
+out:
+ spin_unlock_irqrestore(&ntp_lock, flags);
+
+ return leap;
+}
+
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+
+static void sync_cmos_clock(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
+
+/*
+ * Delayed-work handler that writes the system time to the persistent
+ * clock and re-arms itself.
+ *
+ * If we have an externally synchronized Linux clock, then update
+ * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+ * called as close as possible to 500 ms before the new second starts.
+ * This code is run on a timer. If the clock is set, that timer
+ * may not expire at the correct time. Thus, we adjust...
+ */
+static void sync_cmos_clock(struct work_struct *work)
+{
+	struct timespec now, next;
+	int fail = 1;
+
+	/*
+	 * Not synced, exit, do not restart a timer (if one is
+	 * running, let it run out).
+	 */
+	if (!ntp_synced())
+		return;
+
+	getnstimeofday(&now);
+	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
+		fail = update_persistent_clock(now);
+
+	/* after a successful update wait 659 s, otherwise retry soon */
+	next.tv_sec = fail ? 0 : 659;
+
+	/* aim for the middle of a second */
+	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
+	if (next.tv_nsec <= 0)
+		next.tv_nsec += NSEC_PER_SEC;
+
+	if (next.tv_nsec >= NSEC_PER_SEC) {
+		next.tv_nsec -= NSEC_PER_SEC;
+		next.tv_sec++;
+	}
+
+	schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
+}
+
+/* Queue sync_cmos_work with a delay of 0. */
+static void notify_cmos_timer(void)
+{
+ schedule_delayed_work(&sync_cmos_work, 0);
+}
+
+#else
+static inline void notify_cmos_timer(void) { }
+#endif
+
+
+/*
+ * Propagate a new txc->status value into the NTP state:
+ *
+ * The two PLL transition checks below compare the old time_status
+ * with the requested status, so they must run before time_status is
+ * overwritten at the end. @ts is not used in this function.
+ */
+static inline void process_adj_status(struct timex *txc, struct timespec *ts)
+{
+ /* PLL is being switched off: restart from an unsynchronized state */
+ if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
+ time_state = TIME_OK;
+ time_status = STA_UNSYNC;
+ /* restart PPS frequency calibration */
+ pps_reset_freq_interval();
+ }
+
+ /*
+ * If we turn on PLL adjustments then reset the
+ * reference time to current time.
+ */
+ if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
+ time_reftime = get_seconds();
+
+ /* only set allowed bits */
+ time_status &= STA_RONLY;
+ time_status |= txc->status & ~STA_RONLY;
+
+}
+/*
+ * Apply every mode bit set in txc->modes to the global NTP state.
+ *
+ * Called from do_adjtimex() with ntp_lock held, so we can access and
+ * modify all the global NTP state:
+ */
+static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
+{
+ if (txc->modes & ADJ_STATUS)
+ process_adj_status(txc, ts);
+
+ if (txc->modes & ADJ_NANO)
+ time_status |= STA_NANO;
+
+ if (txc->modes & ADJ_MICRO)
+ time_status &= ~STA_NANO;
+
+ if (txc->modes & ADJ_FREQUENCY) {
+ /* NOTE(review): txc->freq is scaled before it is range-checked */
+ time_freq = txc->freq * PPM_SCALE;
+ time_freq = min(time_freq, MAXFREQ_SCALED);
+ time_freq = max(time_freq, -MAXFREQ_SCALED);
+ /* update pps_freq */
+ pps_set_freq(time_freq);
+ }
+
+ if (txc->modes & ADJ_MAXERROR)
+ time_maxerror = txc->maxerror;
+
+ if (txc->modes & ADJ_ESTERROR)
+ time_esterror = txc->esterror;
+
+ if (txc->modes & ADJ_TIMECONST) {
+ time_constant = txc->constant;
+ if (!(time_status & STA_NANO))
+ time_constant += 4;
+ /* keep the time constant within [0, MAXTC] */
+ time_constant = min(time_constant, (long)MAXTC);
+ time_constant = max(time_constant, 0l);
+ }
+
+ /* ADJ_TAI takes its value from txc->constant as well */
+ if (txc->modes & ADJ_TAI && txc->constant > 0)
+ time_tai = txc->constant;
+
+ if (txc->modes & ADJ_OFFSET)
+ ntp_update_offset(txc->offset);
+
+ if (txc->modes & ADJ_TICK)
+ tick_usec = txc->tick;
+
+ /* any of these changes an input of the tick length computation */
+ if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+ ntp_update_frequency();
+}
+
+/*
+ * adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ *
+ * @txc is both input (modes and the values they select) and output
+ * (the current NTP state is written back into it).
+ *
+ * Returns -EINVAL or -EPERM when validation fails, the error from
+ * timekeeping_inject_offset() for a failed ADJ_SETOFFSET, otherwise
+ * time_state, or TIME_ERROR when is_error_status() flags time_status.
+ */
+int do_adjtimex(struct timex *txc)
+{
+ struct timespec ts;
+ int result;
+
+ /* Validate the data before disabling interrupts */
+ if (txc->modes & ADJ_ADJTIME) {
+ /* singleshot must not be used with any other mode bits */
+ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
+ return -EINVAL;
+ if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+ !capable(CAP_SYS_TIME))
+ return -EPERM;
+ } else {
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ /*
+ * if the quartz is off by more than 10% then
+ * something is VERY wrong!
+ */
+ if (txc->modes & ADJ_TICK &&
+ (txc->tick < 900000/USER_HZ ||
+ txc->tick > 1100000/USER_HZ))
+ return -EINVAL;
+ }
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ struct timespec delta;
+ delta.tv_sec = txc->time.tv_sec;
+ delta.tv_nsec = txc->time.tv_usec;
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+ /*
+ * NOTE(review): tv_usec is scaled to nsec with no range check
+ * visible here; presumably timekeeping_inject_offset() validates
+ * the result. Confirm, and check the multiply for overflow
+ * where long is 32 bits.
+ */
+ if (!(txc->modes & ADJ_NANO))
+ delta.tv_nsec *= 1000;
+ result = timekeeping_inject_offset(&delta);
+ if (result)
+ return result;
+ }
+
+ /* sampled before ntp_lock is taken; reported back in txc->time below */
+ getnstimeofday(&ts);
+
+ spin_lock_irq(&ntp_lock);
+
+ if (txc->modes & ADJ_ADJTIME) {
+ long save_adjust = time_adjust;
+
+ if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+ /* adjtime() is independent from ntp_adjtime() */
+ time_adjust = txc->offset;
+ ntp_update_frequency();
+ }
+ /* report the adjustment that was pending before this call */
+ txc->offset = save_adjust;
+ } else {
+
+ /* If there are input parameters, then process them: */
+ if (txc->modes)
+ process_adjtimex_modes(txc, &ts);
+
+ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+ NTP_SCALE_SHIFT);
+ if (!(time_status & STA_NANO))
+ txc->offset /= NSEC_PER_USEC;
+ }
+
+ result = time_state; /* mostly `TIME_OK' */
+ /* check for errors */
+ if (is_error_status(time_status))
+ result = TIME_ERROR;
+
+ txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+ PPM_SCALE_INV, NTP_SCALE_SHIFT);
+ txc->maxerror = time_maxerror;
+ txc->esterror = time_esterror;
+ txc->status = time_status;
+ txc->constant = time_constant;
+ txc->precision = 1;
+ txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
+ txc->tick = tick_usec;
+ txc->tai = time_tai;
+
+ /* fill PPS status fields */
+ pps_fill_timex(txc);
+
+ spin_unlock_irq(&ntp_lock);
+
+ txc->time.tv_sec = ts.tv_sec;
+ txc->time.tv_usec = ts.tv_nsec;
+ /* NOTE(review): time_status is read here after ntp_lock was dropped */
+ if (!(time_status & STA_NANO))
+ txc->time.tv_usec /= NSEC_PER_USEC;
+
+ notify_cmos_timer();
+
+ return result;
+}
+
+#ifdef CONFIG_NTP_PPS
+
+/*
+ * struct pps_normtime is really good old struct timespec, but it is
+ * semantically different (which is why it was invented):
+ * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
+ * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC)
+ */
+struct pps_normtime {
+ __kernel_time_t sec; /* seconds */
+ long nsec; /* nanoseconds */
+};
+
+/*
+ * Convert @ts into a pps_normtime whose nsec part lies in the
+ * ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval: an nsec value
+ * above half a second is carried into the next second.
+ */
+static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
+{
+	struct pps_normtime norm;
+
+	norm.sec = ts.tv_sec;
+	norm.nsec = ts.tv_nsec;
+
+	if (norm.nsec > (NSEC_PER_SEC >> 1)) {
+		norm.sec++;
+		norm.nsec -= NSEC_PER_SEC;
+	}
+
+	return norm;
+}
+
+/*
+ * Return the current phase correction (the newest filter sample) and
+ * store the jitter, i.e. the absolute difference between the two
+ * newest samples, in *@jitter.
+ */
+static inline long pps_phase_filter_get(long *jitter)
+{
+	long diff = pps_tf[0] - pps_tf[1];
+
+	*jitter = (diff < 0) ? -diff : diff;
+
+	/* TODO: test various filters */
+	return pps_tf[0];
+}
+
+/*
+ * Push @err into the phase filter: the existing samples move one slot
+ * towards the end (the oldest is dropped) and @err becomes slot 0.
+ */
+static inline void pps_phase_filter_add(long err)
+{
+	int i;
+
+	for (i = 2; i > 0; i--)
+		pps_tf[i] = pps_tf[i - 1];
+
+	pps_tf[0] = err;
+}
+
+/*
+ * Note one unstable calibration interval. After PPS_INTCOUNT of them
+ * the interval length is halved (pps_shift is decremented), but never
+ * below PPS_INTMIN; the counter saturates at -PPS_INTCOUNT.
+ */
+static inline void pps_dec_freq_interval(void)
+{
+	if (--pps_intcnt > -PPS_INTCOUNT)
+		return;
+
+	pps_intcnt = -PPS_INTCOUNT;
+	if (pps_shift > PPS_INTMIN) {
+		pps_intcnt = 0;
+		pps_shift--;
+	}
+}
+
+/*
+ * Note one stable calibration interval. After PPS_INTCOUNT of them
+ * the interval length is doubled (pps_shift is incremented), but never
+ * above PPS_INTMAX; the counter saturates at PPS_INTCOUNT.
+ */
+static inline void pps_inc_freq_interval(void)
+{
+	if (++pps_intcnt < PPS_INTCOUNT)
+		return;
+
+	pps_intcnt = PPS_INTCOUNT;
+	if (pps_shift < PPS_INTMAX) {
+		pps_intcnt = 0;
+		pps_shift++;
+	}
+}
+
+/* update clock frequency based on MONOTONIC_RAW clock PPS signal
+ * timestamps
+ *
+ * At the end of the calibration interval the difference between the
+ * first and last MONOTONIC_RAW clock timestamps divided by the length
+ * of the interval becomes the frequency update. If the interval was
+ * too long, the data are discarded.
+ *
+ * @freq_norm is the normalized length of the interval that just ended.
+ *
+ * Returns the difference between old and new frequency values, or 0
+ * when the interval was rejected as too long.
+ */
+static long hardpps_update_freq(struct pps_normtime freq_norm)
+{
+ long delta, delta_mod;
+ s64 ftemp;
+
+ /* check if the frequency interval was too long */
+ if (freq_norm.sec > (2 << pps_shift)) {
+ time_status |= STA_PPSERROR;
+ pps_errcnt++;
+ pps_dec_freq_interval();
+ pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
+ freq_norm.sec);
+ return 0;
+ }
+
+ /* here the raw frequency offset and wander (stability) is
+ * calculated. If the wander is less than the wander threshold
+ * the interval is increased; otherwise it is decreased.
+ */
+ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
+ freq_norm.sec);
+ delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
+ pps_freq = ftemp;
+ if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
+ pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ pps_dec_freq_interval();
+ } else { /* good sample */
+ pps_inc_freq_interval();
+ }
+
+ /* the stability metric is calculated as the average of recent
+ * frequency changes, but is used only for performance
+ * monitoring
+ */
+ delta_mod = delta;
+ if (delta_mod < 0)
+ delta_mod = -delta_mod;
+ pps_stabil += (div_s64(((s64)delta_mod) <<
+ (NTP_SCALE_SHIFT - SHIFT_USEC),
+ NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
+
+ /* if enabled, the system clock frequency is updated */
+ if ((time_status & STA_PPSFREQ) != 0 &&
+ (time_status & STA_FREQHOLD) == 0) {
+ time_freq = pps_freq;
+ ntp_update_frequency();
+ }
+
+ return delta;
+}
+
+/* correct REALTIME clock phase error against PPS signal
+ *
+ * @error is the measured phase error; its negation is fed into the
+ * phase filter as the correction sample.
+ */
+static void hardpps_update_phase(long error)
+{
+ long correction = -error;
+ long jitter;
+
+ /* add the sample to the median filter */
+ pps_phase_filter_add(correction);
+ correction = pps_phase_filter_get(&jitter);
+
+ /* Nominal jitter is due to PPS signal noise. If it exceeds the
+ * threshold, the sample is discarded; otherwise, if so enabled,
+ * the time offset is updated.
+ */
+ if (jitter > (pps_jitter << PPS_POPCORN)) {
+ pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+ jitter, (pps_jitter << PPS_POPCORN));
+ time_status |= STA_PPSJITTER;
+ pps_jitcnt++;
+ } else if (time_status & STA_PPSTIME) {
+ /* correct the time using the phase offset */
+ time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
+ NTP_INTERVAL_FREQ);
+ /* cancel running adjtime() */
+ time_adjust = 0;
+ }
+ /* update jitter */
+ pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
+}
+
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS signal arrival in order to
+ * discipline the CPU clock oscillator to the PPS signal. It takes two
+ * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
+ * is used to correct clock phase error and the latter is used to
+ * correct the frequency.
+ *
+ * This code is based on David Mills's reference nanokernel
+ * implementation. It was mostly rewritten but keeps the same idea.
+ *
+ * Everything after the normalization of @phase_ts runs under ntp_lock.
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+ struct pps_normtime pts_norm, freq_norm;
+ unsigned long flags;
+
+ pts_norm = pps_normalize_ts(*phase_ts);
+
+ spin_lock_irqsave(&ntp_lock, flags);
+
+ /* clear the error bits, they will be set again if needed */
+ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+
+ /* indicate signal presence */
+ time_status |= STA_PPSSIGNAL;
+ pps_valid = PPS_VALID;
+
+ /* when called for the first time,
+ * just start the frequency interval */
+ if (unlikely(pps_fbase.tv_sec == 0)) {
+ pps_fbase = *raw_ts;
+ spin_unlock_irqrestore(&ntp_lock, flags);
+ return;
+ }
+
+ /* ok, now we have a base for frequency calculation */
+ freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+
+ /* check that the signal is in the range
+ * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
+ if ((freq_norm.sec == 0) ||
+ (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
+ (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
+ time_status |= STA_PPSJITTER;
+ /* restart the frequency calibration interval */
+ pps_fbase = *raw_ts;
+ spin_unlock_irqrestore(&ntp_lock, flags);
+ pr_err("hardpps: PPSJITTER: bad pulse\n");
+ return;
+ }
+
+ /* signal is ok */
+
+ /* check if the current frequency interval is finished */
+ if (freq_norm.sec >= (1 << pps_shift)) {
+ pps_calcnt++;
+ /* restart the frequency calibration interval */
+ pps_fbase = *raw_ts;
+ /* the returned frequency delta is not used here */
+ hardpps_update_freq(freq_norm);
+ }
+
+ hardpps_update_phase(pts_norm.nsec);
+
+ spin_unlock_irqrestore(&ntp_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+
+#endif /* CONFIG_NTP_PPS */
+
+/*
+ * Handle the ntp_tick_adj= boot parameter: parse @str as an integer
+ * (base auto-detected) and store it in ntp_tick_adj, scaled up by
+ * NTP_SCALE_SHIFT. Always returns 1.
+ */
+static int __init ntp_tick_adj_setup(char *str)
+{
+	s64 adj = simple_strtol(str, NULL, 0);
+
+	ntp_tick_adj = adj << NTP_SCALE_SHIFT;
+
+	return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
+
+/* Boot-time initialization: put the NTP state into its cleared state. */
+void __init ntp_init(void)
+{
+ ntp_clear();
+}
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 00000000..ce033c7a
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,446 @@
+/*
+ * posix-clock.c - support for dynamic clock devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/file.h>
+#include <linux/posix-clock.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+static void delete_clock(struct kref *kref);
+
+/*
+ * Look up the posix_clock attached to 'fp' and take its rwsem for
+ * reading. If the clock has been marked as a zombie (old and stale) the
+ * semaphore is dropped again and NULL is returned. On success the
+ * semaphore stays held until put_posix_clock() is called.
+ */
+static struct posix_clock *get_posix_clock(struct file *fp)
+{
+	struct posix_clock *clock = fp->private_data;
+
+	down_read(&clock->rwsem);
+
+	if (clock->zombie) {
+		up_read(&clock->rwsem);
+		return NULL;
+	}
+
+	return clock;
+}
+
+/* Counterpart of get_posix_clock(): drop the read side of clk->rwsem. */
+static void put_posix_clock(struct posix_clock *clk)
+{
+	up_read(&clk->rwsem);
+}
+
+/*
+ * read() on a clock device: forwarded to the driver's ops.read.
+ * Returns -ENODEV for a stale clock and -EINVAL when the driver has no
+ * read method.
+ */
+static ssize_t posix_clock_read(struct file *fp, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int ret;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.read)
+		ret = clk->ops.read(clk, fp->f_flags, buf, count);
+	else
+		ret = -EINVAL;
+
+	put_posix_clock(clk);
+
+	return ret;
+}
+
+/*
+ * poll() on a clock device: forwarded to the driver's ops.poll.
+ *
+ * The return value is an event mask, not an errno. A stale clock is
+ * therefore reported as POLLERR; a negative errno converted to unsigned
+ * would show up as a mask with nearly every event bit set. Without a
+ * driver poll method the mask is empty.
+ */
+static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	unsigned int result = 0;
+
+	if (!clk)
+		return POLLERR;
+
+	if (clk->ops.poll)
+		result = clk->ops.poll(clk, fp, wait);
+
+	put_posix_clock(clk);
+
+	return result;
+}
+
+/*
+ * fasync() on a clock device: forwarded to the driver's ops.fasync.
+ * Returns -ENODEV for a stale clock and 0 when the driver has no fasync
+ * method.
+ */
+static int posix_clock_fasync(int fd, struct file *fp, int on)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int ret;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.fasync)
+		ret = clk->ops.fasync(clk, fd, fp, on);
+	else
+		ret = 0;
+
+	put_posix_clock(clk);
+
+	return ret;
+}
+
+/*
+ * mmap() on a clock device: forwarded to the driver's ops.mmap.
+ * Returns -ENODEV both for a stale clock and for a driver without an
+ * mmap method.
+ */
+static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int ret;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.mmap)
+		ret = clk->ops.mmap(clk, vma);
+	else
+		ret = -ENODEV;
+
+	put_posix_clock(clk);
+
+	return ret;
+}
+
+/*
+ * ioctl() on a clock device: forwarded to the driver's ops.ioctl.
+ * Returns -ENODEV for a stale clock and -ENOTTY when the driver has no
+ * ioctl method.
+ */
+static long posix_clock_ioctl(struct file *fp,
+			      unsigned int cmd, unsigned long arg)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int ret;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.ioctl)
+		ret = clk->ops.ioctl(clk, cmd, arg);
+	else
+		ret = -ENOTTY;
+
+	put_posix_clock(clk);
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * compat ioctl() on a clock device: handed to the same driver ops.ioctl
+ * method as the native ioctl path. Returns -ENODEV for a stale clock
+ * and -ENOTTY when the driver has no ioctl method.
+ */
+static long posix_clock_compat_ioctl(struct file *fp,
+				     unsigned int cmd, unsigned long arg)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int ret;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.ioctl)
+		ret = clk->ops.ioctl(clk, cmd, arg);
+	else
+		ret = -ENOTTY;
+
+	put_posix_clock(clk);
+
+	return ret;
+}
+#endif
+
+/*
+ * open() on a clock device. The posix_clock is found through the cdev
+ * embedded in it. A zombie clock yields -ENODEV; otherwise the driver's
+ * optional ops.open decides. On success a reference on the clock is
+ * taken and the clock is stored in fp->private_data.
+ */
+static int posix_clock_open(struct inode *inode, struct file *fp)
+{
+	struct posix_clock *clk =
+		container_of(inode->i_cdev, struct posix_clock, cdev);
+	int err;
+
+	down_read(&clk->rwsem);
+
+	if (clk->zombie)
+		err = -ENODEV;
+	else if (clk->ops.open)
+		err = clk->ops.open(clk, fp->f_mode);
+	else
+		err = 0;
+
+	if (!err) {
+		kref_get(&clk->kref);
+		fp->private_data = clk;
+	}
+
+	up_read(&clk->rwsem);
+	return err;
+}
+
+/*
+ * release() on a clock device: call the driver's optional ops.release,
+ * drop the reference taken in posix_clock_open() and detach the clock
+ * from the file. Returns the driver's result, or 0 without a driver
+ * release method.
+ */
+static int posix_clock_release(struct inode *inode, struct file *fp)
+{
+	struct posix_clock *clk = fp->private_data;
+	int ret = 0;
+
+	if (clk->ops.release)
+		ret = clk->ops.release(clk);
+
+	kref_put(&clk->kref, delete_clock);
+	fp->private_data = NULL;
+
+	return ret;
+}
+
+/* File operations installed on every clock character device. */
+static const struct file_operations posix_clock_file_operations = {
+	.owner		= THIS_MODULE,
+	.open		= posix_clock_open,
+	.release	= posix_clock_release,
+	.llseek		= no_llseek,
+	.read		= posix_clock_read,
+	.poll		= posix_clock_poll,
+	.fasync		= posix_clock_fasync,
+	.mmap		= posix_clock_mmap,
+	.unlocked_ioctl	= posix_clock_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= posix_clock_compat_ioctl,
+#endif
+};
+
+/*
+ * Register 'clk' as a character device under 'devid'. The reference
+ * count and the rwsem of the clock are initialised here. Returns the
+ * result of cdev_add().
+ */
+int posix_clock_register(struct posix_clock *clk, dev_t devid)
+{
+	kref_init(&clk->kref);
+	init_rwsem(&clk->rwsem);
+
+	cdev_init(&clk->cdev, &posix_clock_file_operations);
+	clk->cdev.owner = clk->ops.owner;
+
+	return cdev_add(&clk->cdev, devid, 1);
+}
+EXPORT_SYMBOL_GPL(posix_clock_register);
+
+/*
+ * kref release callback: runs when the last reference to the clock is
+ * dropped and calls the clock's optional release hook.
+ */
+static void delete_clock(struct kref *kref)
+{
+	struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
+
+	if (!clk->release)
+		return;
+
+	clk->release(clk);
+}
+
+/*
+ * Remove the character device of 'clk', mark the clock as a zombie
+ * under the write side of its rwsem, and drop the reference set up by
+ * posix_clock_register().
+ */
+void posix_clock_unregister(struct posix_clock *clk)
+{
+	cdev_del(&clk->cdev);
+
+	/* readers holding the rwsem finish before the flag is set */
+	down_write(&clk->rwsem);
+	clk->zombie = true;
+	up_write(&clk->rwsem);
+
+	kref_put(&clk->kref, delete_clock);
+}
+EXPORT_SYMBOL_GPL(posix_clock_unregister);
+
+struct posix_clock_desc {
+ struct file *fp; /* file obtained with fget() in get_clock_desc() */
+ struct posix_clock *clk; /* clock from get_posix_clock(fp), rwsem read-held */
+};
+
+/*
+ * Resolve a dynamic clock id to its file and posix_clock.
+ *
+ * Returns 0 with cd->fp and cd->clk set up; the caller must then call
+ * put_clock_desc(). Returns -EINVAL when the fd in 'id' is not an open
+ * clock device and -ENODEV when the clock is stale. On failure the file
+ * reference is dropped again.
+ */
+static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
+{
+	struct file *file = fget(CLOCKID_TO_FD(id));
+
+	if (!file)
+		return -EINVAL;
+
+	/* only files opened through posix_clock_open() qualify */
+	if (file->f_op->open != posix_clock_open || !file->private_data) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	cd->fp = file;
+	cd->clk = get_posix_clock(file);
+	if (!cd->clk) {
+		fput(file);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/* Undo get_clock_desc(): release the clock, then the file reference. */
+static void put_clock_desc(struct posix_clock_desc *cd)
+{
+	put_posix_clock(cd->clk);
+	fput(cd->fp);
+}
+
+/*
+ * Adjust a dynamic clock. The file behind 'id' must have been opened
+ * for writing (-EACCES otherwise) and the driver must provide
+ * clock_adjtime (-EOPNOTSUPP otherwise).
+ */
+static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+{
+	struct posix_clock_desc cd;
+	int err = get_clock_desc(id, &cd);
+
+	if (err)
+		return err;
+
+	if (!(cd.fp->f_mode & FMODE_WRITE))
+		err = -EACCES;
+	else if (!cd.clk->ops.clock_adjtime)
+		err = -EOPNOTSUPP;
+	else
+		err = cd.clk->ops.clock_adjtime(cd.clk, tx);
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+/*
+ * Read a dynamic clock through the driver's clock_gettime method.
+ * Returns -EOPNOTSUPP when the driver does not provide one.
+ */
+static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err = get_clock_desc(id, &cd);
+
+	if (err)
+		return err;
+
+	if (!cd.clk->ops.clock_gettime)
+		err = -EOPNOTSUPP;
+	else
+		err = cd.clk->ops.clock_gettime(cd.clk, ts);
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+/*
+ * Query the resolution of a dynamic clock through the driver's
+ * clock_getres method. Returns -EOPNOTSUPP when the driver does not
+ * provide one.
+ */
+static int pc_clock_getres(clockid_t id, struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err = get_clock_desc(id, &cd);
+
+	if (err)
+		return err;
+
+	if (!cd.clk->ops.clock_getres)
+		err = -EOPNOTSUPP;
+	else
+		err = cd.clk->ops.clock_getres(cd.clk, ts);
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+/*
+ * Set a dynamic clock. The file behind 'id' must have been opened for
+ * writing (-EACCES otherwise) and the driver must provide clock_settime
+ * (-EOPNOTSUPP otherwise).
+ */
+static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err = get_clock_desc(id, &cd);
+
+	if (err)
+		return err;
+
+	if (!(cd.fp->f_mode & FMODE_WRITE))
+		err = -EACCES;
+	else if (!cd.clk->ops.clock_settime)
+		err = -EOPNOTSUPP;
+	else
+		err = cd.clk->ops.clock_settime(cd.clk, ts);
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+/*
+ * Create a timer on the dynamic clock named by kit->it_clock through
+ * the driver's timer_create method. Returns -EOPNOTSUPP when the driver
+ * does not provide one.
+ */
+static int pc_timer_create(struct k_itimer *kit)
+{
+	struct posix_clock_desc cd;
+	int err = get_clock_desc(kit->it_clock, &cd);
+
+	if (err)
+		return err;
+
+	if (!cd.clk->ops.timer_create)
+		err = -EOPNOTSUPP;
+	else
+		err = cd.clk->ops.timer_create(cd.clk, kit);
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+/*
+ * Delete a timer on the dynamic clock named by kit->it_clock through
+ * the driver's timer_delete method. Returns -EOPNOTSUPP when the driver
+ * does not provide one.
+ */
+static int pc_timer_delete(struct k_itimer *kit)
+{
+	struct posix_clock_desc cd;
+	int err = get_clock_desc(kit->it_clock, &cd);
+
+	if (err)
+		return err;
+
+	if (!cd.clk->ops.timer_delete)
+		err = -EOPNOTSUPP;
+	else
+		err = cd.clk->ops.timer_delete(cd.clk, kit);
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+/*
+ * Read a timer on the dynamic clock named by kit->it_clock. Nothing is
+ * done, and 'ts' is left untouched, when the clock cannot be resolved
+ * or the driver has no timer_gettime method.
+ */
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+{
+	struct posix_clock_desc cd;
+
+	if (get_clock_desc(kit->it_clock, &cd) != 0)
+		return;
+
+	if (cd.clk->ops.timer_gettime)
+		cd.clk->ops.timer_gettime(cd.clk, kit, ts);
+
+	put_clock_desc(&cd);
+}
+
+/*
+ * Arm a timer on the dynamic clock named by kit->it_clock through the
+ * driver's timer_settime method. Returns -EOPNOTSUPP when the driver
+ * does not provide one.
+ */
+static int pc_timer_settime(struct k_itimer *kit, int flags,
+			    struct itimerspec *ts, struct itimerspec *old)
+{
+	struct posix_clock_desc cd;
+	int err = get_clock_desc(kit->it_clock, &cd);
+
+	if (err)
+		return err;
+
+	if (!cd.clk->ops.timer_settime)
+		err = -EOPNOTSUPP;
+	else
+		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+/* k_clock method table backed by the pc_* wrappers in this file. */
+struct k_clock clock_posix_dynamic = {
+	.clock_getres	= pc_clock_getres,
+	.clock_get	= pc_clock_gettime,
+	.clock_set	= pc_clock_settime,
+	.clock_adj	= pc_clock_adjtime,
+	.timer_create	= pc_timer_create,
+	.timer_del	= pc_timer_delete,
+	.timer_get	= pc_timer_gettime,
+	.timer_set	= pc_timer_settime,
+};
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 00000000..f1137556
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,629 @@
+/*
+ * linux/kernel/time/tick-broadcast.c
+ *
+ * This file contains functions which emulate a local clock-event
+ * device via a broadcast event source.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+
+#include "tick-internal.h"
+
+/*
+ * Broadcast support for broken x86 hardware, where the local apic
+ * timer stops in C3 state.
+ */
+
+static struct tick_device tick_broadcast_device;
+/* FIXME: Use cpumask_var_t. */
+static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
+static DECLARE_BITMAP(tmpmask, NR_CPUS);
+static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+static int tick_broadcast_force;
+
+#ifdef CONFIG_TICK_ONESHOT
+static void tick_broadcast_clear_oneshot(int cpu);
+#else
+static inline void tick_broadcast_clear_oneshot(int cpu) { }
+#endif
+
+/*
+ * Accessor for the broadcast tick device.
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_broadcast_device(void)
+{
+	return &tick_broadcast_device;
+}
+
+/* Accessor for the tick_broadcast_mask bitmap, viewed as a cpumask. */
+struct cpumask *tick_get_broadcast_mask(void)
+{
+	return to_cpumask(tick_broadcast_mask);
+}
+
+/*
+ * Start the device in periodic mode; a NULL device is ignored.
+ */
+static void tick_broadcast_start_periodic(struct clock_event_device *bc)
+{
+	if (!bc)
+		return;
+
+	tick_setup_periodic(bc, 1);
+}
+
+/*
+ * Check, if the device can be utilized as broadcast device.
+ *
+ * A device with CLOCK_EVT_FEAT_C3STOP is never taken, nor is one whose
+ * rating does not exceed that of the current broadcast device. Returns
+ * 1 when @dev was installed as the broadcast device, 0 otherwise.
+ */
+int tick_check_broadcast_device(struct clock_event_device *dev)
+{
+	struct clock_event_device *cur = tick_broadcast_device.evtdev;
+
+	if (dev->features & CLOCK_EVT_FEAT_C3STOP)
+		return 0;
+
+	if (cur && cur->rating >= dev->rating)
+		return 0;
+
+	clockevents_exchange_device(cur, dev);
+	tick_broadcast_device.evtdev = dev;
+
+	/* cpus are already waiting for broadcasts: start ticking now */
+	if (!cpumask_empty(tick_get_broadcast_mask()))
+		tick_broadcast_start_periodic(dev);
+
+	return 1;
+}
+
+/*
+ * Check, if the device is the broadcast device. A NULL device never is.
+ */
+int tick_is_broadcast_device(struct clock_event_device *dev)
+{
+	if (!dev)
+		return 0;
+
+	return tick_broadcast_device.evtdev == dev;
+}
+
+/*
+ * Check, if the device is dysfunctional and a place holder, which
+ * needs to be handled by the broadcast device.
+ *
+ * Returns 1 when @dev is such a placeholder and @cpu has been added to
+ * the broadcast mask, 0 otherwise. @cpu is used on both paths; the
+ * caller in tick-common.c passes the cpu it is running on.
+ */
+int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	if (!tick_device_is_functional(dev)) {
+		/*
+		 * Devices might be registered with both periodic and
+		 * oneshot mode disabled. This signals, that the device
+		 * needs to be operated from the broadcast device and is
+		 * a placeholder for the cpu local device.
+		 */
+		dev->event_handler = tick_handle_periodic;
+		cpumask_set_cpu(cpu, tick_get_broadcast_mask());
+		tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
+		ret = 1;
+	} else if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
+		/*
+		 * When the new device is not affected by the stop
+		 * feature and the cpu is marked in the broadcast mask
+		 * then clear the broadcast bit.
+		 */
+		cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+		tick_broadcast_clear_oneshot(cpu);
+	}
+
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+	return ret;
+}
+
+/*
+ * Broadcast the event to the cpus, which are set in the mask (mangled).
+ *
+ * The current cpu, if set in @mask, is removed from it and served by
+ * calling its event handler directly. The remaining cpus are reached
+ * through the broadcast function of the first remaining cpu's device.
+ */
+static void tick_do_broadcast(struct cpumask *mask)
+{
+	struct tick_device *td;
+	int self = smp_processor_id();
+
+	if (cpumask_test_cpu(self, mask)) {
+		cpumask_clear_cpu(self, mask);
+		td = &per_cpu(tick_cpu_device, self);
+		td->evtdev->event_handler(td->evtdev);
+	}
+
+	if (cpumask_empty(mask))
+		return;
+
+	/*
+	 * It might be necessary to actually check whether the devices
+	 * have different broadcast functions. For now, just use the
+	 * one of the first device. This works as long as we have this
+	 * misfeature only on x86 (lapic)
+	 */
+	td = &per_cpu(tick_cpu_device, cpumask_first(mask));
+	td->evtdev->broadcast(mask);
+}
+
+/*
+ * Periodic broadcast:
+ * - invoke the broadcast handlers of all online cpus which are set in
+ *   the broadcast mask
+ */
+static void tick_do_periodic_broadcast(void)
+{
+	struct cpumask *targets = to_cpumask(tmpmask);
+
+	raw_spin_lock(&tick_broadcast_lock);
+
+	cpumask_and(targets, cpu_online_mask, tick_get_broadcast_mask());
+	tick_do_broadcast(targets);
+
+	raw_spin_unlock(&tick_broadcast_lock);
+}
+
+/*
+ * Event handler for periodic broadcast ticks
+ */
+static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
+{
+	ktime_t next;
+
+	tick_do_periodic_broadcast();
+
+	/* A device in periodic mode rearms itself: nothing to program. */
+	if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+		return;
+
+	/*
+	 * Devices without periodic mode get the next period programmed
+	 * by hand. Start from dev->next_event and keep adding periods
+	 * while the event already expired; clockevents_program_event()
+	 * sets dev->next_event only when the event is really programmed
+	 * to the device. Every expired period is broadcast as well.
+	 */
+	next = ktime_add(dev->next_event, tick_period);
+	while (clockevents_program_event(dev, next, false)) {
+		tick_do_periodic_broadcast();
+		next = ktime_add(next, tick_period);
+	}
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state, where
+ * affected devices might stop
+ *
+ * Worker for tick_broadcast_on_off(). It operates on the tick device of
+ * the cpu it runs on and does nothing unless that device has
+ * CLOCK_EVT_FEAT_C3STOP set and is functional.
+ *
+ * BROADCAST_ON and BROADCAST_FORCE add the cpu to the broadcast mask
+ * and, if the broadcast device is in periodic mode, shut the cpu local
+ * device down; FORCE additionally sets tick_broadcast_force.
+ * BROADCAST_OFF removes the cpu from the mask again, unless
+ * tick_broadcast_force is set, and restarts the local periodic tick.
+ * Finally the broadcast device is shut down when the mask became empty
+ * and started when the mask became non-empty.
+ */
+static void tick_do_broadcast_on_off(unsigned long *reason)
+{
+ struct clock_event_device *bc, *dev;
+ struct tick_device *td;
+ unsigned long flags;
+ int cpu, bc_stopped;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ cpu = smp_processor_id();
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+ bc = tick_broadcast_device.evtdev;
+
+ /*
+ * Is the device not affected by the powerstate ?
+ */
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ goto out;
+
+ if (!tick_device_is_functional(dev))
+ goto out;
+
+ bc_stopped = cpumask_empty(tick_get_broadcast_mask());
+
+ switch (*reason) {
+ case CLOCK_EVT_NOTIFY_BROADCAST_ON:
+ case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
+ cpumask_set_cpu(cpu, tick_get_broadcast_mask());
+ if (tick_broadcast_device.mode ==
+ TICKDEV_MODE_PERIODIC)
+ clockevents_shutdown(dev);
+ }
+ if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
+ tick_broadcast_force = 1;
+ break;
+ case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
+ if (!tick_broadcast_force &&
+ cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
+ cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+ if (tick_broadcast_device.mode ==
+ TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(dev, 0);
+ }
+ break;
+ }
+
+ if (cpumask_empty(tick_get_broadcast_mask())) {
+ if (!bc_stopped)
+ clockevents_shutdown(bc);
+ } else if (bc_stopped) {
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
+ }
+out:
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state, where
+ * affected devices might stop.
+ *
+ * A request for a cpu that is not online is logged and ignored.
+ */
+void tick_broadcast_on_off(unsigned long reason, int *oncpu)
+{
+	if (cpumask_test_cpu(*oncpu, cpu_online_mask)) {
+		tick_do_broadcast_on_off(&reason);
+		return;
+	}
+
+	printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
+	       "offline CPU #%d\n", *oncpu);
+}
+
+/*
+ * Set the periodic handler depending on broadcast on/off
+ */
+void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+	dev->event_handler = broadcast ? tick_handle_periodic_broadcast :
+					 tick_handle_periodic;
+}
+
+/*
+ * Remove a CPU from broadcasting. In periodic mode the broadcast device
+ * is shut down as well once no cpu is left in the broadcast mask.
+ */
+void tick_shutdown_broadcast(unsigned int *cpup)
+{
+	struct clock_event_device *bc;
+	unsigned int cpu = *cpup;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	bc = tick_broadcast_device.evtdev;
+	cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+
+	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC &&
+	    bc && cpumask_empty(tick_get_broadcast_mask()))
+		clockevents_shutdown(bc);
+
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/* Suspend: shut the broadcast device down, if one is installed. */
+void tick_suspend_broadcast(void)
+{
+	struct clock_event_device *bc;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	bc = tick_broadcast_device.evtdev;
+	if (bc != NULL)
+		clockevents_shutdown(bc);
+
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Resume the broadcast device, if one is installed.
+ *
+ * In periodic mode the device is restarted when cpus are waiting for
+ * broadcasts, and the return value tells whether the current cpu is in
+ * the broadcast mask. In oneshot mode the result of
+ * tick_resume_broadcast_oneshot() is returned when the mask is not
+ * empty. In every other case 0 is returned.
+ */
+int tick_resume_broadcast(void)
+{
+	struct clock_event_device *bc;
+	unsigned long flags;
+	int broadcast = 0;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	bc = tick_broadcast_device.evtdev;
+	if (!bc)
+		goto unlock;
+
+	clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+
+	switch (tick_broadcast_device.mode) {
+	case TICKDEV_MODE_PERIODIC:
+		if (!cpumask_empty(tick_get_broadcast_mask()))
+			tick_broadcast_start_periodic(bc);
+		broadcast = cpumask_test_cpu(smp_processor_id(),
+					     tick_get_broadcast_mask());
+		break;
+	case TICKDEV_MODE_ONESHOT:
+		if (!cpumask_empty(tick_get_broadcast_mask()))
+			broadcast = tick_resume_broadcast_oneshot(bc);
+		break;
+	}
+
+unlock:
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+
+	return broadcast;
+}
+
+
+#ifdef CONFIG_TICK_ONESHOT
+
+/* FIXME: use cpumask_var_t. */
+static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
+
+/*
+ * Accessor for the tick_broadcast_oneshot_mask bitmap, viewed as a
+ * cpumask. Exposed for debugging: see timer_list.c
+ */
+struct cpumask *tick_get_broadcast_oneshot_mask(void)
+{
+	return to_cpumask(tick_broadcast_oneshot_mask);
+}
+
+/*
+ * Program the broadcast device to fire at 'expires', switching it to
+ * oneshot mode first if it is in any other mode. Returns the result of
+ * clockevents_program_event().
+ */
+static int tick_broadcast_set_event(ktime_t expires, int force)
+{
+	struct clock_event_device *bcdev = tick_broadcast_device.evtdev;
+
+	if (bcdev->mode != CLOCK_EVT_MODE_ONESHOT)
+		clockevents_set_mode(bcdev, CLOCK_EVT_MODE_ONESHOT);
+
+	return clockevents_program_event(bcdev, expires, force);
+}
+
+/* Resume: put the broadcast device back into oneshot mode; returns 0. */
+int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+{
+	clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+
+	return 0;
+}
+
+/*
+ * Called from irq_enter() when idle was interrupted to reenable the
+ * per cpu device. Only cpus set in the oneshot broadcast mask are
+ * touched.
+ */
+void tick_check_oneshot_broadcast(int cpu)
+{
+	struct tick_device *td;
+
+	if (!cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask)))
+		return;
+
+	td = &per_cpu(tick_cpu_device, cpu);
+	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
+}
+
+/*
+ * Handle oneshot mode broadcasting
+ *
+ * Event handler of the broadcast device in oneshot mode. Under
+ * tick_broadcast_lock it collects every cpu of the oneshot mask whose
+ * next_event is not later than now into tmpmask, remembers the earliest
+ * event still pending on the other cpus, broadcasts to the collected
+ * cpus and rearms the broadcast device for the remembered event. If
+ * rearming fails (non-zero return) the whole scan is repeated.
+ */
+static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
+{
+ struct tick_device *td;
+ ktime_t now, next_event;
+ int cpu;
+
+ raw_spin_lock(&tick_broadcast_lock);
+again:
+ dev->next_event.tv64 = KTIME_MAX;
+ next_event.tv64 = KTIME_MAX;
+ cpumask_clear(to_cpumask(tmpmask));
+ now = ktime_get();
+ /* Find all expired events */
+ for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
+ td = &per_cpu(tick_cpu_device, cpu);
+ if (td->evtdev->next_event.tv64 <= now.tv64)
+ cpumask_set_cpu(cpu, to_cpumask(tmpmask));
+ else if (td->evtdev->next_event.tv64 < next_event.tv64)
+ next_event.tv64 = td->evtdev->next_event.tv64;
+ }
+
+ /*
+ * Wakeup the cpus which have an expired event.
+ */
+ tick_do_broadcast(to_cpumask(tmpmask));
+
+ /*
+ * Two reasons for reprogram:
+ *
+ * - The global event did not expire any CPU local
+ * events. This happens in dyntick mode, as the maximum PIT
+ * delta is quite small.
+ *
+ * - There are pending events on sleeping CPUs which were not
+ * in the event mask
+ */
+ if (next_event.tv64 != KTIME_MAX) {
+ /*
+ * Rearm the broadcast device. If event expired,
+ * repeat the above
+ */
+ if (tick_broadcast_set_event(next_event, 0))
+ goto again;
+ }
+ raw_spin_unlock(&tick_broadcast_lock);
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state, where
+ * affected devices might stop
+ *
+ * Only acts when the broadcast device is not in periodic mode and the
+ * tick device of the current cpu has CLOCK_EVT_FEAT_C3STOP set.
+ *
+ * On BROADCAST_ENTER the cpu is added to the oneshot mask, its local
+ * device is shut down and the broadcast device is reprogrammed if the
+ * local event is due earlier than the broadcast event. On any other
+ * reason the cpu is removed from the mask, its local device is put back
+ * into oneshot mode and reprogrammed for its pending event, if any.
+ */
+void tick_broadcast_oneshot_control(unsigned long reason)
+{
+ struct clock_event_device *bc, *dev;
+ struct tick_device *td;
+ unsigned long flags;
+ int cpu;
+
+ /*
+ * Periodic mode does not care about the enter/exit of power
+ * states
+ */
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ return;
+
+ /*
+ * We are called with preemption disabled from the depth of the
+ * idle code, so we can't be moved away.
+ */
+ cpu = smp_processor_id();
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return;
+
+ bc = tick_broadcast_device.evtdev;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+ if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
+ cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ if (dev->next_event.tv64 < bc->next_event.tv64)
+ tick_broadcast_set_event(dev->next_event, 1);
+ }
+ } else {
+ if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
+ cpumask_clear_cpu(cpu,
+ tick_get_broadcast_oneshot_mask());
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ if (dev->next_event.tv64 != KTIME_MAX)
+ tick_program_event(dev->next_event, 1);
+ }
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Reset the one shot broadcast for a cpu by clearing its bit in the
+ * oneshot broadcast mask.
+ *
+ * Called with tick_broadcast_lock held
+ */
+static void tick_broadcast_clear_oneshot(int cpu)
+{
+	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
+}
+
+/*
+ * Set next_event of the tick device of every cpu in 'mask' to
+ * 'expires'. Cpus without an event device are skipped.
+ */
+static void tick_broadcast_init_next_event(struct cpumask *mask,
+					   ktime_t expires)
+{
+	struct clock_event_device *evt;
+	int cpu;
+
+	for_each_cpu(cpu, mask) {
+		evt = per_cpu(tick_cpu_device, cpu).evtdev;
+		if (!evt)
+			continue;
+		evt->next_event = expires;
+	}
+}
+
+/**
+ * tick_broadcast_setup_oneshot - setup the broadcast device
+ *
+ * On the first call for @bc the oneshot broadcast handler is installed,
+ * the calling cpu takes over tick_do_timer_cpu and every other cpu of
+ * the broadcast mask is added to the oneshot mask. If @bc was in
+ * periodic mode and such cpus exist, @bc is switched to oneshot mode,
+ * their next_event is set to tick_next_period and @bc is programmed for
+ * it; otherwise the next_event of @bc is set to KTIME_MAX. On later
+ * calls only the oneshot bit of the calling cpu is cleared.
+ */
+void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ int cpu = smp_processor_id();
+
+ /* Set it up only once ! */
+ if (bc->event_handler != tick_handle_oneshot_broadcast) {
+ int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+
+ bc->event_handler = tick_handle_oneshot_broadcast;
+
+ /* Take the do_timer update */
+ tick_do_timer_cpu = cpu;
+
+ /*
+ * We must be careful here. There might be other CPUs
+ * waiting for periodic broadcast. We need to set the
+ * oneshot_mask bits for those and program the
+ * broadcast device to fire.
+ */
+ cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
+ cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
+ cpumask_or(tick_get_broadcast_oneshot_mask(),
+ tick_get_broadcast_oneshot_mask(),
+ to_cpumask(tmpmask));
+
+ if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ tick_broadcast_init_next_event(to_cpumask(tmpmask),
+ tick_next_period);
+ tick_broadcast_set_event(tick_next_period, 1);
+ } else
+ bc->next_event.tv64 = KTIME_MAX;
+ } else {
+ /*
+ * The first cpu which switches to oneshot mode sets
+ * the bit for all other cpus which are in the general
+ * (periodic) broadcast mask. So the bit is set and
+ * would prevent the first broadcast enter after this
+ * to program the bc device.
+ */
+ tick_broadcast_clear_oneshot(cpu);
+ }
+}
+
+/*
+ * Select oneshot operating mode for the broadcast device. The mode is
+ * recorded even when no broadcast device is installed yet.
+ */
+void tick_broadcast_switch_to_oneshot(void)
+{
+	struct clock_event_device *bcdev;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
+
+	bcdev = tick_broadcast_device.evtdev;
+	if (bcdev != NULL)
+		tick_broadcast_setup_oneshot(bcdev);
+
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+
+/*
+ * Remove a dead CPU from broadcasting
+ */
+void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
+{
+	unsigned int dead_cpu = *cpup;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	/*
+	 * Only the oneshot mask bit of the dead cpu is cleared; the
+	 * broadcast device itself is not stopped!
+	 */
+	cpumask_clear_cpu(dead_cpu, tick_get_broadcast_oneshot_mask());
+
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Check, whether the broadcast device is in one shot mode.
+ * Returns 1 if so, 0 otherwise.
+ */
+int tick_broadcast_oneshot_active(void)
+{
+	return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
+}
+
+/*
+ * Check whether the broadcast device supports oneshot. Without a
+ * broadcast device the answer is false.
+ */
+bool tick_broadcast_oneshot_available(void)
+{
+	struct clock_event_device *bcdev = tick_broadcast_device.evtdev;
+
+	if (!bcdev)
+		return false;
+
+	return bcdev->features & CLOCK_EVT_FEAT_ONESHOT;
+}
+
+#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 00000000..da6c9eca
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,419 @@
+/*
+ * linux/kernel/time/tick-common.c
+ *
+ * This file contains the base functions to manage periodic tick
+ * related events.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+
+#include <asm/irq_regs.h>
+
+#include "tick-internal.h"
+
+/*
+ * Tick devices
+ */
+DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
+/*
+ * Tick next event: keeps track of the tick time
+ */
+ktime_t tick_next_period;
+ktime_t tick_period;
+int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+static DEFINE_RAW_SPINLOCK(tick_device_lock);
+
+/*
+ * Accessor for the tick device of a given cpu.
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_device(int cpu)
+{
+	return &per_cpu(tick_cpu_device, cpu);
+}
+
+/**
+ * tick_is_oneshot_available - check for a oneshot capable event device
+ *
+ * Returns 0 when the current cpu has no event device or the device
+ * lacks CLOCK_EVT_FEAT_ONESHOT. A device that has CLOCK_EVT_FEAT_C3STOP
+ * set additionally needs a oneshot capable broadcast device.
+ */
+int tick_is_oneshot_available(void)
+{
+	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+
+	if (!dev)
+		return 0;
+	if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+		return 0;
+	if (dev->features & CLOCK_EVT_FEAT_C3STOP)
+		return tick_broadcast_oneshot_available();
+
+	return 1;
+}
+
+/*
+ * Periodic tick
+ *
+ * The cpu which owns the do_timer duty advances tick_next_period and
+ * calls do_timer() under xtime_lock. Every cpu updates its process
+ * times and the profiling tick.
+ */
+static void tick_periodic(int cpu)
+{
+	if (cpu == tick_do_timer_cpu) {
+		write_seqlock(&xtime_lock);
+
+		/* remember when the next tick is due */
+		tick_next_period = ktime_add(tick_next_period, tick_period);
+
+		do_timer(1);
+		write_sequnlock(&xtime_lock);
+	}
+
+	update_process_times(user_mode(get_irq_regs()));
+	profile_tick(CPU_PROFILING);
+}
+
+/*
+ * Event handler for periodic ticks
+ *
+ * Runs tick_periodic() for the current cpu. When the device is in
+ * oneshot mode the next event is programmed by hand, one tick_period
+ * after dev->next_event, and further periods are added until
+ * clockevents_program_event() succeeds (returns 0).
+ */
+void tick_handle_periodic(struct clock_event_device *dev)
+{
+ int cpu = smp_processor_id();
+ ktime_t next;
+
+ tick_periodic(cpu);
+
+ if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+ return;
+ /*
+ * Setup the next period for devices, which do not have
+ * periodic mode:
+ */
+ next = ktime_add(dev->next_event, tick_period);
+ for (;;) {
+ if (!clockevents_program_event(dev, next, false))
+ return;
+ /*
+ * Have to be careful here. If we're in oneshot mode,
+ * before we call tick_periodic() in a loop, we need
+ * to be sure we're using a real hardware clocksource.
+ * Otherwise we could get trapped in an infinite
+ * loop, as the tick_periodic() increments jiffies,
+ * which then will increment time, possibly causing
+ * the loop to trigger again and again.
+ */
+ if (timekeeping_valid_for_hres())
+ tick_periodic(cpu);
+ next = ktime_add(next, tick_period);
+ }
+}
+
+/*
+ * Setup the device for a periodic tick
+ *
+ * The periodic handler is always installed. A functional device is then
+ * either put into periodic mode or, when it lacks that feature or
+ * oneshot broadcasting is active, put into oneshot mode and programmed
+ * for the next tick period.
+ */
+void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
+{
+	unsigned long seq;
+	ktime_t next;
+
+	tick_set_periodic_handler(dev, broadcast);
+
+	/* Broadcast setup ? */
+	if (!tick_device_is_functional(dev))
+		return;
+
+	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+	    !tick_broadcast_oneshot_active()) {
+		clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+		return;
+	}
+
+	/* take a consistent snapshot of tick_next_period */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		next = tick_next_period;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+
+	/* keep adding periods until the event can be programmed */
+	while (clockevents_program_event(dev, next, false))
+		next = ktime_add(next, tick_period);
+}
+
+/*
+ * Setup the tick device
+ *
+ * Installs @newdev as the event device of @td. On the very first setup
+ * the tick device starts in periodic mode and, if no cpu owns the
+ * do_timer duty yet, @cpu takes it and the tick period is initialised.
+ * When a device is replaced, the handler and next event of the old
+ * device are saved for the oneshot setup and the old handler is
+ * replaced by a noop. The interrupt of a device that is not per cpu is
+ * pinned to @cpumask. Nothing further is set up when @newdev is only a
+ * placeholder served by the broadcast device.
+ */
+static void tick_setup_device(struct tick_device *td,
+ struct clock_event_device *newdev, int cpu,
+ const struct cpumask *cpumask)
+{
+ ktime_t next_event;
+ void (*handler)(struct clock_event_device *) = NULL;
+
+ /*
+ * First device setup ?
+ */
+ if (!td->evtdev) {
+ /*
+ * If no cpu took the do_timer update, assign it to
+ * this cpu:
+ */
+ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+ tick_do_timer_cpu = cpu;
+ tick_next_period = ktime_get();
+ tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
+ }
+
+ /*
+ * Startup in periodic mode first.
+ */
+ td->mode = TICKDEV_MODE_PERIODIC;
+ } else {
+ handler = td->evtdev->event_handler;
+ next_event = td->evtdev->next_event;
+ td->evtdev->event_handler = clockevents_handle_noop;
+ }
+
+ td->evtdev = newdev;
+
+ /*
+ * When the device is not per cpu, pin the interrupt to the
+ * current cpu:
+ */
+ if (!cpumask_equal(newdev->cpumask, cpumask))
+ irq_set_affinity(newdev->irq, cpumask);
+
+ /*
+ * When global broadcasting is active, check if the current
+ * device is registered as a placeholder for broadcast mode.
+ * This allows us to handle this x86 misfeature in a generic
+ * way.
+ */
+ if (tick_device_uses_broadcast(newdev, cpu))
+ return;
+
+ if (td->mode == TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(newdev, 0);
+ else
+ tick_setup_oneshot(newdev, handler, next_event);
+}
+
+/*
+ * Check, if the new registered device should be used.
+ *
+ * Returns NOTIFY_STOP when @newdev was taken, either as the tick device
+ * of the current cpu or as the broadcast device, and NOTIFY_OK
+ * otherwise. A device is rejected as tick device when it does not
+ * cover the current cpu, when it is not cpu local and its interrupt
+ * affinity cannot be set or a cpu local device is already installed,
+ * when it would replace a oneshot capable device with one lacking that
+ * feature, or when its rating is not higher than the current one.
+ * Rejected devices are still offered to the broadcast code.
+ */
+static int tick_check_new_device(struct clock_event_device *newdev)
+{
+ struct clock_event_device *curdev;
+ struct tick_device *td;
+ int cpu, ret = NOTIFY_OK;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_device_lock, flags);
+
+ cpu = smp_processor_id();
+ if (!cpumask_test_cpu(cpu, newdev->cpumask))
+ goto out_bc;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ curdev = td->evtdev;
+
+ /* cpu local device ? */
+ if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
+
+ /*
+ * If the cpu affinity of the device interrupt can not
+ * be set, ignore it.
+ */
+ if (!irq_can_set_affinity(newdev->irq))
+ goto out_bc;
+
+ /*
+ * If we have a cpu local device already, do not replace it
+ * by a non cpu local device
+ */
+ if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
+ goto out_bc;
+ }
+
+ /*
+ * If we have an active device, then check the rating and the oneshot
+ * feature.
+ */
+ if (curdev) {
+ /*
+ * Prefer one shot capable devices !
+ */
+ if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+ !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ goto out_bc;
+ /*
+ * Check the rating
+ */
+ if (curdev->rating >= newdev->rating)
+ goto out_bc;
+ }
+
+ /*
+ * Replace the possibly existing device by the new
+ * device. If the current device is the broadcast device, do
+ * not give it back to the clockevents layer !
+ */
+ if (tick_is_broadcast_device(curdev)) {
+ clockevents_shutdown(curdev);
+ curdev = NULL;
+ }
+ clockevents_exchange_device(curdev, newdev);
+ tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
+ if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_oneshot_notify();
+
+ raw_spin_unlock_irqrestore(&tick_device_lock, flags);
+ return NOTIFY_STOP;
+
+out_bc:
+ /*
+ * Can the new device be used as a broadcast device ?
+ */
+ if (tick_check_broadcast_device(newdev))
+ ret = NOTIFY_STOP;
+
+ raw_spin_unlock_irqrestore(&tick_device_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Transfer the do_timer job away from a dying cpu.
+ *
+ * The job goes to the first cpu in cpu_online_mask, or to
+ * TICK_DO_TIMER_NONE when that mask is empty. Nothing happens when the
+ * dying cpu does not own the job.
+ *
+ * Called with interrupts disabled.
+ */
+static void tick_handover_do_timer(int *cpup)
+{
+	int cpu;
+
+	if (*cpup != tick_do_timer_cpu)
+		return;
+
+	cpu = cpumask_first(cpu_online_mask);
+	if (cpu < nr_cpu_ids)
+		tick_do_timer_cpu = cpu;
+	else
+		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+}
+
+/*
+ * Shutdown an event device on a given cpu:
+ *
+ * This is called on a live CPU, when a CPU is dead. So we cannot
+ * access the hardware device itself.
+ * We just set the mode and remove it from the lists.
+ */
+static void tick_shutdown(unsigned int *cpup)
+{
+	struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
+	struct clock_event_device *evt = td->evtdev;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&tick_device_lock, flags);
+
+	td->mode = TICKDEV_MODE_PERIODIC;
+	if (evt != NULL) {
+		/*
+		 * Marking the device unused first keeps the clock
+		 * events layer from calling its set mode function!
+		 */
+		evt->mode = CLOCK_EVT_MODE_UNUSED;
+		clockevents_exchange_device(evt, NULL);
+		td->evtdev = NULL;
+	}
+
+	raw_spin_unlock_irqrestore(&tick_device_lock, flags);
+}
+
+/* Suspend: shut down the event device of the current cpu. */
+static void tick_suspend(void)
+{
+	struct tick_device *this_td = &__get_cpu_var(tick_cpu_device);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&tick_device_lock, flags);
+	clockevents_shutdown(this_td->evtdev);
+	raw_spin_unlock_irqrestore(&tick_device_lock, flags);
+}
+
+/*
+ * Resume: the broadcast device is resumed first, then the event device
+ * of the current cpu. Unless the broadcast code reports that it serves
+ * this cpu, the local tick is set up again in the mode it had before.
+ */
+static void tick_resume(void)
+{
+	struct tick_device *this_td = &__get_cpu_var(tick_cpu_device);
+	unsigned long flags;
+	int broadcast = tick_resume_broadcast();
+
+	raw_spin_lock_irqsave(&tick_device_lock, flags);
+
+	clockevents_set_mode(this_td->evtdev, CLOCK_EVT_MODE_RESUME);
+
+	if (broadcast)
+		goto unlock;
+
+	if (this_td->mode == TICKDEV_MODE_PERIODIC)
+		tick_setup_periodic(this_td->evtdev, 0);
+	else
+		tick_resume_oneshot();
+
+unlock:
+	raw_spin_unlock_irqrestore(&tick_device_lock, flags);
+}
+
+/*
+ * Notification about clock event devices
+ *
+ * Dispatches on @reason. @dev is handed through untyped: the ADD case
+ * passes it on as the new clock event device, the BROADCAST_ON/OFF/FORCE
+ * and CPU_DYING/CPU_DEAD cases pass it on as a pointer to a cpu number.
+ * Returns the verdict of tick_check_new_device() for ADD and NOTIFY_OK
+ * for every other reason, including unknown ones.
+ */
+static int tick_notify(struct notifier_block *nb, unsigned long reason,
+ void *dev)
+{
+ switch (reason) {
+
+ case CLOCK_EVT_NOTIFY_ADD:
+ return tick_check_new_device(dev);
+
+ case CLOCK_EVT_NOTIFY_BROADCAST_ON:
+ case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
+ case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ tick_broadcast_on_off(reason, dev);
+ break;
+
+ case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
+ case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
+ tick_broadcast_oneshot_control(reason);
+ break;
+
+ case CLOCK_EVT_NOTIFY_CPU_DYING:
+ tick_handover_do_timer(dev);
+ break;
+
+ case CLOCK_EVT_NOTIFY_CPU_DEAD:
+ tick_shutdown_broadcast_oneshot(dev);
+ tick_shutdown_broadcast(dev);
+ tick_shutdown(dev);
+ break;
+
+ case CLOCK_EVT_NOTIFY_SUSPEND:
+ tick_suspend();
+ tick_suspend_broadcast();
+ break;
+
+ case CLOCK_EVT_NOTIFY_RESUME:
+ tick_resume();
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block tick_notifier = {
+ .notifier_call = tick_notify, /* registered by tick_init() */
+};
+
+/**
+ * tick_init - initialize the tick control
+ *
+ * Hooks tick_notifier, and with it tick_notify(), into the clockevents
+ * framework.
+ */
+void __init tick_init(void)
+{
+	clockevents_register_notifier(&tick_notifier);
+}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 00000000..4e265b90
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,144 @@
+/*
+ * tick internal variables and functions used by low/high res code
+ */
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
+
+#define TICK_DO_TIMER_NONE -1
+#define TICK_DO_TIMER_BOOT -2
+
+DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
+extern ktime_t tick_next_period;
+extern ktime_t tick_period;
+extern int tick_do_timer_cpu __read_mostly;
+
+extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
+extern void tick_handle_periodic(struct clock_event_device *dev);
+
+extern void clockevents_shutdown(struct clock_event_device *dev);
+
+/*
+ * NO_HZ / high resolution timer shared code
+ */
+#ifdef CONFIG_TICK_ONESHOT
+extern void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt);
+extern int tick_program_event(ktime_t expires, int force);
+extern void tick_oneshot_notify(void);
+extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
+extern void tick_resume_oneshot(void);
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
+extern void tick_broadcast_oneshot_control(unsigned long reason);
+extern void tick_broadcast_switch_to_oneshot(void);
+extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
+extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern int tick_broadcast_oneshot_active(void);
+extern void tick_check_oneshot_broadcast(int cpu);
+bool tick_broadcast_oneshot_available(void);
+# else /* BROADCAST */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ BUG();
+}
+static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline void tick_broadcast_switch_to_oneshot(void) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline void tick_check_oneshot_broadcast(int cpu) { }
+static inline bool tick_broadcast_oneshot_available(void) { return true; }
+# endif /* !BROADCAST */
+
+#else /* !ONESHOT */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt)
+{
+ BUG();
+}
+static inline void tick_resume_oneshot(void)
+{
+ BUG();
+}
+static inline int tick_program_event(ktime_t expires, int force)
+{
+ return 0;
+}
+static inline void tick_oneshot_notify(void) { }
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ BUG();
+}
+static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+{
+ return 0;
+}
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline bool tick_broadcast_oneshot_available(void) { return false; }
+#endif /* !TICK_ONESHOT */
+
+/*
+ * Broadcasting support
+ */
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern int tick_check_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
+extern void tick_shutdown_broadcast(unsigned int *cpup);
+extern void tick_suspend_broadcast(void);
+extern int tick_resume_broadcast(void);
+
+extern void
+tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+
+#else /* !BROADCAST */
+
+static inline int tick_check_broadcast_device(struct clock_event_device *dev)
+{
+ return 0;
+}
+
+static inline int tick_is_broadcast_device(struct clock_event_device *dev)
+{
+ return 0;
+}
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
+ int cpu)
+{
+ return 0;
+}
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
+static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
+static inline void tick_suspend_broadcast(void) { }
+static inline int tick_resume_broadcast(void) { return 0; }
+
+/*
+ * Set the periodic handler in non broadcast mode
+ */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev,
+ int broadcast)
+{
+ dev->event_handler = tick_handle_periodic;
+}
+#endif /* !BROADCAST */
+
+/*
+ * Check, if the device is functional or a dummy for broadcast
+ */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+ return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
+
+#endif
+
+extern void do_timer(unsigned long ticks);
+extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 00000000..82410906
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,116 @@
+/*
+ * linux/kernel/time/tick-oneshot.c
+ *
+ * This file contains functions which manage high resolution tick
+ * related events.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+
+#include "tick-internal.h"
+
+/**
+ * tick_program_event - program the next event on this CPU's tick device
+ */
+int tick_program_event(ktime_t expires, int force)
+{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+
+ return clockevents_program_event(dev, expires, force);
+}
+
+/**
+ * tick_resume_oneshot - resume oneshot mode
+ */
+void tick_resume_oneshot(void)
+{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_program_event(dev, ktime_get(), true);
+}
+
+/**
+ * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
+ */
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t next_event)
+{
+ newdev->event_handler = handler;
+ clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_program_event(newdev, next_event, true);
+}
+
+/**
+ * tick_switch_to_oneshot - switch to oneshot mode
+ * @handler: event handler installed on this CPU's tick device on success
+ *
+ * Returns 0 on success, or -EINVAL (after logging why) when this CPU has
+ * no tick device, or the device is a dummy or lacks CLOCK_EVT_FEAT_ONESHOT.
+ */
+int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
+{
+	struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+	struct clock_event_device *dev = td->evtdev;
+
+	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
+	    !tick_device_is_functional(dev)) {
+		printk(KERN_INFO "Clockevents: could not switch to one-shot mode:");
+		if (!dev) /* KERN_CONT keeps the reason on the same log line */
+			printk(KERN_CONT " no tick device\n");
+		else if (!tick_device_is_functional(dev))
+			printk(KERN_CONT " %s is not functional.\n", dev->name);
+		else
+			printk(KERN_CONT " %s does not support one-shot mode.\n",
+			       dev->name);
+		return -EINVAL; /* td->mode and the event handler are left untouched */
+	}
+
+	td->mode = TICKDEV_MODE_ONESHOT;
+	dev->event_handler = handler;
+	clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+	tick_broadcast_switch_to_oneshot();
+	return 0;
+}
+
+/**
+ * tick_oneshot_mode_active - check whether the system is in oneshot mode
+ *
+ * returns 1 when either nohz or highres are enabled. otherwise 0.
+ */
+int tick_oneshot_mode_active(void)
+{
+	unsigned long irqflags;
+	int active;
+
+	/* Sample this CPU's tick device mode with local interrupts off. */
+	local_irq_save(irqflags);
+	active = (__this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT);
+	local_irq_restore(irqflags);
+	return active;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * tick_init_highres - switch to high resolution mode
+ *
+ * Called with interrupts disabled.
+ */
+int tick_init_highres(void)
+{
+ return tick_switch_to_oneshot(hrtimer_interrupt);
+}
+#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 00000000..6a3a5b9f
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,912 @@
+/*
+ * linux/kernel/time/tick-sched.c
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
+ *
+ * No idle tick implementation for low and high resolution timers
+ *
+ * Started by: Thomas Gleixner and Ingo Molnar
+ *
+ * Distribute under GPLv2.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#include <asm/irq_regs.h>
+
+#include "tick-internal.h"
+
+/*
+ * Per cpu nohz control structure
+ */
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+
+/*
+ * The time, when the last jiffy update happened. Protected by xtime_lock.
+ */
+static ktime_t last_jiffies_update;
+
+struct tick_sched *tick_get_tick_sched(int cpu)
+{
+ return &per_cpu(tick_cpu_sched, cpu);
+}
+
+/*
+ * Must be called with interrupts disabled !
+ */
+static void tick_do_update_jiffies64(ktime_t now)
+{
+ unsigned long ticks = 0;
+ ktime_t delta;
+
+ /*
+ * Do a quick check without holding xtime_lock:
+ */
+ delta = ktime_sub(now, last_jiffies_update);
+ if (delta.tv64 < tick_period.tv64)
+ return; /* less than one tick period since the last update */
+
+ /* Reevaluate with xtime_lock held */
+ write_seqlock(&xtime_lock);
+
+ delta = ktime_sub(now, last_jiffies_update);
+ if (delta.tv64 >= tick_period.tv64) {
+
+ delta = ktime_sub(delta, tick_period);
+ last_jiffies_update = ktime_add(last_jiffies_update,
+ tick_period);
+
+ /* Slow path for long timeouts */
+ if (unlikely(delta.tv64 >= tick_period.tv64)) {
+ s64 incr = ktime_to_ns(tick_period);
+
+ ticks = ktime_divns(delta, incr); /* whole periods beyond the first one */
+
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ incr * ticks);
+ }
+ do_timer(++ticks); /* the first period plus any extra whole periods */
+
+ /* Keep the tick_next_period variable up to date */
+ tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ }
+ write_sequnlock(&xtime_lock);
+}
+
+/*
+ * Initialize and retrieve the jiffies update.
+ */
+static ktime_t tick_init_jiffy_update(void)
+{
+ ktime_t period;
+
+ write_seqlock(&xtime_lock);
+ /* Did we start the jiffies update yet ? */
+ if (last_jiffies_update.tv64 == 0)
+ last_jiffies_update = tick_next_period; /* seeded only while still zero */
+ period = last_jiffies_update; /* snapshot taken under xtime_lock */
+ write_sequnlock(&xtime_lock);
+ return period; /* the time of the last jiffies update, not a duration */
+}
+
+/*
+ * NOHZ - aka dynamic tick functionality
+ */
+#ifdef CONFIG_NO_HZ
+/*
+ * NO HZ enabled ?
+ */
+static int tick_nohz_enabled __read_mostly = 1;
+
+/*
+ * Enable / Disable tickless mode
+ */
+static int __init setup_tick_nohz(char *str)
+{
+	/* Only the exact strings "on" and "off" are handled; else return 0. */
+	int want_on = !strcmp(str, "on");
+
+	if (!want_on && strcmp(str, "off"))
+		return 0;
+	tick_nohz_enabled = want_on;
+	return 1;
+}
+
+__setup("nohz=", setup_tick_nohz);
+
+/**
+ * tick_nohz_update_jiffies - update jiffies when idle was interrupted
+ *
+ * Called from interrupt entry when the CPU was idle
+ *
+ * In case the sched_tick was stopped on this CPU, we have to check if jiffies
+ * must be updated. Otherwise an interrupt handler could use a stale jiffy
+ * value. We do this unconditionally on any cpu, as we don't know whether the
+ * cpu, which has the update task assigned is in a long sleep.
+ */
+static void tick_nohz_update_jiffies(ktime_t now)
+{
+ int cpu = smp_processor_id();
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ unsigned long flags;
+
+ ts->idle_waketime = now;
+
+ local_irq_save(flags);
+ tick_do_update_jiffies64(now);
+ local_irq_restore(flags);
+
+ touch_softlockup_watchdog();
+}
+
+/*
+ * Updates the per cpu time idle statistics counters
+ */
+static void
+update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+{
+ ktime_t delta;
+
+ if (ts->idle_active) { /* nothing is accumulated when the cpu is not marked idle */
+ delta = ktime_sub(now, ts->idle_entrytime);
+ if (nr_iowait_cpu(cpu) > 0) /* the whole delta goes to exactly one of the two counters */
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ ts->idle_entrytime = now; /* restart the measurement so delta is not counted twice */
+ }
+
+ if (last_update_time) /* optional out parameter, in microseconds */
+ *last_update_time = ktime_to_us(now);
+
+}
+
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ update_ts_time_stats(cpu, ts, now, NULL);
+ ts->idle_active = 0;
+
+ sched_clock_idle_wakeup_event(0);
+}
+
+static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
+{
+ ktime_t now = ktime_get();
+
+ ts->idle_entrytime = now;
+ ts->idle_active = 1;
+ sched_clock_idle_sleep_event();
+ return now;
+}
+
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cumulative idle time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, elapsed;
+
+	if (!tick_nohz_enabled)
+		return -1;
+
+	now = ktime_get();
+
+	/* Caller wants the counters refreshed: fold in the current period. */
+	if (last_update_time) {
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		return ktime_to_us(ts->idle_sleeptime);
+	}
+
+	/* Read-only query: nothing to add unless idle without pending iowait. */
+	if (!ts->idle_active || nr_iowait_cpu(cpu))
+		return ktime_to_us(ts->idle_sleeptime);
+
+	/* Add the still-running idle period without storing it. */
+	elapsed = ktime_sub(now, ts->idle_entrytime);
+
+	return ktime_to_us(ktime_add(ts->idle_sleeptime, elapsed));
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+
+/**
+ * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cumulative iowait time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, elapsed;
+
+	if (!tick_nohz_enabled)
+		return -1;
+
+	now = ktime_get();
+
+	/* Caller wants the counters refreshed: fold in the current period. */
+	if (last_update_time) {
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		return ktime_to_us(ts->iowait_sleeptime);
+	}
+
+	/* Read-only query: nothing to add unless idle with iowait pending. */
+	if (!ts->idle_active || !(nr_iowait_cpu(cpu) > 0))
+		return ktime_to_us(ts->iowait_sleeptime);
+
+	/* Add the still-running iowait period without storing it. */
+	elapsed = ktime_sub(now, ts->idle_entrytime);
+	return ktime_to_us(ktime_add(ts->iowait_sleeptime, elapsed));
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
+{
+ unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
+ ktime_t last_update, expires, now;
+ struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+ u64 time_delta;
+ int cpu;
+
+ cpu = smp_processor_id();
+ ts = &per_cpu(tick_cpu_sched, cpu);
+
+ now = tick_nohz_start_idle(cpu, ts);
+
+ /*
+ * If this cpu is offline and it is the one which updates
+ * jiffies, then give up the assignment and let it be taken by
+ * the cpu which runs the tick timer next. If we don't drop
+ * this here the jiffies might be stale and do_timer() never
+ * invoked.
+ */
+ if (unlikely(!cpu_online(cpu))) {
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ }
+
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+ return;
+
+ if (need_resched())
+ return;
+
+ if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+ static int ratelimit;
+
+ if (ratelimit < 10) {
+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+ (unsigned int) local_softirq_pending());
+ ratelimit++;
+ }
+ return;
+ }
+
+ ts->idle_calls++;
+ /* Read jiffies and the time when jiffies were updated last */
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ last_update = last_jiffies_update;
+ last_jiffies = jiffies;
+ time_delta = timekeeping_max_deferment();
+ } while (read_seqretry(&xtime_lock, seq));
+
+ if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
+ arch_needs_cpu(cpu)) {
+ next_jiffies = last_jiffies + 1;
+ delta_jiffies = 1;
+ } else {
+ /* Get the next timer wheel timer */
+ next_jiffies = get_next_timer_interrupt(last_jiffies);
+ delta_jiffies = next_jiffies - last_jiffies;
+ }
+ /*
+ * Do not stop the tick, if we are only one off
+ * or if the cpu is required for rcu
+ */
+ if (!ts->tick_stopped && delta_jiffies == 1)
+ goto out;
+
+ /* Schedule the tick, if we are at least one jiffie off */
+ if ((long)delta_jiffies >= 1) {
+
+ /*
+ * If this cpu is the one which updates jiffies, then
+ * give up the assignment and let it be taken by the
+ * cpu which runs the tick timer next, which might be
+ * this cpu as well. If we don't drop this here the
+ * jiffies might be stale and do_timer() never
+ * invoked. Keep track of the fact that it was the one
+ * which had the do_timer() duty last. If this cpu is
+ * the one which had the do_timer() duty last, we
+ * limit the sleep time to the timekeeping
+ * max_deferment value which we retrieved
+ * above. Otherwise we can sleep as long as we want.
+ */
+ if (cpu == tick_do_timer_cpu) {
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ ts->do_timer_last = 1;
+ } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+ time_delta = KTIME_MAX;
+ ts->do_timer_last = 0;
+ } else if (!ts->do_timer_last) {
+ time_delta = KTIME_MAX;
+ }
+
+ /*
+ * calculate the expiry time for the next timer wheel
+ * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
+ * that there is no timer pending or at least extremely
+ * far into the future (12 days for HZ=1000). In this
+ * case we set the expiry to the end of time.
+ */
+ if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
+ /*
+ * Calculate the time delta for the next timer event.
+ * If the time delta exceeds the maximum time delta
+ * permitted by the current clocksource then adjust
+ * the time delta accordingly to ensure the
+ * clocksource does not wrap.
+ */
+ time_delta = min_t(u64, time_delta,
+ tick_period.tv64 * delta_jiffies);
+ }
+
+ if (time_delta < KTIME_MAX)
+ expires = ktime_add_ns(last_update, time_delta);
+ else
+ expires.tv64 = KTIME_MAX;
+
+ /* Skip reprogram of event if it's not changed */
+ if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+ goto out;
+
+ /*
+ * nohz_stop_sched_tick can be called several times before
+ * the nohz_restart_sched_tick is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the
+ * first call we save the current tick time, so we can restart
+ * the scheduler tick in nohz_restart_sched_tick.
+ */
+ if (!ts->tick_stopped) {
+ select_nohz_load_balancer(1);
+
+ ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
+ ts->tick_stopped = 1;
+ ts->idle_jiffies = last_jiffies;
+ }
+
+ ts->idle_sleeps++;
+
+ /* Mark expires */
+ ts->idle_expires = expires;
+
+ /*
+ * If the expiration time == KTIME_MAX, then
+ * in this case we simply stop the tick timer.
+ */
+ if (unlikely(expires.tv64 == KTIME_MAX)) {
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_cancel(&ts->sched_timer);
+ goto out;
+ }
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start(&ts->sched_timer, expires,
+ HRTIMER_MODE_ABS_PINNED);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ goto out;
+ } else if (!tick_program_event(expires, 0))
+ goto out;
+ /*
+ * We are past the event already. So we crossed a
+ * jiffie boundary. Update jiffies and raise the
+ * softirq.
+ */
+ tick_do_update_jiffies64(ktime_get());
+ }
+ raise_softirq_irqoff(TIMER_SOFTIRQ);
+out:
+ ts->next_jiffies = next_jiffies;
+ ts->last_jiffies = last_jiffies;
+ ts->sleep_length = ktime_sub(dev->next_event, now);
+}
+
+/**
+ * tick_nohz_idle_enter - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ * Called when we start the idle loop.
+ *
+ * The arch is responsible of calling:
+ *
+ * - rcu_idle_enter() after its last use of RCU before the CPU is put
+ * to sleep.
+ * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ */
+void tick_nohz_idle_enter(void)
+{
+ struct tick_sched *ts;
+
+ WARN_ON_ONCE(irqs_disabled()); /* expected to be entered with interrupts enabled */
+
+ /*
+ * Update the idle state in the scheduler domain hierarchy
+ * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ * State will be updated to busy during the first busy tick after
+ * exiting idle.
+ */
+ set_cpu_sd_state_idle();
+
+ local_irq_disable(); /* the per-cpu state below is handled with interrupts off */
+
+ ts = &__get_cpu_var(tick_cpu_sched);
+ /*
+ * Set ts->inidle unconditionally. Even if the system did not
+ * switch to nohz mode the cpu frequency governors rely on the
+ * update of the idle time accounting in tick_nohz_start_idle().
+ */
+ ts->inidle = 1;
+ tick_nohz_stop_sched_tick(ts);
+
+ local_irq_enable();
+}
+
+/**
+ * tick_nohz_irq_exit - update next tick event from interrupt exit
+ *
+ * When an interrupt fires while we are idle and it doesn't cause
+ * a reschedule, it may still add, modify or delete a timer, enqueue
+ * an RCU callback, etc...
+ * So we need to re-calculate and reprogram the next tick event.
+ */
+void tick_nohz_irq_exit(void)
+{
+	struct tick_sched *sched;
+
+	sched = &__get_cpu_var(tick_cpu_sched);
+	/* Only re-evaluate the tick while this CPU is marked as in idle. */
+	if (sched->inidle)
+		tick_nohz_stop_sched_tick(sched);
+}
+
+/**
+ * tick_nohz_get_sleep_length - return the length of the current sleep
+ *
+ * Called from power state control code with interrupts disabled
+ */
+ktime_t tick_nohz_get_sleep_length(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ return ts->sleep_length;
+}
+
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_cancel(&ts->sched_timer);
+ hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); /* expiry saved when the tick was stopped */
+
+ while (1) { /* retry until an expiry that is still in the future was programmed */
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start_expires(&ts->sched_timer,
+ HRTIMER_MODE_ABS_PINNED);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ } else { /* otherwise program the tick device directly */
+ if (!tick_program_event(
+ hrtimer_get_expires(&ts->sched_timer), 0))
+ break; /* 0 means the event was programmed */
+ }
+ /* Reread time and update jiffies */
+ now = ktime_get();
+ tick_do_update_jiffies64(now);
+ }
+}
+
+/**
+ * tick_nohz_idle_exit - restart the idle tick from the idle task
+ *
+ * Restart the idle tick when the CPU is woken up from idle
+ * This also exits the RCU extended quiescent state. The CPU
+ * can use RCU again after this function is called.
+ */
+void tick_nohz_idle_exit(void)
+{
+ int cpu = smp_processor_id();
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ unsigned long ticks;
+#endif
+ ktime_t now; /* only assigned, and only read, when idle_active or tick_stopped is set */
+
+ local_irq_disable();
+
+ WARN_ON_ONCE(!ts->inidle); /* expected to pair with tick_nohz_idle_enter() */
+
+ ts->inidle = 0;
+
+ if (ts->idle_active || ts->tick_stopped)
+ now = ktime_get();
+
+ if (ts->idle_active)
+ tick_nohz_stop_idle(cpu, now);
+
+ if (!ts->tick_stopped) { /* the tick was never stopped: nothing to restart */
+ local_irq_enable();
+ return;
+ }
+
+ /* Update jiffies first */
+ select_nohz_load_balancer(0);
+ tick_do_update_jiffies64(now);
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ /*
+ * We stopped the tick in idle. Update process times would miss the
+ * time we slept as update_process_times does only a 1 tick
+ * accounting. Enforce that this is accounted to idle !
+ */
+ ticks = jiffies - ts->idle_jiffies;
+ /*
+ * We might be one off. Do not randomly account a huge number of ticks!
+ */
+ if (ticks && ticks < LONG_MAX)
+ account_idle_ticks(ticks);
+#endif
+
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
+
+ tick_nohz_restart(ts, now);
+
+ local_irq_enable();
+}
+
+static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
+}
+
+/*
+ * The nohz low res interrupt handler
+ */
+static void tick_nohz_handler(struct clock_event_device *dev)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct pt_regs *regs = get_irq_regs();
+ int cpu = smp_processor_id();
+ ktime_t now = ktime_get();
+
+ dev->next_event.tv64 = KTIME_MAX; /* reset here; the reprogram loop at the end sets the next expiry */
+
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themselves to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ tick_do_timer_cpu = cpu;
+
+ /* Check, if the jiffies need an update */
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
+
+ /*
+ * When we are idle and the tick is stopped, we have to touch
+ * the watchdog as we might not schedule for a really long
+ * time. This happens on complete idle SMP systems while
+ * waiting on the login prompt. We also increment the "start
+ * of idle" jiffy stamp so the idle accounting adjustment we
+ * do when we go busy again does not account too many ticks.
+ */
+ if (ts->tick_stopped) {
+ touch_softlockup_watchdog();
+ ts->idle_jiffies++;
+ }
+
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING);
+
+ while (tick_nohz_reprogram(ts, now)) { /* non-zero: programming failed, retry with fresh time */
+ now = ktime_get();
+ tick_do_update_jiffies64(now);
+ }
+}
+
+/**
+ * tick_nohz_switch_to_nohz - switch to nohz mode
+ */
+static void tick_nohz_switch_to_nohz(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ ktime_t next;
+
+ if (!tick_nohz_enabled)
+ return;
+
+ local_irq_disable();
+ if (tick_switch_to_oneshot(tick_nohz_handler)) {
+ local_irq_enable();
+ return;
+ }
+
+ ts->nohz_mode = NOHZ_MODE_LOWRES;
+
+ /*
+ * Recycle the hrtimer in ts, so we can share the
+ * hrtimer_forward with the highres code.
+ */
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ /* Get the next period */
+ next = tick_init_jiffy_update();
+
+ for (;;) {
+ hrtimer_set_expires(&ts->sched_timer, next);
+ if (!tick_program_event(next, 0))
+ break;
+ next = ktime_add(next, tick_period);
+ }
+ local_irq_enable();
+}
+
+/*
+ * When NOHZ is enabled and the tick is stopped, we need to kick the
+ * tick timer from irq_enter() so that the jiffies update is kept
+ * alive during long running softirqs. That's ugly as hell, but
+ * correctness is key even if we need to fix the offending softirq in
+ * the first place.
+ *
+ * Note, this is different to tick_nohz_restart. We just kick the
+ * timer and do not touch the other magic bits which need to be done
+ * when idle is left.
+ */
+static void tick_nohz_kick_tick(int cpu, ktime_t now)
+{
+#if 0
+ /* Switch back to 2.6.27 behaviour */
+
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t delta;
+
+ /*
+ * Do not touch the tick device, when the next expiry is either
+ * already reached or less/equal than the tick period.
+ */
+ delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
+ if (delta.tv64 <= tick_period.tv64)
+ return;
+
+ tick_nohz_restart(ts, now);
+#endif
+}
+
+static inline void tick_check_nohz(int cpu)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t now;
+
+ if (!ts->idle_active && !ts->tick_stopped)
+ return; /* neither idle accounting nor a stopped tick to handle */
+ now = ktime_get(); /* read the time only when it is actually needed */
+ if (ts->idle_active)
+ tick_nohz_stop_idle(cpu, now);
+ if (ts->tick_stopped) {
+ tick_nohz_update_jiffies(now);
+ tick_nohz_kick_tick(cpu, now); /* body is currently compiled out with #if 0 */
+ }
+}
+
+#else
+
+static inline void tick_nohz_switch_to_nohz(void) { }
+static inline void tick_check_nohz(int cpu) { }
+
+#endif /* NO_HZ */
+
+/*
+ * Called from irq_enter to notify about the possible interruption of idle()
+ */
+void tick_check_idle(int cpu)
+{
+ tick_check_oneshot_broadcast(cpu);
+ tick_check_nohz(cpu);
+}
+
+/*
+ * High resolution timer specific code
+ */
+#ifdef CONFIG_HIGH_RES_TIMERS
+/*
+ * We rearm the timer until we get disabled by the idle code.
+ * Called with interrupts disabled and timer->base->cpu_base->lock held.
+ */
+static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+{
+ struct tick_sched *ts =
+ container_of(timer, struct tick_sched, sched_timer);
+ struct pt_regs *regs = get_irq_regs(); /* may be NULL, see the check below */
+ ktime_t now = ktime_get();
+ int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themselves to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ tick_do_timer_cpu = cpu;
+#endif
+
+ /* Check, if the jiffies need an update */
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
+
+ /*
+ * Do not call, when we are not in irq context and have
+ * no valid regs pointer
+ */
+ if (regs) {
+ /*
+ * When we are idle and the tick is stopped, we have to touch
+ * the watchdog as we might not schedule for a really long
+ * time. This happens on complete idle SMP systems while
+ * waiting on the login prompt. We also increment the "start of
+ * idle" jiffy stamp so the idle accounting adjustment we do
+ * when we go busy again does not account too many ticks.
+ */
+ if (ts->tick_stopped) {
+ touch_softlockup_watchdog();
+ ts->idle_jiffies++;
+ }
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING);
+ }
+
+ hrtimer_forward(timer, now, tick_period); /* push the expiry forward in tick_period steps */
+
+ return HRTIMER_RESTART; /* always asks for the timer to be restarted */
+}
+
+/**
+ * tick_setup_sched_timer - setup the tick emulation timer
+ */
+void tick_setup_sched_timer(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ ktime_t now = ktime_get();
+
+ /*
+ * Emulate tick processing via per-CPU hrtimers:
+ */
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ ts->sched_timer.function = tick_sched_timer;
+
+ /* Get the next period (per cpu) */
+ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+
+ for (;;) {
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_start_expires(&ts->sched_timer,
+ HRTIMER_MODE_ABS_PINNED);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ now = ktime_get();
+ }
+
+#ifdef CONFIG_NO_HZ
+ if (tick_nohz_enabled)
+ ts->nohz_mode = NOHZ_MODE_HIGHRES;
+#endif
+}
+#endif /* HIGH_RES_TIMERS */
+
+#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
+void tick_cancel_sched_timer(int cpu)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); /* state of the given cpu, not necessarily the current one */
+
+# ifdef CONFIG_HIGH_RES_TIMERS
+ if (ts->sched_timer.base) /* cancel only when the timer has a base set */
+ hrtimer_cancel(&ts->sched_timer);
+# endif
+
+ ts->nohz_mode = NOHZ_MODE_INACTIVE; /* done regardless of CONFIG_HIGH_RES_TIMERS */
+}
+#endif
+
+/**
+ * tick_clock_notify - async notification about clocksource changes
+ */
+void tick_clock_notify(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
+}
+
+/*
+ * Async notification about clock event changes
+ */
+void tick_oneshot_notify(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ set_bit(0, &ts->check_clocks);
+}
+
+/**
+ * tick_check_oneshot_change - check if a change happened which makes oneshot possible
+ *
+ * Called cyclic from the hrtimer softirq (driven by the timer
+ * softirq) allow_nohz signals, that we can switch into low-res nohz
+ * mode, because high resolution timers are disabled (either compile
+ * or runtime).
+ */
+int tick_check_oneshot_change(int allow_nohz)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (!test_and_clear_bit(0, &ts->check_clocks)) /* bit 0 is set by tick_clock_notify() and tick_oneshot_notify() */
+ return 0;
+
+ if (ts->nohz_mode != NOHZ_MODE_INACTIVE) /* a nohz mode is already active on this cpu */
+ return 0;
+
+ if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
+ return 0;
+
+ if (!allow_nohz) /* return 1 without switching anything here */
+ return 1;
+
+ tick_nohz_switch_to_nohz(); /* switch to low-res nohz here, then report 0 */
+ return 0;
+}
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
new file mode 100644
index 00000000..a9ae3699
--- /dev/null
+++ b/kernel/time/timecompare.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2009 Intel Corporation.
+ * Author: Patrick Ohly <patrick.ohly@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/timecompare.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/math64.h>
+#include <linux/kernel.h>
+
+/*
+ * fixed point arithmetic scale factor for skew
+ *
+ * Usually one would measure skew in ppb (parts per billion, 1e9), but
+ * using a power of 2 simplifies the math.
+ */
+#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
+
+ktime_t timecompare_transform(struct timecompare *sync,
+			      u64 source_tstamp)
+{
+	/* Signed distance from the last update, so older stamps work too */
+	s64 elapsed = (s64)(source_tstamp - sync->last_update);
+	u64 nsec = source_tstamp + sync->offset;
+
+	/* Skew is fixed point, scaled by TIMECOMPARE_SKEW_RESOLUTION */
+	nsec += elapsed * sync->skew / TIMECOMPARE_SKEW_RESOLUTION;
+	return ns_to_ktime(nsec);
+}
+EXPORT_SYMBOL_GPL(timecompare_transform);
+
+/*
+ * Take up to sync->num_samples readings of the target/source offset (at
+ * most 100000 tries), sort them by duration and average the offsets of the
+ * fastest ~75% into *offset; *source_tstamp gets the midpoint source time.
+ * Returns the number of samples averaged (*offset is untouched if zero).
+ */
+int timecompare_offset(struct timecompare *sync,
+		       s64 *offset,
+		       u64 *source_tstamp)
+{
+	u64 start_source = 0, end_source = 0;
+	struct {
+		s64 offset;
+		s64 duration_target;
+	} buffer[10], sample, *samples = buffer;
+	int counter = 0, i = 0;
+	int used, index;
+	int num_samples = sync->num_samples;
+
+	if (num_samples > ARRAY_SIZE(buffer)) {
+		/* kcalloc() fails cleanly if count * size would overflow */
+		samples = kcalloc(num_samples, sizeof(*samples), GFP_ATOMIC);
+		if (!samples) {
+			samples = buffer;
+			num_samples = ARRAY_SIZE(buffer);
+		}
+	}
+
+	/* run until we have enough valid samples, but do not try forever */
+	while (1) {
+		u64 ts;
+		ktime_t start, end;
+
+		start = sync->target();
+		ts = timecounter_read(sync->source);
+		end = sync->target();
+
+		if (!i)
+			start_source = ts;
+
+		/* ignore negative durations */
+		sample.duration_target = ktime_to_ns(ktime_sub(end, start));
+		if (sample.duration_target >= 0) {
+			/*
+			 * assume symmetric delay to and from source: average
+			 * target time corresponds to measured source time
+			 */
+			sample.offset = (ktime_to_ns(end) +
+					 ktime_to_ns(start)) / 2 - ts;
+
+			/* simple insertion sort based on duration */
+			index = counter - 1;
+			while (index >= 0) {
+				if (samples[index].duration_target <
+				    sample.duration_target)
+					break;
+				samples[index + 1] = samples[index];
+				index--;
+			}
+			samples[index + 1] = sample;
+			counter++;
+		}
+
+		i++;
+		if (counter >= num_samples || i >= 100000) {
+			end_source = ts;
+			break;
+		}
+	}
+
+	*source_tstamp = (end_source + start_source) / 2;
+
+	/* remove outliers by only using 75% of the samples */
+	used = counter * 3 / 4;
+	if (!used)
+		used = counter;
+	if (used) {
+		/* calculate average */
+		s64 off = 0;
+		for (index = 0; index < used; index++)
+			off += samples[index].offset;
+		*offset = div_s64(off, used);
+	}
+
+	if (samples != buffer)
+		kfree(samples);
+
+	return used;
+}
+EXPORT_SYMBOL_GPL(timecompare_offset);
+
+/*
+ * Re-sample the offset and update sync->skew; @source_tstamp is unused here.
+ */
+void __timecompare_update(struct timecompare *sync, u64 source_tstamp)
+{
+	s64 offset;
+	u64 average_time;
+
+	if (!timecompare_offset(sync, &offset, &average_time))
+		return;
+
+	if (!sync->last_update) {
+		sync->last_update = average_time;
+		sync->offset = offset;
+		sync->skew = 0;
+	} else {
+		s64 delta_nsec = average_time - sync->last_update;
+
+		/* avoid division by negative or small deltas */
+		if (delta_nsec >= 10000) {
+			s64 delta_offset_nsec = offset - sync->offset;
+			s64 skew;
+			u64 divisor;
+
+			/*
+			 * skew = delta_offset_nsec *
+			 *        TIMECOMPARE_SKEW_RESOLUTION / delta_nsec
+			 * div_s64() takes an s32 divisor: halve both operands
+			 * until it is below 2^31, since [2^31, 2^32) would be
+			 * truncated to a negative value and flip the sign.
+			 */
+			skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
+			divisor = delta_nsec;
+			while (unlikely(divisor >= ((s64)1) << 31)) {
+				/* no right shift here: skew may be negative */
+				skew = div_s64(skew, 2);
+				divisor >>= 1;
+			}
+			skew = div_s64(skew, divisor);
+
+			/*
+			 * New overall skew: 4/16 of the old value plus 12/16
+			 * of the new one, an arbitrary tradeoff between using
+			 * only the latest measurement and favouring history.
+			 */
+#define TIMECOMPARE_NEW_SKEW_PER_16 12
+			sync->skew =
+				div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
+					sync->skew +
+					TIMECOMPARE_NEW_SKEW_PER_16 * skew, 16);
+			sync->last_update = average_time;
+			sync->offset = offset;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 00000000..86628e75
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
+ * This file is part of the GNU C Library.
+ * Contributed by Paul Eggert (eggert@twinsun.com).
+ *
+ * The GNU C Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The GNU C Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the GNU C Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Converts the calendar time to broken-down time representation
+ * Based on code from glibc-2.6
+ *
+ * 2009-7-14:
+ * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
+ */
+
+#include <linux/time.h>
+#include <linux/module.h>
+
+/*
+ * Gregorian leap-year test: returns 1 when YEAR is divisible by 4,
+ * unless it is a century year that is not divisible by 400.
+ */
+static int __isleap(long year)
+{
+	return !(year % 4) && (year % 100 || !(year % 400));
+}
+
+/* floor(a / b) for positive b: C division truncates toward zero */
+static long math_div(long a, long b)
+{
+	return (a % b < 0) ? a / b - 1 : a / b;
+}
+
+/* Leap years in [y1, y2); the result is negative when y2 < y1 */
+static long leaps_between(long y1, long y2)
+{
+	long a = y1 - 1, b = y2 - 1;
+	long before_y1 = math_div(a, 4) - math_div(a, 100) + math_div(a, 400);
+	long before_y2 = math_div(b, 4) - math_div(b, 100) + math_div(b, 400);
+
+	return before_y2 - before_y1;
+}
+
+/* Days elapsed before each month starts; entry 12 is the year length. */
+static const unsigned short __mon_yday[2][13] = {
+ /* Normal years (row 0). */
+ {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
+ /* Leap years (row 1, i.e. indexed by __isleap()). */
+ {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
+};
+
+#define SECS_PER_HOUR (60 * 60)
+#define SECS_PER_DAY (SECS_PER_HOUR * 24)
+
+/**
+ * time_to_tm - converts the calendar time to local broken-down time
+ *
+ * @totalsecs:	the number of seconds elapsed since 00:00:00 on
+ *		January 1, 1970, Coordinated Universal Time (UTC).
+ * @offset:	offset in seconds that is added to @totalsecs.
+ * @result:	pointer to struct tm variable to receive broken-down time
+ */
+void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+{
+	const unsigned short *month_start;
+	long days, secs, year, mon;
+
+	/* Split into whole days and seconds of day, then apply @offset. */
+	days = totalsecs / SECS_PER_DAY;
+	secs = totalsecs % SECS_PER_DAY + offset;
+	for (; secs < 0; secs += SECS_PER_DAY)
+		--days;
+	for (; secs >= SECS_PER_DAY; secs -= SECS_PER_DAY)
+		++days;
+
+	result->tm_hour = secs / SECS_PER_HOUR;
+	secs %= SECS_PER_HOUR;
+	result->tm_min = secs / 60;
+	result->tm_sec = secs % 60;
+
+	/* January 1, 1970 was a Thursday. */
+	result->tm_wday = (4 + days) % 7;
+	if (result->tm_wday < 0)
+		result->tm_wday += 7;
+
+	/*
+	 * Starting from 1970, repeatedly guess the year assuming 365 days
+	 * per year, then correct DAYS for the leap days that were skipped,
+	 * until DAYS is a valid day number within YEAR.
+	 */
+	year = 1970;
+	while (days < 0 || days >= (__isleap(year) ? 366 : 365)) {
+		long guess = year + math_div(days, 365);
+
+		days -= (guess - year) * 365 + leaps_between(year, guess);
+		year = guess;
+	}
+
+	result->tm_year = year - 1900;
+
+	result->tm_yday = days;
+
+	/* Find the last month whose start is not after DAYS. */
+	month_start = __mon_yday[__isleap(year)];
+	mon = 11;
+	while (days < month_start[mon])
+		mon--;
+	days -= month_start[mon];
+
+	result->tm_mon = mon;
+	result->tm_mday = days + 1;
+}
+EXPORT_SYMBOL(time_to_tm);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
new file mode 100644
index 00000000..2f7b1b42
--- /dev/null
+++ b/kernel/time/timekeeping.c
@@ -0,0 +1,1280 @@
+/*
+ * linux/kernel/time/timekeeping.c
+ *
+ * Kernel timekeeping code and accessor functions
+ *
+ * This code was moved from linux/kernel/timer.c.
+ * Please see that file for copyright and history logs.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/syscore_ops.h>
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/tick.h>
+#include <linux/stop_machine.h>
+
+/* Structure holding internal timekeeping values. */
+struct timekeeper {
+ /* Current clocksource used for timekeeping. */
+ struct clocksource *clock;
+ /* NTP adjusted clock multiplier; starts out as clock->mult. */
+ u32 mult;
+ /* The shift value of the current clocksource. */
+ int shift;
+
+ /* Number of clock cycles in one NTP interval. */
+ cycle_t cycle_interval;
+ /* Clock shifted nanoseconds in one NTP interval (cycles * mult). */
+ u64 xtime_interval;
+ /* Shifted nanoseconds left over when rounding cycle_interval. */
+ s64 xtime_remainder;
+ /* Raw nanoseconds accumulated per NTP interval. */
+ u32 raw_interval;
+
+ /* Clock shifted nanoseconds remainder not stored in xtime.tv_nsec. */
+ u64 xtime_nsec;
+ /* Difference between accumulated time and NTP time in ntp
+ * shifted nanoseconds. */
+ s64 ntp_error;
+ /* Shift conversion between clock shifted nanoseconds and
+ * ntp shifted nanoseconds (NTP_SCALE_SHIFT - clock->shift). */
+ int ntp_error_shift;
+
+ /* The current time */
+ struct timespec xtime;
+ /*
+ * wall_to_monotonic is what we need to add to xtime (or xtime corrected
+ * for sub-jiffy times) to get to monotonic time. Monotonic is pegged
+ * at zero at system boot time, so wall_to_monotonic will be negative,
+ * however, we will ALWAYS keep the tv_nsec part positive so we can use
+ * the usual normalization.
+ *
+ * wall_to_monotonic is moved after resume from suspend for the
+ * monotonic time not to jump. We need to add total_sleep_time to
+ * wall_to_monotonic to get the real boot based time offset.
+ *
+ * - wall_to_monotonic is no longer the boot time, getboottime must be
+ * used instead.
+ */
+ struct timespec wall_to_monotonic;
+ /* Total time spent in suspend. */
+ struct timespec total_sleep_time;
+ /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
+ struct timespec raw_time;
+
+ /* Seqlock for all timekeeper values */
+ seqlock_t lock;
+};
+
+static struct timekeeper timekeeper;
+
+/*
+ * This seqlock protects us from races in SMP while
+ * playing with xtime.
+ */
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+
+
+/* flag for if timekeeping is suspended */
+int __read_mostly timekeeping_suspended;
+
+
+
+/**
+ * timekeeper_setup_internals - Set up internals to use clocksource clock.
+ *
+ * @clock:		Pointer to clocksource.
+ *
+ * Derives the fixed cycle and shifted-nanosecond lengths of one NTP
+ * interval for @clock and resets the accumulated NTP error.
+ *
+ * Unless you're the timekeeping code, you should not be using this!
+ */
+static void timekeeper_setup_internals(struct clocksource *clock)
+{
+	u64 ntp_ns, quot, interval_ns;
+	cycle_t cycles;
+
+	timekeeper.clock = clock;
+	clock->cycle_last = clock->read(clock);
+
+	/* NTP interval in clock-shifted ns */
+	ntp_ns = NTP_INTERVAL_LENGTH;
+	ntp_ns <<= clock->shift;
+
+	/* ns -> cycles with the original mult, rounded to nearest */
+	quot = ntp_ns + clock->mult / 2;
+	do_div(quot, clock->mult);
+
+	/* never let the interval collapse to zero cycles */
+	cycles = (cycle_t) (quot ? quot : 1);
+	timekeeper.cycle_interval = cycles;
+
+	/* cycles -> shifted ns; whatever rounding lost is the remainder */
+	interval_ns = (u64) cycles * clock->mult;
+	timekeeper.xtime_interval = interval_ns;
+	timekeeper.xtime_remainder = ntp_ns - interval_ns;
+	timekeeper.raw_interval = interval_ns >> clock->shift;
+
+	timekeeper.xtime_nsec = 0;
+	timekeeper.shift = clock->shift;
+
+	timekeeper.ntp_error = 0;
+	timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+
+	/*
+	 * The timekeeper keeps its own copy of mult for the currently
+	 * active clocksource. That copy will be adjusted via NTP to
+	 * counteract clock drifting.
+	 */
+	timekeeper.mult = clock->mult;
+}
+
+/* Timekeeper helper functions. */
+static inline s64 timekeeping_get_ns(void)
+{
+	struct clocksource *cs = timekeeper.clock;
+	cycle_t now;
+	cycle_t elapsed;
+
+	/* Sample the counter before looking at cycle_last. */
+	now = cs->read(cs);
+
+	/* Cycles since the last update of cycle_last, masked by cs->mask. */
+	elapsed = (now - cs->cycle_last) & cs->mask;
+
+	/* Scale to nanoseconds with the NTP adjusted mult. */
+	return clocksource_cyc2ns(elapsed, timekeeper.mult,
+				  timekeeper.shift);
+}
+
+static inline s64 timekeeping_get_ns_raw(void)
+{
+	struct clocksource *cs = timekeeper.clock;
+	cycle_t now;
+	cycle_t elapsed;
+
+	/* Sample the counter before looking at cycle_last. */
+	now = cs->read(cs);
+
+	/* Cycles since the last update of cycle_last, masked by cs->mask. */
+	elapsed = (now - cs->cycle_last) & cs->mask;
+
+	/* Scale with the clocksource's own mult/shift: no NTP adjustment. */
+	return clocksource_cyc2ns(elapsed, cs->mult, cs->shift);
+}
+
+/* pass state to update_vsyscall(); must hold write on timekeeper.lock */
+static void timekeeping_update(bool clearntp)
+{
+ if (clearntp) {
+ timekeeper.ntp_error = 0;
+ ntp_clear();
+ }
+ update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
+ timekeeper.clock, timekeeper.mult);
+}
+
+
+/**
+ * timekeeping_forward_now - update clock to the current time
+ *
+ * Accumulates the cycles elapsed since cycle_last into xtime and raw_time
+ * and advances cycle_last. Doing this before a significant clock change
+ * means that change need not account for the pending offset itself.
+ */
+static void timekeeping_forward_now(void)
+{
+	struct clocksource *cs = timekeeper.clock;
+	cycle_t now, elapsed;
+	s64 adjusted_ns, raw_ns;
+
+	/* Consume every cycle since cycle_last and move cycle_last up. */
+	now = cs->read(cs);
+	elapsed = (now - cs->cycle_last) & cs->mask;
+	cs->cycle_last = now;
+
+	/* Wall time advances by the NTP adjusted conversion ... */
+	adjusted_ns = clocksource_cyc2ns(elapsed, timekeeper.mult,
+					 timekeeper.shift);
+	/* ... plus gettimeoffset(), if the arch requires it. */
+	adjusted_ns += arch_gettimeoffset();
+	timespec_add_ns(&timekeeper.xtime, adjusted_ns);
+
+	/* Raw time uses the clocksource's unadjusted mult/shift. */
+	raw_ns = clocksource_cyc2ns(elapsed, cs->mult, cs->shift);
+	timespec_add_ns(&timekeeper.raw_time, raw_ns);
+}
+
+/**
+ * getnstimeofday - read the current wall-clock time
+ * @ts:		timespec that receives xtime plus the time since its update
+ *
+ * Warns if called while timekeeping is suspended.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+	unsigned long seqno;
+	s64 delta_ns;
+
+	WARN_ON(timekeeping_suspended);
+
+	/* Retry until xtime and the delta come from one consistent state. */
+	do {
+		seqno = read_seqbegin(&timekeeper.lock);
+
+		*ts = timekeeper.xtime;
+		delta_ns = timekeeping_get_ns();
+		/* If arch requires, add in gettimeoffset() */
+		delta_ns += arch_gettimeoffset();
+	} while (read_seqretry(&timekeeper.lock, seqno));
+
+	/* Apply the delta outside the loop, on the caller's copy. */
+	timespec_add_ns(ts, delta_ns);
+}
+
+EXPORT_SYMBOL(getnstimeofday);
+
+ktime_t ktime_get(void)
+{
+	unsigned int seqno;
+	s64 mono_sec, mono_nsec;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seqno = read_seqbegin(&timekeeper.lock);
+		/* monotonic time is xtime shifted by wall_to_monotonic */
+		mono_sec = timekeeper.xtime.tv_sec +
+			   timekeeper.wall_to_monotonic.tv_sec;
+		mono_nsec = timekeeper.xtime.tv_nsec +
+			    timekeeper.wall_to_monotonic.tv_nsec;
+		mono_nsec += timekeeping_get_ns();
+		/* If arch requires, add in gettimeoffset() */
+		mono_nsec += arch_gettimeoffset();
+	} while (read_seqretry(&timekeeper.lock, seqno));
+	/*
+	 * Use ktime_set/ktime_add_ns to create a proper ktime on
+	 * 32-bit architectures without CONFIG_KTIME_SCALAR.
+	 */
+	return ktime_add_ns(ktime_set(mono_sec, 0), mono_nsec);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ * @ts:		pointer to timespec variable
+ *
+ * Adds the wall_to_monotonic offset and the time elapsed since the last
+ * xtime update to the realtime clock, and stores the sum in @ts in
+ * normalized timespec format.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+	struct timespec to_mono;
+	unsigned int seqno;
+	s64 delta_ns;
+
+	WARN_ON(timekeeping_suspended);
+
+	/* Snapshot wall time, the monotonic offset and the pending delta. */
+	do {
+		seqno = read_seqbegin(&timekeeper.lock);
+		*ts = timekeeper.xtime;
+		to_mono = timekeeper.wall_to_monotonic;
+		delta_ns = timekeeping_get_ns();
+		/* If arch requires, add in gettimeoffset() */
+		delta_ns += arch_gettimeoffset();
+	} while (read_seqretry(&timekeeper.lock, seqno));
+
+	set_normalized_timespec(ts, ts->tv_sec + to_mono.tv_sec,
+				ts->tv_nsec + to_mono.tv_nsec + delta_ns);
+}
+EXPORT_SYMBOL_GPL(ktime_get_ts);
+
+#ifdef CONFIG_NTP_PPS
+
+/**
+ * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * @ts_raw:	pointer to the timespec to be set to raw monotonic time
+ * @ts_real:	pointer to the timespec to be set to the time of day
+ *
+ * Both timestamps are taken within the same seqlock read section, so
+ * they are read atomically with respect to timekeeper updates; each is
+ * returned in timespec format.
+ */
+void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+{
+	s64 raw_delta, real_delta;
+	unsigned long seqno;
+	u32 arch_ns;
+
+	WARN_ON_ONCE(timekeeping_suspended);
+
+	/* Both clocks are sampled inside one seqlock read section. */
+	do {
+		seqno = read_seqbegin(&timekeeper.lock);
+
+		*ts_raw = timekeeper.raw_time;
+		*ts_real = timekeeper.xtime;
+
+		raw_delta = timekeeping_get_ns_raw();
+		real_delta = timekeeping_get_ns();
+
+		/* If arch requires, add in gettimeoffset() */
+		arch_ns = arch_gettimeoffset();
+		raw_delta += arch_ns;
+		real_delta += arch_ns;
+	} while (read_seqretry(&timekeeper.lock, seqno));
+
+	/* Fold the deltas in after the loop, once the snapshot is stable. */
+	timespec_add_ns(ts_raw, raw_delta);
+	timespec_add_ns(ts_real, real_delta);
+}
+EXPORT_SYMBOL(getnstime_raw_and_real);
+
+#endif /* CONFIG_NTP_PPS */
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv:		timeval that receives getnstimeofday() at usec resolution
+ *
+ * NOTE: Users should be converted to using getnstimeofday()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+	struct timespec ts;
+
+	getnstimeofday(&ts);
+	tv->tv_sec = ts.tv_sec;
+	tv->tv_usec = ts.tv_nsec / NSEC_PER_USEC;	/* truncates to usecs */
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv:		pointer to the timespec variable containing the new time
+ *
+ * Sets wall time, clears NTP state, notifies hrtimers; -EINVAL on bad nsec.
+ */
+int do_settimeofday(const struct timespec *tv)
+{
+	unsigned long irqflags;
+	struct timespec step;
+
+	/* A negative tv_nsec becomes huge as unsigned and is rejected too. */
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irqsave(&timekeeper.lock, irqflags);
+	/* Bring xtime up to date so the step is measured from "now". */
+	timekeeping_forward_now();
+
+	/* Shift wall_to_monotonic by the opposite of the wall time step. */
+	step.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
+	step.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
+	timekeeper.wall_to_monotonic =
+		timespec_sub(timekeeper.wall_to_monotonic, step);
+
+	timekeeper.xtime = *tv;
+	timekeeping_update(true);
+	write_sequnlock_irqrestore(&timekeeper.lock, irqflags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+	return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+
+/**
+ * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @ts:		pointer to the timespec variable containing the offset
+ *
+ * Adds *@ts to the wall time; returns -EINVAL if tv_nsec is out of range.
+ */
+int timekeeping_inject_offset(struct timespec *ts)
+{
+	unsigned long irqflags;
+
+	/* Only the nanosecond part is range checked. */
+	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irqsave(&timekeeper.lock, irqflags);
+	timekeeping_forward_now();
+
+	/* Step wall time by *ts and compensate in wall_to_monotonic. */
+	timekeeper.xtime = timespec_add(timekeeper.xtime, *ts);
+	timekeeper.wall_to_monotonic =
+		timespec_sub(timekeeper.wall_to_monotonic, *ts);
+
+	timekeeping_update(true);
+	write_sequnlock_irqrestore(&timekeeper.lock, irqflags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+
+	return 0;
+}
+EXPORT_SYMBOL(timekeeping_inject_offset);
+
+/**
+ * change_clocksource - Swaps clocksources if the new one can be enabled
+ *
+ * Accumulates the current time interval, then sets up @data as the clock.
+ */
+static int change_clocksource(void *data)
+{
+	struct clocksource *incoming = data;
+	unsigned long irqflags;
+
+	write_seqlock_irqsave(&timekeeper.lock, irqflags);
+
+	/* Account everything the outgoing clocksource has counted so far. */
+	timekeeping_forward_now();
+	/* Switch only if it has no enable hook or the hook succeeds. */
+	if (!incoming->enable || incoming->enable(incoming) == 0) {
+		struct clocksource *outgoing = timekeeper.clock;
+
+		timekeeper_setup_internals(incoming);
+		if (outgoing->disable)
+			outgoing->disable(outgoing);
+	}
+	timekeeping_update(true);
+
+	write_sequnlock_irqrestore(&timekeeper.lock, irqflags);
+	return 0;
+}
+
+/**
+ * timekeeping_notify - Install a new clock source
+ * @clock:		pointer to the clock source
+ *
+ * Called from clocksource.c, with clocksource_mutex held, once a better
+ * clock source is registered. Does nothing if @clock is already in use.
+ */
+void timekeeping_notify(struct clocksource *clock)
+{
+	if (timekeeper.clock != clock) {
+		stop_machine(change_clocksource, clock, NULL);
+		tick_clock_notify();
+	}
+}
+
+/**
+ * ktime_get_real - get the real (wall-) time in ktime_t format
+ *
+ * Returns the getnstimeofday() result converted to ktime_t.
+ */
+ktime_t ktime_get_real(void)
+{
+	struct timespec wall;
+
+	getnstimeofday(&wall);
+	/* same reading, just in ktime_t representation */
+	return timespec_to_ktime(wall);
+}
+EXPORT_SYMBOL_GPL(ktime_get_real);
+
+/**
+ * getrawmonotonic - read the raw monotonic clock
+ * @ts:		timespec that receives raw_time plus the pending raw delta
+ *
+ * The delta is converted with the clocksource's own mult (no NTP scaling).
+ */
+void getrawmonotonic(struct timespec *ts)
+{
+	unsigned long seqno;
+	s64 raw_delta;
+
+	do {
+		seqno = read_seqbegin(&timekeeper.lock);
+		/* pending raw delta first, then the accumulated raw time */
+		raw_delta = timekeeping_get_ns_raw();
+		*ts = timekeeper.raw_time;
+	} while (read_seqretry(&timekeeper.lock, seqno));
+
+	timespec_add_ns(ts, raw_delta);
+}
+EXPORT_SYMBOL(getrawmonotonic);
+
+
+/**
+ * timekeeping_valid_for_hres - non-zero if the clock has VALID_FOR_HRES set
+ */
+int timekeeping_valid_for_hres(void)
+{
+	unsigned long seqno;
+	int hres_flag;
+
+	/* timekeeper.clock may be swapped by a writer, hence the seqlock. */
+	do {
+		seqno = read_seqbegin(&timekeeper.lock);
+		hres_flag = timekeeper.clock->flags &
+			    CLOCK_SOURCE_VALID_FOR_HRES;
+	} while (read_seqretry(&timekeeper.lock, seqno));
+
+	return hres_flag;
+}
+
+/**
+ * timekeeping_max_deferment - Returns max time the clocksource can be deferred
+ */
+u64 timekeeping_max_deferment(void)
+{
+	unsigned long seqno;
+	u64 max_idle;
+
+	/* Read max_idle_ns of whichever clocksource is current. */
+	do {
+		seqno = read_seqbegin(&timekeeper.lock);
+		max_idle = timekeeper.clock->max_idle_ns;
+	} while (read_seqretry(&timekeeper.lock, seqno));
+
+	return max_idle;
+}
+
+/**
+ * read_persistent_clock - Return time from the persistent clock.
+ * @ts:		timespec that receives the time
+ *
+ * Weak default for arches without a battery backed persistent clock
+ * implementation: it reports tv_sec=0 and tv_nsec=0, i.e. "unsupported".
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
+{
+	/* all-zero is what timekeeping_inject_sleeptime() tests for */
+	ts->tv_sec = ts->tv_nsec = 0;
+}
+
+/**
+ * read_boot_clock - Return time of the system start.
+ * @ts:		timespec that receives the time
+ *
+ * Weak default for arches that cannot report the exact time the system
+ * was started: it reports tv_sec=0 and tv_nsec=0, i.e. "unsupported".
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+{
+	/* all-zero makes timekeeping_init() use xtime as the boot time */
+	ts->tv_sec = ts->tv_nsec = 0;
+}
+
+/*
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
+ */
+void __init timekeeping_init(void)
+{
+	struct timespec wall, boot;
+	struct clocksource *cs;
+	unsigned long irqflags;
+
+	read_persistent_clock(&wall);
+	read_boot_clock(&boot);
+
+	seqlock_init(&timekeeper.lock);
+	ntp_init();
+
+	write_seqlock_irqsave(&timekeeper.lock, irqflags);
+	/* Start out on the default clocksource, enabling it if it asks. */
+	cs = clocksource_default_clock();
+	if (cs->enable)
+		cs->enable(cs);
+	timekeeper_setup_internals(cs);
+
+	/* Wall time starts at the persistent clock, raw time at zero. */
+	timekeeper.xtime = wall;
+	timekeeper.raw_time.tv_sec = 0;
+	timekeeper.raw_time.tv_nsec = 0;
+
+	/* Without a boot clock (all zero), treat "now" as the boot time. */
+	if (boot.tv_sec == 0 && boot.tv_nsec == 0)
+		boot = timekeeper.xtime;
+	set_normalized_timespec(&timekeeper.wall_to_monotonic,
+				-boot.tv_sec, -boot.tv_nsec);
+	timekeeper.total_sleep_time.tv_sec = 0;
+	timekeeper.total_sleep_time.tv_nsec = 0;
+	write_sequnlock_irqrestore(&timekeeper.lock, irqflags);
+}
+
+/* time in seconds when suspend began */
+static struct timespec timekeeping_suspend_time;
+
+/**
+ * __timekeeping_inject_sleeptime - Internal function to add sleep interval
+ * @delta:	pointer to a timespec delta value
+ *
+ * Adds @delta to xtime and total_sleep_time and subtracts it from
+ * wall_to_monotonic; an invalid @delta is only warned about.
+ */
+static void __timekeeping_inject_sleeptime(struct timespec *delta)
+{
+	if (timespec_valid(delta)) {
+		timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
+		timekeeper.wall_to_monotonic =
+			timespec_sub(timekeeper.wall_to_monotonic, *delta);
+		timekeeper.total_sleep_time =
+			timespec_add(timekeeper.total_sleep_time, *delta);
+		return;
+	}
+
+	printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
+	       "sleep delta value!\n");
+}
+
+
+/**
+ * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
+ * @delta:	pointer to a timespec delta value
+ *
+ * This hook is for architectures that cannot support read_persistent_clock
+ * because their RTC/persistent clock is only accessible when irqs are enabled.
+ *
+ * This function should only be called by rtc_resume(). It does nothing
+ * when read_persistent_clock() reports a non-zero time.
+ */
+void timekeeping_inject_sleeptime(struct timespec *delta)
+{
+	struct timespec persistent;
+	unsigned long irqflags;
+
+	/* Make sure we don't set the clock twice */
+	read_persistent_clock(&persistent);
+	if (persistent.tv_sec != 0 || persistent.tv_nsec != 0)
+		return;
+
+	write_seqlock_irqsave(&timekeeper.lock, irqflags);
+
+	/* Catch up to "now" before the sleep interval is added on top. */
+	timekeeping_forward_now();
+	__timekeeping_inject_sleeptime(delta);
+
+	timekeeping_update(true);
+
+	write_sequnlock_irqrestore(&timekeeper.lock, irqflags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+}
+
+
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ *
+ * Injects the time the persistent clock advanced since suspend (if any),
+ * re-bases cycle_last, clears the NTP error and the suspended flag, then
+ * resumes clockevents and hrtimers.
+ */
+static void timekeeping_resume(void)
+{
+	struct timespec now, slept;
+	unsigned long irqflags;
+
+	read_persistent_clock(&now);
+
+	clocksource_resume();
+
+	write_seqlock_irqsave(&timekeeper.lock, irqflags);
+	/* Credit the sleep only if the persistent clock moved forward. */
+	if (timespec_compare(&now, &timekeeping_suspend_time) > 0) {
+		slept = timespec_sub(now, timekeeping_suspend_time);
+		__timekeeping_inject_sleeptime(&slept);
+	}
+	/* re-base the last cycle value */
+	timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
+	timekeeper.ntp_error = 0;
+	timekeeping_suspended = 0;
+	write_sequnlock_irqrestore(&timekeeper.lock, irqflags);
+
+	touch_softlockup_watchdog();
+
+	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+
+	/* Resume hrtimers */
+	hrtimers_resume();
+}
+
+static int timekeeping_suspend(void)
+{
+	static struct timespec last_offset;
+	struct timespec offset, drift;
+	unsigned long irqflags;
+
+	read_persistent_clock(&timekeeping_suspend_time);
+
+	write_seqlock_irqsave(&timekeeper.lock, irqflags);
+	timekeeping_forward_now();
+	timekeeping_suspended = 1;
+
+	/*
+	 * To avoid drift caused by repeated suspend/resumes,
+	 * which each can add ~1 second drift error,
+	 * try to compensate so the difference in system time
+	 * and persistent_clock time stays close to constant.
+	 */
+	offset = timespec_sub(timekeeper.xtime, timekeeping_suspend_time);
+	drift = timespec_sub(offset, last_offset);
+	if (abs(drift.tv_sec) < 2) {
+		/* Small change: fold it into the recorded suspend time. */
+		timekeeping_suspend_time =
+			timespec_add(timekeeping_suspend_time, drift);
+	} else {
+		/*
+		 * If the change is too large, assume a time correction
+		 * has occurred and just remember the current offset.
+		 */
+		last_offset = offset;
+	}
+	write_sequnlock_irqrestore(&timekeeper.lock, irqflags);
+
+	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+	clocksource_suspend();
+
+	return 0;
+}
+
+/* syscore resume/suspend hooks for timekeeping */
+static struct syscore_ops timekeeping_syscore_ops = {
+ .resume = timekeeping_resume,
+ .suspend = timekeeping_suspend,
+};
+/* Registers the hooks above; invoked through device_initcall() below. */
+static int __init timekeeping_init_ops(void)
+{
+ register_syscore_ops(&timekeeping_syscore_ops);
+ return 0;
+}
+
+device_initcall(timekeeping_init_ops);
+
+/*
+ * For larger errors, look ahead even further to compensate for late or lost
+ * adjustments; returns +/-(1 << adj) and scales *interval, *offset to match.
+ */
+static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
+ s64 *offset)
+{
+ s64 tick_error, i;
+ u32 look_ahead, adj;
+ s32 error2, mult;
+
+ /*
+ * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems
+ * with losing too many ticks, otherwise we would overadjust and
+ * produce an even larger error. The smaller the adjustment the
+ * faster we try to adjust for it, as lost ticks can do less harm
+ * here. This is tuned so that an error of about 1 msec is adjusted
+ * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
+ */
+ error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = abs(error2);
+ for (look_ahead = 0; error2 > 0; look_ahead++)
+ error2 >>= 2;
+
+ /*
+ * Now calculate the error in (1 << look_ahead) ticks, but first
+ * remove the single look ahead already included in the error.
+ */
+ tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1);
+ tick_error -= timekeeper.xtime_interval >> 1;
+ error = ((error - tick_error) >> look_ahead) + tick_error;
+
+ /* Finally calculate the adjustment shift value (adj) and its sign. */
+ i = *interval;
+ mult = 1;
+ if (error < 0) { /* work on the magnitude; mult carries the sign */
+ error = -error;
+ *interval = -*interval;
+ *offset = -*offset;
+ mult = -1;
+ }
+ for (adj = 0; error > i; adj++)
+ error >>= 1;
+
+ *interval <<= adj;
+ *offset <<= adj;
+ return mult << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ *
+ * @offset: the non-accumulated cycles (see the derivation below);
+ * update_wall_time() passes what is left after its accumulation loop.
+ *
+ * Updates timekeeper.mult, xtime_interval, xtime_nsec and ntp_error,
+ * or returns without touching them when no adjustment is needed.
+ */
+static void timekeeping_adjust(s64 offset)
+{
+ s64 error, interval = timekeeper.cycle_interval;
+ int adj;
+
+ /*
+ * The point of this is to check if the error is greater than half
+ * an interval.
+ *
+ * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
+ *
+ * Note we subtract one in the shift, so that error is really error*2.
+ * This "saves" dividing(shifting) interval twice, but keeps the
+ * (error > interval) comparison as still measuring if error is
+ * larger than half an interval.
+ *
+ * Note: It does not "save" on aggravation when reading the code.
+ */
+ error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
+ if (error > interval) {
+ /*
+ * We now divide error by 4(via shift), which checks if
+ * the error is greater than twice the interval.
+ * If it is greater, we need a bigadjust, if its smaller,
+ * we can adjust by 1.
+ */
+ error >>= 2;
+ /*
+ * XXX - In update_wall_time, we round up to the next
+ * nanosecond, and store the amount rounded up into
+ * the error. This causes the likely below to be unlikely.
+ *
+ * The proper fix is to avoid rounding up by using
+ * the high precision timekeeper.xtime_nsec instead of
+ * xtime.tv_nsec everywhere. Fixing this will take some
+ * time.
+ */
+ if (likely(error <= interval))
+ adj = 1;
+ else
+ adj = timekeeping_bigadjust(error, &interval, &offset);
+ } else if (error < -interval) {
+ /* See comment above, this is just switched for the negative */
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else
+ adj = timekeeping_bigadjust(error, &interval, &offset);
+ } else /* No adjustment needed */
+ return;
+
+ /*
+ * Warn (once) when the adjusted multiplier would exceed the
+ * clocksource's own mult plus its maxadj. This only reports the
+ * condition; the adjustment below is applied regardless. A maxadj
+ * of zero disables the check.
+ */
+ if (unlikely(timekeeper.clock->maxadj &&
+ (timekeeper.mult + adj >
+ timekeeper.clock->mult + timekeeper.clock->maxadj))) {
+ printk_once(KERN_WARNING
+ "Adjusting %s more than 11%% (%ld vs %ld)\n",
+ timekeeper.clock->name, (long)timekeeper.mult + adj,
+ (long)timekeeper.clock->mult +
+ timekeeper.clock->maxadj);
+ }
+ /*
+ * So the following can be confusing.
+ *
+ * To keep things simple, lets assume adj == 1 for now.
+ *
+ * When adj != 1, remember that the interval and offset values
+ * have been appropriately scaled so the math is the same.
+ *
+ * The basic idea here is that we're increasing the multiplier
+ * by one, this causes the xtime_interval to be incremented by
+ * one cycle_interval. This is because:
+ * xtime_interval = cycle_interval * mult
+ * So if mult is being incremented by one:
+ * xtime_interval = cycle_interval * (mult + 1)
+ * Its the same as:
+ * xtime_interval = (cycle_interval * mult) + cycle_interval
+ * Which can be shortened to:
+ * xtime_interval += cycle_interval
+ *
+ * So offset stores the non-accumulated cycles. Thus the current
+ * time (in shifted nanoseconds) is:
+ * now = (offset * adj) + xtime_nsec
+ * Now, even though we're adjusting the clock frequency, we have
+ * to keep time consistent. In other words, we can't jump back
+ * in time, and we also want to avoid jumping forward in time.
+ *
+ * So given the same offset value, we need the time to be the same
+ * both before and after the freq adjustment.
+ * now = (offset * adj_1) + xtime_nsec_1
+ * now = (offset * adj_2) + xtime_nsec_2
+ * So:
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * adj_2) + xtime_nsec_2
+ * And we know:
+ * adj_2 = adj_1 + 1
+ * So:
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * (adj_1+1)) + xtime_nsec_2
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * adj_1) + offset + xtime_nsec_2
+ * Canceling the sides:
+ * xtime_nsec_1 = offset + xtime_nsec_2
+ * Which gives us:
+ * xtime_nsec_2 = xtime_nsec_1 - offset
+ * Which simplifies to:
+ * xtime_nsec -= offset
+ *
+ * XXX - TODO: Doc ntp_error calculation.
+ */
+ timekeeper.mult += adj;
+ timekeeper.xtime_interval += interval;
+ timekeeper.xtime_nsec -= offset;
+ timekeeper.ntp_error -= (interval - offset) <<
+ timekeeper.ntp_error_shift;
+}
+
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ * @offset: cycles that have not been accumulated yet
+ * @shift: the chunk accumulated is (cycle_interval << shift) cycles
+ *
+ * This function accumulates a shifted interval of cycles into
+ * a shifted interval of nanoseconds. Allows for O(log) accumulation
+ * loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+ u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+ u64 raw_nsecs;
+
+ /* If the offset is smaller than a shifted interval, do nothing */
+ if (offset < timekeeper.cycle_interval<<shift)
+ return offset;
+
+ /* Accumulate one shifted interval */
+ offset -= timekeeper.cycle_interval << shift;
+ timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+
+ timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+ /*
+ * Carry every full second out of the shifted nanosecond counter.
+ * The value returned by second_overflow() is added to xtime.tv_sec
+ * and subtracted from wall_to_monotonic.tv_sec.
+ */
+ while (timekeeper.xtime_nsec >= nsecps) {
+ int leap;
+ timekeeper.xtime_nsec -= nsecps;
+ timekeeper.xtime.tv_sec++;
+ leap = second_overflow(timekeeper.xtime.tv_sec);
+ timekeeper.xtime.tv_sec += leap;
+ timekeeper.wall_to_monotonic.tv_sec -= leap;
+ }
+
+ /* Accumulate raw time */
+ raw_nsecs = timekeeper.raw_interval << shift;
+ raw_nsecs += timekeeper.raw_time.tv_nsec;
+ if (raw_nsecs >= NSEC_PER_SEC) {
+ u64 raw_secs = raw_nsecs;
+ /* do_div() leaves the quotient in raw_secs, returns the remainder */
+ raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
+ timekeeper.raw_time.tv_sec += raw_secs;
+ }
+ timekeeper.raw_time.tv_nsec = raw_nsecs;
+
+ /* Accumulate error between NTP and clock interval */
+ timekeeper.ntp_error += ntp_tick_length() << shift;
+ timekeeper.ntp_error -=
+ (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
+ (timekeeper.ntp_error_shift + shift);
+
+ return offset;
+}
+
+
+/**
+ * update_wall_time - Uses the current clocksource to increment the wall time
+ *
+ * Runs with timekeeper.lock held for writing (irqsave). Does nothing
+ * while timekeeping_suspended is set.
+ */
+static void update_wall_time(void)
+{
+ struct clocksource *clock;
+ cycle_t offset;
+ int shift = 0, maxshift;
+ unsigned long flags;
+
+ write_seqlock_irqsave(&timekeeper.lock, flags);
+
+ /* Make sure we're fully resumed: */
+ if (unlikely(timekeeping_suspended))
+ goto out;
+
+ clock = timekeeper.clock;
+
+ /* Cycles elapsed since the last accumulation point. */
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+ offset = timekeeper.cycle_interval;
+#else
+ offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
+#endif
+ /* Start from the stored nanoseconds, scaled up by the clock shift. */
+ timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
+ timekeeper.shift;
+
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
+ */
+ shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
+ shift = min(shift, maxshift);
+ while (offset >= timekeeper.cycle_interval) {
+ offset = logarithmic_accumulation(offset, shift);
+ /* Remaining offset no longer covers this chunk: halve it. */
+ if(offset < timekeeper.cycle_interval<<shift)
+ shift--;
+ }
+
+ /* correct the clock when NTP error is too big */
+ timekeeping_adjust(offset);
+
+ /*
+ * Since in the loop above, we accumulate any amount of time
+ * in xtime_nsec over a second into xtime.tv_sec, its possible for
+ * xtime_nsec to be fairly small after the loop. Further, if we're
+ * slightly speeding the clocksource up in timekeeping_adjust(),
+ * its possible the required corrective factor to xtime_nsec could
+ * cause it to underflow.
+ *
+ * Now, we cannot simply roll the accumulated second back, since
+ * the NTP subsystem has been notified via second_overflow. So
+ * instead we push xtime_nsec forward by the amount we underflowed,
+ * and add that amount into the error.
+ *
+ * We'll correct this error next time through this function, when
+ * xtime_nsec is not as small.
+ */
+ if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
+ s64 neg = -(s64)timekeeper.xtime_nsec;
+ timekeeper.xtime_nsec = 0;
+ timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
+ }
+
+
+ /*
+ * Store full nanoseconds into xtime after rounding it up and
+ * add the remainder to the error difference.
+ */
+ timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >>
+ timekeeper.shift) + 1;
+ timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec <<
+ timekeeper.shift;
+ timekeeper.ntp_error += timekeeper.xtime_nsec <<
+ timekeeper.ntp_error_shift;
+
+ /*
+ * Finally, make sure that after the rounding
+ * xtime.tv_nsec isn't larger than NSEC_PER_SEC
+ */
+ if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
+ int leap;
+ timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
+ timekeeper.xtime.tv_sec++;
+ leap = second_overflow(timekeeper.xtime.tv_sec);
+ timekeeper.xtime.tv_sec += leap;
+ timekeeper.wall_to_monotonic.tv_sec -= leap;
+ }
+
+ timekeeping_update(false);
+
+out:
+ write_sequnlock_irqrestore(&timekeeper.lock, flags);
+
+}
+
+/**
+ * getboottime - Return the real time of system boot.
+ * @ts: pointer to the timespec to be set
+ *
+ * Stores the wall-clock time of boot in @ts.
+ *
+ * The value is the negated sum of the wall_to_monotonic offset and the
+ * total suspend time, so calls to settimeofday affect the result (however
+ * wrong the real time clock was at boot, the time reported here follows
+ * the corrected wall clock).
+ */
+void getboottime(struct timespec *ts)
+{
+	struct timespec offset = timekeeper.wall_to_monotonic;
+
+	offset.tv_sec += timekeeper.total_sleep_time.tv_sec;
+	offset.tv_nsec += timekeeper.total_sleep_time.tv_nsec;
+
+	/* Boot time is the negated offset; normalization fixes up tv_nsec. */
+	set_normalized_timespec(ts, -offset.tv_sec, -offset.tv_nsec);
+}
+EXPORT_SYMBOL_GPL(getboottime);
+
+
+/**
+ * get_monotonic_boottime - Returns monotonic time since boot
+ * @ts: pointer to the timespec to be set
+ *
+ * Stores the monotonic time since boot in @ts.
+ *
+ * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
+ * includes the time spent in suspend.
+ */
+void get_monotonic_boottime(struct timespec *ts)
+{
+	struct timespec wtm, slept;
+	unsigned int seq;
+	s64 delta_ns, total_ns;
+
+	WARN_ON(timekeeping_suspended);
+
+	/* Take a consistent snapshot; retry if a writer interfered. */
+	do {
+		seq = read_seqbegin(&timekeeper.lock);
+		*ts = timekeeper.xtime;
+		wtm = timekeeper.wall_to_monotonic;
+		slept = timekeeper.total_sleep_time;
+		delta_ns = timekeeping_get_ns();
+	} while (read_seqretry(&timekeeper.lock, seq));
+
+	total_ns = (s64)ts->tv_nsec + wtm.tv_nsec + slept.tv_nsec + delta_ns;
+	set_normalized_timespec(ts, ts->tv_sec + wtm.tv_sec + slept.tv_sec,
+				total_ns);
+}
+EXPORT_SYMBOL_GPL(get_monotonic_boottime);
+
+/**
+ * ktime_get_boottime - Returns monotonic time since boot in a ktime
+ *
+ * Same value as get_monotonic_boottime(), converted to ktime_t.
+ *
+ * This is similar to CLOCK_MONOTONIC/ktime_get, but also
+ * includes the time spent in suspend.
+ */
+ktime_t ktime_get_boottime(void)
+{
+	struct timespec boot_ts;
+
+	get_monotonic_boottime(&boot_ts);
+
+	return timespec_to_ktime(boot_ts);
+}
+EXPORT_SYMBOL_GPL(ktime_get_boottime);
+
+/**
+ * monotonic_to_bootbased - Convert the monotonic time to boot based.
+ * @ts: pointer to the timespec to be converted
+ *
+ * Adds the total time spent in suspend to @ts, in place.
+ */
+void monotonic_to_bootbased(struct timespec *ts)
+{
+	struct timespec boot_based;
+
+	boot_based = timespec_add(*ts, timekeeper.total_sleep_time);
+	*ts = boot_based;
+}
+EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
+
+/* Return the seconds part of the current xtime, read without the seqlock. */
+unsigned long get_seconds(void)
+{
+	unsigned long secs = timekeeper.xtime.tv_sec;
+
+	return secs;
+}
+EXPORT_SYMBOL(get_seconds);
+
+/*
+ * Return a copy of xtime without taking timekeeper.lock; compare
+ * current_kernel_time(), which retries under the seqlock.
+ */
+struct timespec __current_kernel_time(void)
+{
+	struct timespec snapshot = timekeeper.xtime;
+
+	return snapshot;
+}
+
+/* Return a consistent copy of xtime, read under the timekeeper seqlock. */
+struct timespec current_kernel_time(void)
+{
+	struct timespec snapshot;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&timekeeper.lock);
+		snapshot = timekeeper.xtime;
+	} while (read_seqretry(&timekeeper.lock, seq));
+
+	return snapshot;
+}
+EXPORT_SYMBOL(current_kernel_time);
+
+/*
+ * Return xtime plus wall_to_monotonic, normalized. Both values are
+ * sampled together under the timekeeper seqlock; the clocksource is
+ * not read.
+ */
+struct timespec get_monotonic_coarse(void)
+{
+	struct timespec coarse, wtm;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&timekeeper.lock);
+		coarse = timekeeper.xtime;
+		wtm = timekeeper.wall_to_monotonic;
+	} while (read_seqretry(&timekeeper.lock, seq));
+
+	set_normalized_timespec(&coarse, coarse.tv_sec + wtm.tv_sec,
+				coarse.tv_nsec + wtm.tv_nsec);
+
+	return coarse;
+}
+
+/*
+ * Advance jiffies_64 by @ticks, then update the wall time and the
+ * global load average.
+ *
+ * The 64-bit jiffies value is not atomic - you MUST NOT read it
+ * without sampling the sequence number in xtime_lock.
+ * jiffies is defined in the linker script...
+ */
+void do_timer(unsigned long ticks)
+{
+ jiffies_64 += ticks;
+ update_wall_time();
+ calc_global_load(ticks);
+}
+
+/**
+ * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
+ * and sleep offsets.
+ * @xtim: pointer to timespec to be set with xtime
+ * @wtom: pointer to timespec to be set with wall_to_monotonic
+ * @sleep: pointer to timespec to be set with time in suspend
+ *
+ * All three values are copied from the same seqlock read section.
+ */
+void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
+				struct timespec *wtom, struct timespec *sleep)
+{
+	unsigned long seq;
+
+	for (;;) {
+		seq = read_seqbegin(&timekeeper.lock);
+
+		*xtim = timekeeper.xtime;
+		*wtom = timekeeper.wall_to_monotonic;
+		*sleep = timekeeper.total_sleep_time;
+
+		/* Done once no writer raced with the copies above. */
+		if (!read_seqretry(&timekeeper.lock, seq))
+			break;
+	}
+}
+
+/**
+ * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
+ *
+ * Reads wall_to_monotonic under the timekeeper seqlock and converts it.
+ */
+ktime_t ktime_get_monotonic_offset(void)
+{
+	struct timespec offset;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&timekeeper.lock);
+		offset = timekeeper.wall_to_monotonic;
+	} while (read_seqretry(&timekeeper.lock, seq));
+
+	return timespec_to_ktime(offset);
+}
+EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
+
+
+/**
+ * xtime_update() - advances the timekeeping infrastructure
+ * @ticks: number of ticks, that have elapsed since the last call.
+ *
+ * Calls do_timer() with xtime_lock held for writing.
+ *
+ * Must be called with interrupts disabled.
+ */
+void xtime_update(unsigned long ticks)
+{
+ write_seqlock(&xtime_lock);
+ do_timer(ticks);
+ write_sequnlock(&xtime_lock);
+}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 00000000..32584555
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,301 @@
+/*
+ * kernel/time/timer_list.c
+ *
+ * List pending timers
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/tick.h>
+
+#include <asm/uaccess.h>
+
+/* Signature of a printing callback taking a seq_file and a class array. */
+typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
+
+/* Per-CPU hrtimer bases; declared here, walked by print_cpu(). */
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
+/*
+ * This allows printing both to /proc/timer_list and
+ * to the console (on SysRq-Q):
+ *
+ * A non-NULL @m routes the output to seq_printf(); a NULL @m routes
+ * it to printk().
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ printk(x); \
+ } while (0)
+
+/*
+ * Print the symbol name for address @sym, or the (restricted) pointer
+ * value in angle brackets when no symbol can be looked up.
+ */
+static void print_name_offset(struct seq_file *m, void *sym)
+{
+	char symname[KSYM_NAME_LEN];
+
+	if (lookup_symbol_name((unsigned long)sym, symname) >= 0) {
+		SEQ_printf(m, "%s", symname);
+		return;
+	}
+
+	SEQ_printf(m, "<%pK>", sym);
+}
+
+/*
+ * Print one hrtimer.
+ * @taddr: address of the live timer, used only for symbol lookup
+ * @timer: the timer contents to print (the caller passes a private copy)
+ * @idx:   position of the timer in its queue
+ * @now:   current time in nanoseconds, used for the "in ..." deltas
+ */
+static void
+print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
+	    int idx, u64 now)
+{
+	unsigned long long soft_ns, hard_ns;
+#ifdef CONFIG_TIMER_STATS
+	char comm[TASK_COMM_LEN + 1];
+#endif
+
+	SEQ_printf(m, " #%d: ", idx);
+	print_name_offset(m, taddr);
+	SEQ_printf(m, ", ");
+	print_name_offset(m, timer->function);
+	SEQ_printf(m, ", S:%02lx", timer->state);
+#ifdef CONFIG_TIMER_STATS
+	SEQ_printf(m, ", ");
+	print_name_offset(m, timer->start_site);
+	/* start_comm may lack a terminator; copy and terminate it here. */
+	memcpy(comm, timer->start_comm, TASK_COMM_LEN);
+	comm[TASK_COMM_LEN] = 0;
+	SEQ_printf(m, ", %s/%d", comm, timer->start_pid);
+#endif
+	SEQ_printf(m, "\n");
+
+	soft_ns = ktime_to_ns(hrtimer_get_softexpires(timer));
+	hard_ns = ktime_to_ns(hrtimer_get_expires(timer));
+	SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
+		soft_ns, hard_ns,
+		(long long)(soft_ns - now),
+		(long long)(hard_ns - now));
+}
+
+/*
+ * Print every timer queued on @base.
+ *
+ * The base lock cannot be held while printing, so each pass re-takes
+ * the lock, walks to the next not-yet-printed position, copies that
+ * timer, drops the lock and prints the copy. This is O(N*N) by design.
+ */
+static void
+print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
+		    u64 now)
+{
+	struct hrtimer *timer, snapshot;
+	struct timerqueue_node *node;
+	unsigned long printed = 0, pos;
+	unsigned long flags;
+
+	for (;;) {
+		pos = 0;
+		raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
+
+		/* Skip the timers that were printed in earlier passes. */
+		node = timerqueue_getnext(&base->active);
+		while (node && pos < printed) {
+			node = timerqueue_iterate_next(node);
+			pos++;
+		}
+
+		if (!node)
+			break;
+
+		timer = container_of(node, struct hrtimer, node);
+		snapshot = *timer;
+		raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+
+		print_timer(m, timer, &snapshot, pos, now);
+		printed++;
+	}
+
+	/* The loop exits with the lock still held. */
+	raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+}
+
+/*
+ * Print one hrtimer clock base: its address, index, resolution and
+ * get_time callback (plus the offset with CONFIG_HIGH_RES_TIMERS),
+ * followed by all of its active timers.
+ */
+static void
+print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
+{
+ SEQ_printf(m, " .base: %pK\n", base);
+ SEQ_printf(m, " .index: %d\n",
+ base->index);
+ SEQ_printf(m, " .resolution: %Lu nsecs\n",
+ (unsigned long long)ktime_to_ns(base->resolution));
+ SEQ_printf(m, " .get_time: ");
+ print_name_offset(m, base->get_time);
+ SEQ_printf(m, "\n");
+#ifdef CONFIG_HIGH_RES_TIMERS
+ SEQ_printf(m, " .offset: %Lu nsecs\n",
+ (unsigned long long) ktime_to_ns(base->offset));
+#endif
+ SEQ_printf(m, "active timers:\n");
+ print_active_timers(m, base, now);
+}
+
+/*
+ * Print the hrtimer state of one CPU: every clock base, then (config
+ * dependent) the high resolution counters of the cpu base and the
+ * fields of the CPU's tick_sched structure.
+ */
+static void print_cpu(struct seq_file *m, int cpu, u64 now)
+{
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+ int i;
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cpu: %d\n", cpu);
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ SEQ_printf(m, " clock %d:\n", i);
+ print_base(m, cpu_base->clock_base + i, now);
+ }
+ /*
+ * P(x) prints cpu_base->x as an unsigned integer; P_ns(x) converts
+ * a ktime_t field to nanoseconds first.
+ */
+#define P(x) \
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(cpu_base->x))
+#define P_ns(x) \
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(cpu_base->x)))
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+ P_ns(expires_next);
+ P(hres_active);
+ P(nr_events);
+ P(nr_retries);
+ P(nr_hangs);
+ P_ns(max_hang_time);
+#endif
+#undef P
+#undef P_ns
+
+ /* Same helpers again, now reading from the tick_sched pointer ts. */
+#ifdef CONFIG_TICK_ONESHOT
+# define P(x) \
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(ts->x))
+# define P_ns(x) \
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(ts->x)))
+ {
+ struct tick_sched *ts = tick_get_tick_sched(cpu);
+ P(nohz_mode);
+ P_ns(idle_tick);
+ P(tick_stopped);
+ P(idle_jiffies);
+ P(idle_calls);
+ P(idle_sleeps);
+ P_ns(idle_entrytime);
+ P_ns(idle_waketime);
+ P_ns(idle_exittime);
+ P_ns(idle_sleeptime);
+ P_ns(iowait_sleeptime);
+ P(last_jiffies);
+ P(next_jiffies);
+ P_ns(idle_expires);
+ SEQ_printf(m, "jiffies: %Lu\n",
+ (unsigned long long)jiffies);
+ }
+#endif
+
+#undef P
+#undef P_ns
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+/*
+ * Print one tick device and the clock event device attached to it.
+ * @td:  the tick device to describe
+ * @cpu: owning CPU, or a negative value for the broadcast device
+ *
+ * Prints "<NULL>" and returns early when no clock event device is
+ * attached.
+ */
+static void
+print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
+{
+	struct clock_event_device *dev = td->evtdev;
+
+	SEQ_printf(m, "\n");
+	SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
+	if (cpu < 0)
+		SEQ_printf(m, "Broadcast device\n");
+	else
+		SEQ_printf(m, "Per CPU device: %d\n", cpu);
+
+	SEQ_printf(m, "Clock Event Device: ");
+	if (!dev) {
+		SEQ_printf(m, "<NULL>\n");
+		return;
+	}
+	SEQ_printf(m, "%s\n", dev->name);
+	SEQ_printf(m, " max_delta_ns: %llu\n",
+		   (unsigned long long) dev->max_delta_ns);
+	SEQ_printf(m, " min_delta_ns: %llu\n",
+		   (unsigned long long) dev->min_delta_ns);
+	SEQ_printf(m, " mult: %u\n", dev->mult);
+	SEQ_printf(m, " shift: %u\n", dev->shift);
+	SEQ_printf(m, " mode: %d\n", dev->mode);
+	/* %Ld is a signed conversion, so the argument is cast to signed. */
+	SEQ_printf(m, " next_event: %Ld nsecs\n",
+		   (long long) ktime_to_ns(dev->next_event));
+
+	SEQ_printf(m, " set_next_event: ");
+	print_name_offset(m, dev->set_next_event);
+	SEQ_printf(m, "\n");
+
+	SEQ_printf(m, " set_mode: ");
+	print_name_offset(m, dev->set_mode);
+	SEQ_printf(m, "\n");
+
+	SEQ_printf(m, " event_handler: ");
+	print_name_offset(m, dev->event_handler);
+	SEQ_printf(m, "\n");
+	SEQ_printf(m, " retries: %lu\n", dev->retries);
+}
+
+/*
+ * Print the broadcast tick device and broadcast masks (when broadcast
+ * support is configured), then the tick device of every online CPU.
+ */
+static void timer_list_show_tickdevices(struct seq_file *m)
+{
+ int cpu;
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+ print_tickdevice(m, tick_get_broadcast_device(), -1);
+ /* Only the first word of each cpumask is printed. */
+ SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
+ cpumask_bits(tick_get_broadcast_mask())[0]);
+#ifdef CONFIG_TICK_ONESHOT
+ SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
+ cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+#endif
+ SEQ_printf(m, "\n");
+#endif
+ for_each_online_cpu(cpu)
+ print_tickdevice(m, tick_get_device(cpu), cpu);
+ SEQ_printf(m, "\n");
+}
+#else
+/* Without CONFIG_GENERIC_CLOCKEVENTS there are no tick devices to print. */
+static void timer_list_show_tickdevices(struct seq_file *m) { }
+#endif
+
+/*
+ * Produce the complete timer list: a header with the current time,
+ * the hrtimer state of every online CPU, then the tick devices.
+ *
+ * @m may be NULL, in which case SEQ_printf() prints via printk().
+ * @v is unused. Always returns 0.
+ */
+static int timer_list_show(struct seq_file *m, void *v)
+{
+	u64 now = ktime_to_ns(ktime_get());
+	int cpu;
+
+	SEQ_printf(m, "Timer List Version: v0.6\n");
+	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
+	/* %Ld is a signed conversion, so the argument is cast to signed. */
+	SEQ_printf(m, "now at %Ld nsecs\n", (long long)now);
+
+	for_each_online_cpu(cpu)
+		print_cpu(m, cpu, now);
+
+	SEQ_printf(m, "\n");
+	timer_list_show_tickdevices(m);
+
+	return 0;
+}
+
+/*
+ * Dump the timer list with a NULL seq_file, which makes SEQ_printf()
+ * print through printk() instead of a seq_file.
+ */
+void sysrq_timer_list_show(void)
+{
+ timer_list_show(NULL, NULL);
+}
+
+/* open() handler: bind timer_list_show() as a single-shot seq_file. */
+static int timer_list_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timer_list_show, NULL);
+}
+
+/* File operations for /proc/timer_list; no write handler is provided. */
+static const struct file_operations timer_list_fops = {
+ .open = timer_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Create /proc/timer_list (mode 0444).
+ * Returns 0 on success, -ENOMEM when the proc entry cannot be created.
+ */
+static int __init init_timer_list_procfs(void)
+{
+	if (!proc_create("timer_list", 0444, NULL, &timer_list_fops))
+		return -ENOMEM;
+
+	return 0;
+}
+__initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 00000000..0b537f27
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,425 @@
+/*
+ * kernel/time/timer_stats.c
+ *
+ * Collect timer usage statistics.
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * timer_stats is based on timer_top, a similar functionality which was part of
+ * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
+ * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
+ * on dynamic allocation of the statistics entries and linear search based
+ * lookup combined with a global lock, rather than the static array, hash
+ * and per-CPU locking which is used by timer_stats. It was written for the
+ * pre hrtimer kernel code and therefore did not take hrtimers into account.
+ * Nevertheless it provided the base for the timer_stats implementation and
+ * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
+ * for this effort.
+ *
+ * timer_top.c is
+ * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
+ * Written by Daniel Petrini <d.pensator@gmail.com>
+ * timer_top.c was released under the GNU General Public License version 2
+ *
+ * We export the addresses and counting of timer functions being called,
+ * the pid and cmdline from the owner process if applicable.
+ *
+ * Start/stop data collection:
+ * # echo [1|0] >/proc/timer_stats
+ *
+ * Display the information collected so far:
+ * # cat /proc/timer_stats
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * This is our basic unit of interest: a timer expiry event identified
+ * by the timer, its start/expire functions and the PID of the task that
+ * started the timer. We count the number of times an event happens:
+ */
+struct entry {
+ /*
+ * Hash list:
+ * (next entry in the same hash bucket, NULL at the end)
+ */
+ struct entry *next;
+
+ /*
+ * Hash keys:
+ * (all four are hashed by __tstat_hashfn() and compared by
+ * match_entries())
+ */
+ void *timer;
+ void *start_func;
+ void *expire_func;
+ pid_t pid;
+
+ /*
+ * Number of timeout events:
+ */
+ unsigned long count;
+ /* Flags passed to timer_stats_update_stats(); not part of the key. */
+ unsigned int timer_flag;
+
+ /*
+ * We save the task's command name (comm) to preserve
+ * this information past task exit. One byte larger than
+ * TASK_COMM_LEN.
+ */
+ char comm[TASK_COMM_LEN + 1];
+
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Spinlock protecting the tables - not taken during lookup:
+ * (tstat_lookup() takes it only on its slow path, when inserting)
+ */
+static DEFINE_RAW_SPINLOCK(table_lock);
+
+/*
+ * Per-CPU lookup locks for fast hash lookup:
+ */
+static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
+
+/*
+ * Mutex to serialize state changes with show-stats activities:
+ */
+static DEFINE_MUTEX(show_mutex);
+
+/*
+ * Collection status, active/inactive:
+ */
+int __read_mostly timer_stats_active;
+
+/*
+ * Beginning/end timestamps of measurement:
+ */
+static ktime_t time_start, time_stop;
+
+/*
+ * tstat entry structs only get allocated while collection is
+ * active and never freed during that time - this simplifies
+ * things quite a bit.
+ *
+ * They get freed when a new collection period is started.
+ *
+ * The pool is a static array of MAX_ENTRIES (1 << 10) entries that
+ * alloc_entry() hands out in order.
+ */
+#define MAX_ENTRIES_BITS 10
+#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
+
+static unsigned long nr_entries;
+static struct entry entries[MAX_ENTRIES];
+
+/* Incremented by timer_stats_update_stats() when a lookup yields no entry. */
+static atomic_t overflow_count;
+
+/*
+ * The entries are in a hash-table, for fast lookup:
+ * (the table has half as many buckets as there are entries)
+ */
+#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
+#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
+#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
+
+/* XOR of the four key fields, masked down to a bucket index. */
+#define __tstat_hashfn(entry) \
+ (((unsigned long)(entry)->timer ^ \
+ (unsigned long)(entry)->start_func ^ \
+ (unsigned long)(entry)->expire_func ^ \
+ (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
+
+/* Address of the bucket head pointer for @entry. */
+#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
+
+static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
+
+/*
+ * Discard all collected statistics: empty the entry pool, clear the
+ * hash buckets and zero the overflow counter. tstats_write() calls
+ * this before it enables a new collection period.
+ */
+static void reset_entries(void)
+{
+ nr_entries = 0;
+ memset(entries, 0, sizeof(entries));
+ memset(tstat_hash_table, 0, sizeof(tstat_hash_table));
+ atomic_set(&overflow_count, 0);
+}
+
+/*
+ * Hand out the next unused slot of the static entries[] pool.
+ * Returns NULL once all MAX_ENTRIES slots are in use.
+ */
+static struct entry *alloc_entry(void)
+{
+	struct entry *slot = NULL;
+
+	if (nr_entries < MAX_ENTRIES) {
+		slot = &entries[nr_entries];
+		nr_entries++;
+	}
+
+	return slot;
+}
+
+/*
+ * Compare the hash keys of two entries.
+ * Returns non-zero when timer, start_func, expire_func and pid all match.
+ */
+static int match_entries(struct entry *entry1, struct entry *entry2)
+{
+	if (entry1->timer != entry2->timer)
+		return 0;
+	if (entry1->start_func != entry2->start_func)
+		return 0;
+	if (entry1->expire_func != entry2->expire_func)
+		return 0;
+
+	return entry1->pid == entry2->pid;
+}
+
+/*
+ * Look up whether an entry matching this item is present
+ * in the hash already. Must be called with irqs off and the
+ * lookup lock held:
+ *
+ * @entry: template holding the hash keys (and timer_flag) to look for
+ * @comm:  task name, copied into a newly created entry
+ *
+ * Returns the matching or newly inserted entry, or NULL when the entry
+ * pool is exhausted.
+ */
+static struct entry *tstat_lookup(struct entry *entry, char *comm)
+{
+	struct entry **head, *curr, *prev;
+
+	head = tstat_hashentry(entry);
+	curr = *head;
+
+	/*
+	 * The fastpath is when the entry is already hashed,
+	 * we do this with the lookup lock held, but with the
+	 * table lock not held:
+	 */
+	while (curr) {
+		if (match_entries(curr, entry))
+			return curr;
+
+		curr = curr->next;
+	}
+	/*
+	 * Slowpath: allocate, set up and link a new hash entry:
+	 */
+	prev = NULL;
+	curr = *head;
+
+	raw_spin_lock(&table_lock);
+	/*
+	 * Make sure we have not raced with another CPU:
+	 */
+	while (curr) {
+		if (match_entries(curr, entry))
+			goto out_unlock;
+
+		prev = curr;
+		curr = curr->next;
+	}
+
+	curr = alloc_entry();
+	if (curr) {
+		*curr = *entry;
+		curr->count = 0;
+		curr->next = NULL;
+		memcpy(curr->comm, comm, TASK_COMM_LEN);
+		/*
+		 * The struct copy above took the last byte of comm[] from
+		 * the caller's template, which does not initialise it.
+		 * Terminate the name explicitly so it is always a valid
+		 * string.
+		 */
+		curr->comm[TASK_COMM_LEN] = 0;
+
+		smp_mb(); /* Ensure that curr is initialized before insert */
+
+		if (prev)
+			prev->next = curr;
+		else
+			*head = curr;
+	}
+ out_unlock:
+	raw_spin_unlock(&table_lock);
+
+	return curr;
+}
+
+/**
+ * timer_stats_update_stats - Update the statistics for a timer.
+ * @timer: pointer to either a timer_list or a hrtimer
+ * @pid: the pid of the task which set up the timer
+ * @startf: pointer to the function which did the timer setup
+ * @timerf: pointer to the timer callback function of the timer
+ * @comm: name of the process which set up the timer
+ * @timer_flag: flags stored with the entry; tstats_show() tests
+ * TIMER_STATS_FLAG_DEFERRABLE on it
+ *
+ * When the timer is already registered, then the event counter is
+ * incremented. Otherwise the timer is registered in a free slot.
+ * If no slot is left, overflow_count is incremented instead.
+ */
+void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
+ void *timerf, char *comm,
+ unsigned int timer_flag)
+{
+ /*
+ * It doesn't matter which lock we take:
+ */
+ raw_spinlock_t *lock;
+ struct entry *entry, input;
+ unsigned long flags;
+
+ /* Unlocked early exit; re-checked below with the lock held. */
+ if (likely(!timer_stats_active))
+ return;
+
+ lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
+
+ /* Only the key fields and timer_flag of the template are filled in. */
+ input.timer = timer;
+ input.start_func = startf;
+ input.expire_func = timerf;
+ input.pid = pid;
+ input.timer_flag = timer_flag;
+
+ raw_spin_lock_irqsave(lock, flags);
+ if (!timer_stats_active)
+ goto out_unlock;
+
+ entry = tstat_lookup(&input, comm);
+ if (likely(entry))
+ entry->count++;
+ else
+ atomic_inc(&overflow_count);
+
+ out_unlock:
+ raw_spin_unlock_irqrestore(lock, flags);
+}
+
+/*
+ * Print the symbol name for @addr, or the pointer value in angle
+ * brackets when no symbol can be looked up.
+ *
+ * The fallback uses %pK rather than %p, as timer_list.c does:
+ * /proc/timer_stats is world-readable, so a raw kernel address should
+ * only be shown subject to the kptr_restrict policy.
+ */
+static void print_name_offset(struct seq_file *m, unsigned long addr)
+{
+	char symname[KSYM_NAME_LEN];
+
+	if (lookup_symbol_name(addr, symname) < 0)
+		seq_printf(m, "<%pK>", (void *)addr);
+	else
+		seq_printf(m, "%s", symname);
+}
+
+/*
+ * seq_file show handler for /proc/timer_stats: prints the sample
+ * period, the overflow count (if any), one line per collected entry
+ * and a total. Runs under show_mutex. Always returns 0.
+ */
+static int tstats_show(struct seq_file *m, void *v)
+{
+ struct timespec period;
+ struct entry *entry;
+ unsigned long ms;
+ long events = 0;
+ ktime_t time;
+ int i;
+
+ mutex_lock(&show_mutex);
+ /*
+ * If still active then calculate up to now:
+ */
+ if (timer_stats_active)
+ time_stop = ktime_get();
+
+ time = ktime_sub(time_stop, time_start);
+
+ period = ktime_to_timespec(time);
+ /* Millisecond fraction of the period, for the header line. */
+ ms = period.tv_nsec / 1000000;
+
+ seq_puts(m, "Timer Stats Version: v0.2\n");
+ seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
+ if (atomic_read(&overflow_count))
+ seq_printf(m, "Overflow: %d entries\n",
+ atomic_read(&overflow_count));
+
+ for (i = 0; i < nr_entries; i++) {
+ entry = entries + i;
+ /* Entries flagged deferrable get a 'D' after the count. */
+ if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ seq_printf(m, "%4luD, %5d %-16s ",
+ entry->count, entry->pid, entry->comm);
+ } else {
+ seq_printf(m, " %4lu, %5d %-16s ",
+ entry->count, entry->pid, entry->comm);
+ }
+
+ print_name_offset(m, (unsigned long)entry->start_func);
+ seq_puts(m, " (");
+ print_name_offset(m, (unsigned long)entry->expire_func);
+ seq_puts(m, ")\n");
+
+ events += entry->count;
+ }
+
+ /* From here on ms is the whole period; never let the divisor be 0. */
+ ms += period.tv_sec * 1000;
+ if (!ms)
+ ms = 1;
+
+ /* The rate is only printed once at least one full second elapsed. */
+ if (events && period.tv_sec)
+ seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
+ events, events * 1000 / ms,
+ (events * 1000000 / ms) % 1000);
+ else
+ seq_printf(m, "%ld total events\n", events);
+
+ mutex_unlock(&show_mutex);
+
+ return 0;
+}
+
+/*
+ * After a state change, make sure all concurrent lookup/update
+ * activities have stopped:
+ *
+ * Each online CPU's lookup lock is taken and released in turn, so
+ * this returns only after any section that held one of those locks
+ * has finished.
+ */
+static void sync_access(void)
+{
+ unsigned long flags;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
+
+ raw_spin_lock_irqsave(lock, flags);
+ /* nothing */
+ raw_spin_unlock_irqrestore(lock, flags);
+ }
+}
+
+/*
+ * write() handler for /proc/timer_stats.
+ *
+ * Accepts exactly two bytes at offset 0; the first byte selects the
+ * action: '0' stops collection, '1' resets the tables and starts a new
+ * collection period. Any other first byte yields -EINVAL.
+ *
+ * Returns @count on success, -EINVAL for malformed input, -EFAULT when
+ * the user buffer cannot be read.
+ */
+static ssize_t tstats_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	char ctl[2];
+	ssize_t ret;
+
+	if (count != 2 || *offs)
+		return -EINVAL;
+
+	if (copy_from_user(ctl, buf, count))
+		return -EFAULT;
+
+	ret = count;
+
+	mutex_lock(&show_mutex);
+	if (ctl[0] == '0') {
+		if (timer_stats_active) {
+			timer_stats_active = 0;
+			time_stop = ktime_get();
+			/* Wait for updaters that saw the old state. */
+			sync_access();
+		}
+	} else if (ctl[0] == '1') {
+		if (!timer_stats_active) {
+			reset_entries();
+			time_start = ktime_get();
+			/* Tables must be reset before collection is enabled. */
+			smp_mb();
+			timer_stats_active = 1;
+		}
+	} else {
+		ret = -EINVAL;
+	}
+	mutex_unlock(&show_mutex);
+
+	return ret;
+}
+
+/* open() handler: bind tstats_show() as a single-shot seq_file. */
+static int tstats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, tstats_show, NULL);
+}
+
+/*
+ * File operations for /proc/timer_stats: reads go through the seq_file
+ * helpers, writes through tstats_write().
+ */
+static const struct file_operations tstats_fops = {
+ .open = tstats_open,
+ .read = seq_read,
+ .write = tstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* Initialize the per-CPU lookup lock of every possible CPU. */
+void __init init_timer_stats(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
+}
+
+/*
+ * Create /proc/timer_stats (mode 0644).
+ * Returns 0 on success, -ENOMEM when the proc entry cannot be created.
+ */
+static int __init init_tstats_procfs(void)
+{
+	if (!proc_create("timer_stats", 0644, NULL, &tstats_fops))
+		return -ENOMEM;
+
+	return 0;
+}
+__initcall(init_tstats_procfs);