Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* linux/kernel/timer.c
	3	*
	4	* Kernel internal timers, kernel timekeeping, basic process system calls
	5	*
	6	* Copyright (C) 1991, 1992 Linus Torvalds
	7	*
	8	* 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
	9	*
	10	* 1997-09-10 Updated NTP code according to technical memorandum Jan '96
	11	* "A Kernel Model for Precision Timekeeping" by Dave Mills
	12	* 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
	13	* serialize accesses to xtime/lost_ticks).
	14	* Copyright (C) 1998 Andrea Arcangeli
	15	* 1999-03-10 Improved NTP compatibility by Ulrich Windl
	16	* 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
	17	* 2000-10-05 Implemented scalable SMP per-CPU timer handling.
	18	* Copyright (C) 2000, 2001, 2002 Ingo Molnar
	19	* Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
	20	*/
	21
	22	#include <linux/kernel_stat.h>
	23	#include <linux/module.h>
	24	#include <linux/interrupt.h>
	25	#include <linux/percpu.h>
	26	#include <linux/init.h>
	27	#include <linux/mm.h>
	28	#include <linux/swap.h>
	29	#include <linux/notifier.h>
	30	#include <linux/thread_info.h>
	31	#include <linux/time.h>
	32	#include <linux/jiffies.h>
	33	#include <linux/posix-timers.h>
	34	#include <linux/cpu.h>
	35	#include <linux/syscalls.h>
	36	#include <linux/delay.h>
	37
	38	#include <asm/uaccess.h>
	39	#include <asm/unistd.h>
	40	#include <asm/div64.h>
	41	#include <asm/timex.h>
	42	#include <asm/io.h>
	43
	44	#ifdef CONFIG_TIME_INTERPOLATION
	45	static void time_interpolator_update(long delta_nsec);
	46	#else
	47	#define time_interpolator_update(x)
	48	#endif
	49
	50	u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
	51
	52	EXPORT_SYMBOL(jiffies_64);
	53
	54	/*
	55	* per-CPU timer vector definitions:
	56	*/
	57
	58	#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
	59	#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
	60	#define TVN_SIZE (1 << TVN_BITS)
	61	#define TVR_SIZE (1 << TVR_BITS)
	62	#define TVN_MASK (TVN_SIZE - 1)
	63	#define TVR_MASK (TVR_SIZE - 1)
	64
	65	struct timer_base_s {
	66	spinlock_t lock;
	67	struct timer_list *running_timer;
	68	};
	69
	70	typedef struct tvec_s {
	71	struct list_head vec[TVN_SIZE];
	72	} tvec_t;
	73
	74	typedef struct tvec_root_s {
	75	struct list_head vec[TVR_SIZE];
	76	} tvec_root_t;
	77
	78	struct tvec_t_base_s {
	79	struct timer_base_s t_base;
	80	unsigned long timer_jiffies;
	81	tvec_root_t tv1;
	82	tvec_t tv2;
	83	tvec_t tv3;
	84	tvec_t tv4;
	85	tvec_t tv5;
	86	} ____cacheline_aligned_in_smp;
	87
	88	typedef struct tvec_t_base_s tvec_base_t;
	89	static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
	90
	91	static inline void set_running_timer(tvec_base_t *base,
	92	struct timer_list *timer)
	93	{
	94	#ifdef CONFIG_SMP
	95	base->t_base.running_timer = timer;
	96	#endif
	97	}
	98
	99	static void internal_add_timer(tvec_base_t base, struct timer_list timer)
	100	{
	101	unsigned long expires = timer->expires;
	102	unsigned long idx = expires - base->timer_jiffies;
	103	struct list_head *vec;
	104
	105	if (idx < TVR_SIZE) {
	106	int i = expires & TVR_MASK;
	107	vec = base->tv1.vec + i;
	108	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
	109	int i = (expires >> TVR_BITS) & TVN_MASK;
	110	vec = base->tv2.vec + i;
	111	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
	112	int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
	113	vec = base->tv3.vec + i;
	114	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
	115	int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
	116	vec = base->tv4.vec + i;
	117	} else if ((signed long) idx < 0) {
	118	/*
	119	* Can happen if you add a timer with expires == jiffies,
	120	* or you set a timer to go off in the past
	121	*/
	122	vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
	123	} else {
	124	int i;
	125	/* If the timeout is larger than 0xffffffff on 64-bit
	126	* architectures then we use the maximum timeout:
	127	*/
	128	if (idx > 0xffffffffUL) {
	129	idx = 0xffffffffUL;
	130	expires = idx + base->timer_jiffies;
	131	}
	132	i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
	133	vec = base->tv5.vec + i;
	134	}
	135	/*
	136	* Timers are FIFO:
	137	*/
	138	list_add_tail(&timer->entry, vec);
	139	}
	140
	141	typedef struct timer_base_s timer_base_t;
	142	/*
	143	* Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
	144	* at compile time, and we need timer->base to lock the timer.
	145	*/
	146	timer_base_t __init_timer_base
	147	____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
	148	EXPORT_SYMBOL(__init_timer_base);
	149
	150	/***
	151	* init_timer - initialize a timer.
	152	* @timer: the timer to be initialized
	153	*
	154	* init_timer() must be done to a timer prior calling any of the
	155	* other timer functions.
	156	*/
	157	void fastcall init_timer(struct timer_list *timer)
	158	{
	159	timer->entry.next = NULL;
	160	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
	161	}
	162	EXPORT_SYMBOL(init_timer);
	163
	164	static inline void detach_timer(struct timer_list *timer,
	165	int clear_pending)
	166	{
	167	struct list_head *entry = &timer->entry;
	168
	169	__list_del(entry->prev, entry->next);
	170	if (clear_pending)
	171	entry->next = NULL;
	172	entry->prev = LIST_POISON2;
	173	}
	174
	175	/*
	176	* We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
	177	* means that all timers which are tied to this base via timer->base are
	178	* locked, and the base itself is locked too.
	179	*
	180	* So __run_timers/migrate_timers can safely modify all timers which could
	181	* be found on ->tvX lists.
	182	*
	183	* When the timer's base is locked, and the timer removed from list, it is
	184	* possible to set timer->base = NULL and drop the lock: the timer remains
	185	* locked.
	186	*/
	187	static timer_base_t lock_timer_base(struct timer_list timer,
	188	unsigned long *flags)
	189	{
	190	timer_base_t *base;
	191
	192	for (;;) {
	193	base = timer->base;
	194	if (likely(base != NULL)) {
	195	spin_lock_irqsave(&base->lock, *flags);
	196	if (likely(base == timer->base))
	197	return base;
	198	/* The timer has migrated to another CPU */
	199	spin_unlock_irqrestore(&base->lock, *flags);
	200	}
	201	cpu_relax();
	202	}
	203	}
	204
	205	int __mod_timer(struct timer_list *timer, unsigned long expires)
	206	{
	207	timer_base_t *base;
	208	tvec_base_t *new_base;
	209	unsigned long flags;
	210	int ret = 0;
	211
	212	BUG_ON(!timer->function);
	213
	214	base = lock_timer_base(timer, &flags);
	215
	216	if (timer_pending(timer)) {
	217	detach_timer(timer, 0);
	218	ret = 1;
	219	}
	220
	221	new_base = &__get_cpu_var(tvec_bases);
	222
	223	if (base != &new_base->t_base) {
	224	/*
	225	* We are trying to schedule the timer on the local CPU.
	226	* However we can't change timer's base while it is running,
	227	* otherwise del_timer_sync() can't detect that the timer's
	228	* handler yet has not finished. This also guarantees that
	229	* the timer is serialized wrt itself.
	230	*/
	231	if (unlikely(base->running_timer == timer)) {
	232	/* The timer remains on a former base */
	233	new_base = container_of(base, tvec_base_t, t_base);
	234	} else {
	235	/* See the comment in lock_timer_base() */
	236	timer->base = NULL;
	237	spin_unlock(&base->lock);
	238	spin_lock(&new_base->t_base.lock);
	239	timer->base = &new_base->t_base;
	240	}
	241	}
	242
	243	timer->expires = expires;
	244	internal_add_timer(new_base, timer);
	245	spin_unlock_irqrestore(&new_base->t_base.lock, flags);
	246
	247	return ret;
	248	}
	249
	250	EXPORT_SYMBOL(__mod_timer);
	251
	252	/***
	253	* add_timer_on - start a timer on a particular CPU
	254	* @timer: the timer to be added
	255	* @cpu: the CPU to start it on
	256	*
	257	* This is not very scalable on SMP. Double adds are not possible.
	258	*/
	259	void add_timer_on(struct timer_list *timer, int cpu)
	260	{
	261	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
	262	unsigned long flags;
	263
	264	BUG_ON(timer_pending(timer) \|\| !timer->function);
	265	spin_lock_irqsave(&base->t_base.lock, flags);
	266	timer->base = &base->t_base;
	267	internal_add_timer(base, timer);
	268	spin_unlock_irqrestore(&base->t_base.lock, flags);
	269	}
	270
	271
	272	/***
	273	* mod_timer - modify a timer's timeout
	274	* @timer: the timer to be modified
	275	*
	276	* mod_timer is a more efficient way to update the expire field of an
	277	* active timer (if the timer is inactive it will be activated)
	278	*
	279	* mod_timer(timer, expires) is equivalent to:
	280	*
	281	* del_timer(timer); timer->expires = expires; add_timer(timer);
	282	*
	283	* Note that if there are multiple unserialized concurrent users of the
	284	* same timer, then mod_timer() is the only safe way to modify the timeout,
	285	* since add_timer() cannot modify an already running timer.
	286	*
	287	* The function returns whether it has modified a pending timer or not.
	288	* (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
	289	* active timer returns 1.)
	290	*/
	291	int mod_timer(struct timer_list *timer, unsigned long expires)
	292	{
	293	BUG_ON(!timer->function);
	294
	295	/*
	296	* This is a common optimization triggered by the
	297	* networking code - if the timer is re-modified
	298	* to be the same thing then just return:
	299	*/
	300	if (timer->expires == expires && timer_pending(timer))
	301	return 1;
	302
	303	return __mod_timer(timer, expires);
	304	}
	305
	306	EXPORT_SYMBOL(mod_timer);
	307
	308	/***
	309	* del_timer - deactive a timer.
	310	* @timer: the timer to be deactivated
	311	*
	312	* del_timer() deactivates a timer - this works on both active and inactive
	313	* timers.
	314	*
	315	* The function returns whether it has deactivated a pending timer or not.
	316	* (ie. del_timer() of an inactive timer returns 0, del_timer() of an
	317	* active timer returns 1.)
	318	*/
	319	int del_timer(struct timer_list *timer)
	320	{
	321	timer_base_t *base;
	322	unsigned long flags;
	323	int ret = 0;
	324
	325	if (timer_pending(timer)) {
	326	base = lock_timer_base(timer, &flags);
	327	if (timer_pending(timer)) {
	328	detach_timer(timer, 1);
	329	ret = 1;
	330	}
	331	spin_unlock_irqrestore(&base->lock, flags);
	332	}
	333
	334	return ret;
	335	}
	336
	337	EXPORT_SYMBOL(del_timer);
	338
	339	#ifdef CONFIG_SMP
	340	/*
	341	* This function tries to deactivate a timer. Upon successful (ret >= 0)
	342	* exit the timer is not queued and the handler is not running on any CPU.
	343	*
	344	* It must not be called from interrupt contexts.
	345	*/
	346	int try_to_del_timer_sync(struct timer_list *timer)
	347	{
	348	timer_base_t *base;
	349	unsigned long flags;
	350	int ret = -1;
	351
	352	base = lock_timer_base(timer, &flags);
	353
	354	if (base->running_timer == timer)
	355	goto out;
	356
	357	ret = 0;
	358	if (timer_pending(timer)) {
	359	detach_timer(timer, 1);
	360	ret = 1;
	361	}
	362	out:
	363	spin_unlock_irqrestore(&base->lock, flags);
	364
	365	return ret;
	366	}
	367
	368	/***
	369	* del_timer_sync - deactivate a timer and wait for the handler to finish.
	370	* @timer: the timer to be deactivated
	371	*
	372	* This function only differs from del_timer() on SMP: besides deactivating
	373	* the timer it also makes sure the handler has finished executing on other
	374	* CPUs.
	375	*
	376	* Synchronization rules: callers must prevent restarting of the timer,
	377	* otherwise this function is meaningless. It must not be called from
	378	* interrupt contexts. The caller must not hold locks which would prevent
	379	* completion of the timer's handler. The timer's handler must not call
	380	* add_timer_on(). Upon exit the timer is not queued and the handler is
	381	* not running on any CPU.
	382	*
	383	* The function returns whether it has deactivated a pending timer or not.
	384	*/
	385	int del_timer_sync(struct timer_list *timer)
	386	{
	387	for (;;) {
	388	int ret = try_to_del_timer_sync(timer);
	389	if (ret >= 0)
	390	return ret;
	391	}
	392	}
	393
	394	EXPORT_SYMBOL(del_timer_sync);
	395	#endif
	396
	397	static int cascade(tvec_base_t base, tvec_t tv, int index)
	398	{
	399	/* cascade all the timers from tv up one level */
	400	struct list_head head, curr;
	401
	402	head = tv->vec + index;
	403	curr = head->next;
	404	/*
	405	* We are removing _all_ timers from the list, so we don't have to
	406	* detach them individually, just clear the list afterwards.
	407	*/
	408	while (curr != head) {
	409	struct timer_list *tmp;
	410
	411	tmp = list_entry(curr, struct timer_list, entry);
	412	BUG_ON(tmp->base != &base->t_base);
	413	curr = curr->next;
	414	internal_add_timer(base, tmp);
	415	}
	416	INIT_LIST_HEAD(head);
	417
	418	return index;
	419	}
	420
	421	/***
	422	* __run_timers - run all expired timers (if any) on this CPU.
	423	* @base: the timer vector to be processed.
	424	*
	425	* This function cascades all vectors and executes all expired timer
	426	* vectors.
	427	*/
	428	#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
	429
	430	static inline void __run_timers(tvec_base_t *base)
	431	{
	432	struct timer_list *timer;
	433
	434	spin_lock_irq(&base->t_base.lock);
	435	while (time_after_eq(jiffies, base->timer_jiffies)) {
	436	struct list_head work_list = LIST_HEAD_INIT(work_list);
	437	struct list_head *head = &work_list;
	438	int index = base->timer_jiffies & TVR_MASK;
	439
	440	/*
	441	* Cascade timers:
	442	*/
	443	if (!index &&
	444	(!cascade(base, &base->tv2, INDEX(0))) &&
	445	(!cascade(base, &base->tv3, INDEX(1))) &&
	446	!cascade(base, &base->tv4, INDEX(2)))
	447	cascade(base, &base->tv5, INDEX(3));
	448	++base->timer_jiffies;
	449	list_splice_init(base->tv1.vec + index, &work_list);
	450	while (!list_empty(head)) {
	451	void (*fn)(unsigned long);
	452	unsigned long data;
	453
	454	timer = list_entry(head->next,struct timer_list,entry);
	455	fn = timer->function;
	456	data = timer->data;
	457
	458	set_running_timer(base, timer);
	459	detach_timer(timer, 1);
	460	spin_unlock_irq(&base->t_base.lock);
	461	{
	462	int preempt_count = preempt_count();
	463	fn(data);
	464	if (preempt_count != preempt_count()) {
	465	printk(KERN_WARNING "huh, entered %p "
	466	"with preempt_count %08x, exited"
	467	" with %08x?\n",
	468	fn, preempt_count,
	469	preempt_count());
	470	BUG();
	471	}
	472	}
	473	spin_lock_irq(&base->t_base.lock);
	474	}
	475	}
	476	set_running_timer(base, NULL);
	477	spin_unlock_irq(&base->t_base.lock);
	478	}
	479
	480	#ifdef CONFIG_NO_IDLE_HZ
	481	/*
	482	* Find out when the next timer event is due to happen. This
	483	* is used on S/390 to stop all activity when a cpus is idle.
	484	* This functions needs to be called disabled.
	485	*/
	486	unsigned long next_timer_interrupt(void)
	487	{
	488	tvec_base_t *base;
	489	struct list_head *list;
	490	struct timer_list *nte;
	491	unsigned long expires;
	492	tvec_t *varray[4];
	493	int i, j;
	494
	495	base = &__get_cpu_var(tvec_bases);
	496	spin_lock(&base->t_base.lock);
	497	expires = base->timer_jiffies + (LONG_MAX >> 1);
	498	list = NULL;
	499
	500	/* Look for timer events in tv1. */
	501	j = base->timer_jiffies & TVR_MASK;
	502	do {
	503	list_for_each_entry(nte, base->tv1.vec + j, entry) {
	504	expires = nte->expires;
	505	if (j < (base->timer_jiffies & TVR_MASK))
	506	list = base->tv2.vec + (INDEX(0));
	507	goto found;
	508	}
	509	j = (j + 1) & TVR_MASK;
	510	} while (j != (base->timer_jiffies & TVR_MASK));
	511
	512	/* Check tv2-tv5. */
	513	varray[0] = &base->tv2;
	514	varray[1] = &base->tv3;
	515	varray[2] = &base->tv4;
	516	varray[3] = &base->tv5;
	517	for (i = 0; i < 4; i++) {
	518	j = INDEX(i);
	519	do {
	520	if (list_empty(varray[i]->vec + j)) {
	521	j = (j + 1) & TVN_MASK;
	522	continue;
	523	}
	524	list_for_each_entry(nte, varray[i]->vec + j, entry)
	525	if (time_before(nte->expires, expires))
	526	expires = nte->expires;
	527	if (j < (INDEX(i)) && i < 3)
	528	list = varray[i + 1]->vec + (INDEX(i + 1));
	529	goto found;
	530	} while (j != (INDEX(i)));
	531	}
	532	found:
	533	if (list) {
	534	/*
	535	* The search wrapped. We need to look at the next list
	536	* from next tv element that would cascade into tv element
	537	* where we found the timer element.
	538	*/
	539	list_for_each_entry(nte, list, entry) {
	540	if (time_before(nte->expires, expires))
	541	expires = nte->expires;
	542	}
	543	}
	544	spin_unlock(&base->t_base.lock);
	545	return expires;
	546	}
	547	#endif
	548
	549	/******************************************************************/
	550
	551	/*
	552	* Timekeeping variables
	553	*/
	554	unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
	555	unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */
	556
	557	/*
	558	* The current time
	559	* wall_to_monotonic is what we need to add to xtime (or xtime corrected
	560	* for sub jiffie times) to get to monotonic time. Monotonic is pegged
	561	* at zero at system boot time, so wall_to_monotonic will be negative,
	562	* however, we will ALWAYS keep the tv_nsec part positive so we can use
	563	* the usual normalization.
	564	*/
	565	struct timespec xtime __attribute__ ((aligned (16)));
	566	struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
	567
	568	EXPORT_SYMBOL(xtime);
	569
	570	/* Don't completely fail for HZ > 500. */
	571	int tickadj = 500/HZ ? : 1; /* microsecs */
	572
	573
	574	/*
	575	* phase-lock loop variables
	576	*/
	577	/* TIME_ERROR prevents overwriting the CMOS clock */
	578	int time_state = TIME_OK; /* clock synchronization status */
	579	int time_status = STA_UNSYNC; /* clock status bits */
	580	long time_offset; /* time adjustment (us) */
	581	long time_constant = 2; /* pll time constant */
	582	long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
	583	long time_precision = 1; /* clock precision (us) */
	584	long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
	585	long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
	586	static long time_phase; /* phase offset (scaled us) */
	587	long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
	588	/* frequency offset (scaled ppm)*/
	589	static long time_adj; /* tick adjust (scaled 1 / HZ) */
	590	long time_reftime; /* time at last adjustment (s) */
	591	long time_adjust;
	592	long time_next_adjust;
	593
	594	/*
	595	* this routine handles the overflow of the microsecond field
	596	*
	597	* The tricky bits of code to handle the accurate clock support
	598	* were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
	599	* They were originally developed for SUN and DEC kernels.
	600	* All the kudos should go to Dave for this stuff.
	601	*
	602	*/
	603	static void second_overflow(void)
	604	{
	605	long ltemp;
	606
	607	/* Bump the maxerror field */
	608	time_maxerror += time_tolerance >> SHIFT_USEC;
	609	if (time_maxerror > NTP_PHASE_LIMIT) {
	610	time_maxerror = NTP_PHASE_LIMIT;
	611	time_status \|= STA_UNSYNC;
	612	}
	613
	614	/*
	615	* Leap second processing. If in leap-insert state at the end of the
	616	* day, the system clock is set back one second; if in leap-delete
	617	* state, the system clock is set ahead one second. The microtime()
	618	* routine or external clock driver will insure that reported time is
	619	* always monotonic. The ugly divides should be replaced.
	620	*/
	621	switch (time_state) {
	622	case TIME_OK:
	623	if (time_status & STA_INS)
	624	time_state = TIME_INS;
	625	else if (time_status & STA_DEL)
	626	time_state = TIME_DEL;
	627	break;
	628	case TIME_INS:
	629	if (xtime.tv_sec % 86400 == 0) {
	630	xtime.tv_sec--;
	631	wall_to_monotonic.tv_sec++;
	632	/*
	633	* The timer interpolator will make time change
	634	* gradually instead of an immediate jump by one second
	635	*/
	636	time_interpolator_update(-NSEC_PER_SEC);
	637	time_state = TIME_OOP;
	638	clock_was_set();
	639	printk(KERN_NOTICE "Clock: inserting leap second "
	640	"23:59:60 UTC\n");
	641	}
	642	break;
	643	case TIME_DEL:
	644	if ((xtime.tv_sec + 1) % 86400 == 0) {
	645	xtime.tv_sec++;
	646	wall_to_monotonic.tv_sec--;
	647	/*
	648	* Use of time interpolator for a gradual change of
	649	* time
	650	*/
	651	time_interpolator_update(NSEC_PER_SEC);
	652	time_state = TIME_WAIT;
	653	clock_was_set();
	654	printk(KERN_NOTICE "Clock: deleting leap second "
	655	"23:59:59 UTC\n");
	656	}
	657	break;
	658	case TIME_OOP:
	659	time_state = TIME_WAIT;
	660	break;
	661	case TIME_WAIT:
	662	if (!(time_status & (STA_INS \| STA_DEL)))
	663	time_state = TIME_OK;
	664	}
	665
	666	/*
	667	* Compute the phase adjustment for the next second. In PLL mode, the
	668	* offset is reduced by a fixed factor times the time constant. In FLL
	669	* mode the offset is used directly. In either mode, the maximum phase
	670	* adjustment for each second is clamped so as to spread the adjustment
	671	* over not more than the number of seconds between updates.
	672	*/
	673	ltemp = time_offset;
	674	if (!(time_status & STA_FLL))
	675	ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
	676	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
	677	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
	678	time_offset -= ltemp;
	679	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
	680
	681	/*
	682	* Compute the frequency estimate and additional phase adjustment due
	683	* to frequency error for the next second. When the PPS signal is
	684	* engaged, gnaw on the watchdog counter and update the frequency
	685	* computed by the pll and the PPS signal.
	686	*/
	687	pps_valid++;
	688	if (pps_valid == PPS_VALID) { /* PPS signal lost */
	689	pps_jitter = MAXTIME;
	690	pps_stabil = MAXFREQ;
	691	time_status &= ~(STA_PPSSIGNAL \| STA_PPSJITTER \|
	692	STA_PPSWANDER \| STA_PPSERROR);
	693	}
	694	ltemp = time_freq + pps_freq;
	695	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
	696
	697	#if HZ == 100
	698	/*
	699	* Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to
	700	* get 128.125; => only 0.125% error (p. 14)
	701	*/
	702	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
	703	#endif
	704	#if HZ == 250
	705	/*
	706	* Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and
	707	* 0.78125% to get 255.85938; => only 0.05% error (p. 14)
	708	*/
	709	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
	710	#endif
	711	#if HZ == 1000
	712	/*
	713	* Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and
	714	* 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
	715	*/
	716	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
	717	#endif
	718	}
	719
	720	/*
	721	* Returns how many microseconds we need to add to xtime this tick
	722	* in doing an adjustment requested with adjtime.
	723	*/
	724	static long adjtime_adjustment(void)
	725	{
	726	long time_adjust_step;
	727
	728	time_adjust_step = time_adjust;
	729	if (time_adjust_step) {
	730	/*
	731	* We are doing an adjtime thing. Prepare time_adjust_step to
	732	* be within bounds. Note that a positive time_adjust means we
	733	* want the clock to run faster.
	734	*
	735	* Limit the amount of the step to be in the range
	736	* -tickadj .. +tickadj
	737	*/
	738	time_adjust_step = min(time_adjust_step, (long)tickadj);
	739	time_adjust_step = max(time_adjust_step, (long)-tickadj);
	740	}
	741	return time_adjust_step;
	742	}
	743
	744	/* in the NTP reference this is called "hardclock()" */
	745	static void update_wall_time_one_tick(void)
	746	{
	747	long time_adjust_step, delta_nsec;
	748
	749	time_adjust_step = adjtime_adjustment();
	750	if (time_adjust_step)
	751	/* Reduce by this step the amount of time left */
	752	time_adjust -= time_adjust_step;
	753	delta_nsec = tick_nsec + time_adjust_step * 1000;
	754	/*
	755	* Advance the phase, once it gets to one microsecond, then
	756	* advance the tick more.
	757	*/
	758	time_phase += time_adj;
	759	if ((time_phase >= FINENSEC) \|\| (time_phase <= -FINENSEC)) {
	760	long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
	761	time_phase -= ltemp << (SHIFT_SCALE - 10);
	762	delta_nsec += ltemp;
	763	}
	764	xtime.tv_nsec += delta_nsec;
	765	time_interpolator_update(delta_nsec);
	766
	767	/* Changes by adjtime() do not take effect till next tick. */
	768	if (time_next_adjust != 0) {
	769	time_adjust = time_next_adjust;
	770	time_next_adjust = 0;
	771	}
	772	}
	773
	774	/*
	775	* Return how long ticks are at the moment, that is, how much time
	776	* update_wall_time_one_tick will add to xtime next time we call it
	777	* (assuming no calls to do_adjtimex in the meantime).
	778	* The return value is in fixed-point nanoseconds with SHIFT_SCALE-10
	779	* bits to the right of the binary point.
	780	* This function has no side-effects.
	781	*/
	782	u64 current_tick_length(void)
	783	{
	784	long delta_nsec;
	785
	786	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
	787	return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
	788	}
	789
	790	/*
	791	* Using a loop looks inefficient, but "ticks" is
	792	* usually just one (we shouldn't be losing ticks,
	793	* we're doing this this way mainly for interrupt
	794	* latency reasons, not because we think we'll
	795	* have lots of lost timer ticks
	796	*/
	797	static void update_wall_time(unsigned long ticks)
	798	{
	799	do {
	800	ticks--;
	801	update_wall_time_one_tick();
	802	if (xtime.tv_nsec >= 1000000000) {
	803	xtime.tv_nsec -= 1000000000;
	804	xtime.tv_sec++;
	805	second_overflow();
	806	}
	807	} while (ticks);
	808	}
	809
	810	/*
	811	* Called from the timer interrupt handler to charge one tick to the current
	812	* process. user_tick is 1 if the tick is user time, 0 for system.
	813	*/
	814	void update_process_times(int user_tick)
	815	{
	816	struct task_struct *p = current;
	817	int cpu = smp_processor_id();
	818
	819	/* Note: this timer irq context must be accounted for as well. */
	820	if (user_tick)
	821	account_user_time(p, jiffies_to_cputime(1));
	822	else
	823	account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
	824	run_local_timers();
	825	if (rcu_pending(cpu))
	826	rcu_check_callbacks(cpu, user_tick);
	827	scheduler_tick();
	828	run_posix_cpu_timers(p);
	829	}
	830
	831	/*
	832	* Nr of active tasks - counted in fixed-point numbers
	833	*/
	834	static unsigned long count_active_tasks(void)
	835	{
	836	return (nr_running() + nr_uninterruptible()) * FIXED_1;
	837	}
	838
	839	/*
	840	* Hmm.. Changed this, as the GNU make sources (load.c) seems to
	841	* imply that avenrun[] is the standard name for this kind of thing.
	842	* Nothing else seems to be standardized: the fractional size etc
	843	* all seem to differ on different machines.
	844	*
	845	* Requires xtime_lock to access.
	846	*/
	847	unsigned long avenrun[3];
	848
	849	EXPORT_SYMBOL(avenrun);
	850
	851	/*
	852	* calc_load - given tick count, update the avenrun load estimates.
	853	* This is called while holding a write_lock on xtime_lock.
	854	*/
	855	static inline void calc_load(unsigned long ticks)
	856	{
	857	unsigned long active_tasks; /* fixed-point */
	858	static int count = LOAD_FREQ;
	859
	860	count -= ticks;
	861	if (count < 0) {
	862	count += LOAD_FREQ;
	863	active_tasks = count_active_tasks();
	864	CALC_LOAD(avenrun[0], EXP_1, active_tasks);
	865	CALC_LOAD(avenrun[1], EXP_5, active_tasks);
	866	CALC_LOAD(avenrun[2], EXP_15, active_tasks);
	867	}
	868	}
	869
	870	/* jiffies at the most recent update of wall time */
	871	unsigned long wall_jiffies = INITIAL_JIFFIES;
	872
	873	/*
	874	* This read-write spinlock protects us from races in SMP while
	875	* playing with xtime and avenrun.
	876	*/
	877	#ifndef ARCH_HAVE_XTIME_LOCK
	878	seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
	879
	880	EXPORT_SYMBOL(xtime_lock);
	881	#endif
	882
	883	/*
	884	* This function runs timers and the timer-tq in bottom half context.
	885	*/
	886	static void run_timer_softirq(struct softirq_action *h)
	887	{
	888	tvec_base_t *base = &__get_cpu_var(tvec_bases);
	889
	890	hrtimer_run_queues();
	891	if (time_after_eq(jiffies, base->timer_jiffies))
	892	__run_timers(base);
	893	}
	894
	895	/*
	896	* Called by the local, per-CPU timer interrupt on SMP.
	897	*/
	898	void run_local_timers(void)
	899	{
	900	raise_softirq(TIMER_SOFTIRQ);
	901	}
	902
	903	/*
	904	* Called by the timer interrupt. xtime_lock must already be taken
	905	* by the timer IRQ!
	906	*/
	907	static inline void update_times(void)
	908	{
	909	unsigned long ticks;
	910
	911	ticks = jiffies - wall_jiffies;
	912	if (ticks) {
	913	wall_jiffies += ticks;
	914	update_wall_time(ticks);
	915	}
	916	calc_load(ticks);
	917	}
	918
	919	/*
	920	* The 64-bit jiffies value is not atomic - you MUST NOT read it
	921	* without sampling the sequence number in xtime_lock.
	922	* jiffies is defined in the linker script...
	923	*/
	924
	925	void do_timer(struct pt_regs *regs)
	926	{
	927	jiffies_64++;
	928	update_times();
	929	softlockup_tick(regs);
	930	}
	931
	932	#ifdef __ARCH_WANT_SYS_ALARM
	933
	934	/*
	935	* For backwards compatibility? This can be done in libc so Alpha
	936	* and all newer ports shouldn't need it.
	937	*/
	938	asmlinkage unsigned long sys_alarm(unsigned int seconds)
	939	{
	940	struct itimerval it_new, it_old;
	941	unsigned int oldalarm;
	942
	943	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
	944	it_new.it_value.tv_sec = seconds;
	945	it_new.it_value.tv_usec = 0;
	946	do_setitimer(ITIMER_REAL, &it_new, &it_old);
	947	oldalarm = it_old.it_value.tv_sec;
	948	/* ehhh.. We can't return 0 if we have an alarm pending.. */
	949	/* And we'd better return too much than too little anyway */
	950	if ((!oldalarm && it_old.it_value.tv_usec) \|\| it_old.it_value.tv_usec >= 500000)
	951	oldalarm++;
	952	return oldalarm;
	953	}
	954
	955	#endif
	956
	957	#ifndef __alpha__
	958
	959	/*
	960	* The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
	961	* should be moved into arch/i386 instead?
	962	*/
	963
	964	/**
	965	* sys_getpid - return the thread group id of the current process
	966	*
	967	* Note, despite the name, this returns the tgid not the pid. The tgid and
	968	* the pid are identical unless CLONE_THREAD was specified on clone() in
	969	* which case the tgid is the same in all threads of the same group.
	970	*
	971	* This is SMP safe as current->tgid does not change.
	972	*/
	973	asmlinkage long sys_getpid(void)
	974	{
	975	return current->tgid;
	976	}
	977
	978	/*
	979	* Accessing ->group_leader->real_parent is not SMP-safe, it could
	980	* change from under us. However, rather than getting any lock
	981	* we can use an optimistic algorithm: get the parent
	982	* pid, and go back and check that the parent is still
	983	* the same. If it has changed (which is extremely unlikely
	984	* indeed), we just try again..
	985	*
	986	* NOTE! This depends on the fact that even if we _do_
	987	* get an old value of "parent", we can happily dereference
	988	* the pointer (it was and remains a dereferencable kernel pointer
	989	* no matter what): we just can't necessarily trust the result
	990	* until we know that the parent pointer is valid.
	991	*
	992	* NOTE2: ->group_leader never changes from under us.
	993	*/
	994	asmlinkage long sys_getppid(void)
	995	{
	996	int pid;
	997	struct task_struct *me = current;
	998	struct task_struct *parent;
	999
	1000	parent = me->group_leader->real_parent;
	1001	for (;;) {
	1002	pid = parent->tgid;
	1003	#if defined(CONFIG_SMP) \|\| defined(CONFIG_PREEMPT)
	1004	{
	1005	struct task_struct *old = parent;
	1006
	1007	/*
	1008	* Make sure we read the pid before re-reading the
	1009	* parent pointer:
	1010	*/
	1011	smp_rmb();
	1012	parent = me->group_leader->real_parent;
	1013	if (old != parent)
	1014	continue;
	1015	}
	1016	#endif
	1017	break;
	1018	}
	1019	return pid;
	1020	}
	1021
	1022	asmlinkage long sys_getuid(void)
	1023	{
	1024	/* Only we change this so SMP safe */
	1025	return current->uid;
	1026	}
	1027
	1028	asmlinkage long sys_geteuid(void)
	1029	{
	1030	/* Only we change this so SMP safe */
	1031	return current->euid;
	1032	}
	1033
	1034	asmlinkage long sys_getgid(void)
	1035	{
	1036	/* Only we change this so SMP safe */
	1037	return current->gid;
	1038	}
	1039
	1040	asmlinkage long sys_getegid(void)
	1041	{
	1042	/* Only we change this so SMP safe */
	1043	return current->egid;
	1044	}
	1045
	1046	#endif
	1047
	1048	static void process_timeout(unsigned long __data)
	1049	{
	1050	wake_up_process((task_t *)__data);
	1051	}
	1052
	1053	/**
	1054	* schedule_timeout - sleep until timeout
	1055	* @timeout: timeout value in jiffies
	1056	*
	1057	* Make the current task sleep until @timeout jiffies have
	1058	* elapsed. The routine will return immediately unless
	1059	* the current task state has been set (see set_current_state()).
	1060	*
	1061	* You can set the task state as follows -
	1062	*
	1063	* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
	1064	* pass before the routine returns. The routine will return 0
	1065	*
	1066	* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
	1067	* delivered to the current task. In this case the remaining time
	1068	* in jiffies will be returned, or 0 if the timer expired in time
	1069	*
	1070	* The current task state is guaranteed to be TASK_RUNNING when this
	1071	* routine returns.
	1072	*
	1073	* Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
	1074	* the CPU away without a bound on the timeout. In this case the return
	1075	* value will be %MAX_SCHEDULE_TIMEOUT.
	1076	*
	1077	* In all cases the return value is guaranteed to be non-negative.
	1078	*/
	1079	fastcall signed long __sched schedule_timeout(signed long timeout)
	1080	{
	1081	struct timer_list timer;
	1082	unsigned long expire;
	1083
	1084	switch (timeout)
	1085	{
	1086	case MAX_SCHEDULE_TIMEOUT:
	1087	/*
	1088	* These two special cases are useful to be comfortable
	1089	* in the caller. Nothing more. We could take
	1090	* MAX_SCHEDULE_TIMEOUT from one of the negative value
	1091	* but I' d like to return a valid offset (>=0) to allow
	1092	* the caller to do everything it want with the retval.
	1093	*/
	1094	schedule();
	1095	goto out;
	1096	default:
	1097	/*
	1098	* Another bit of PARANOID. Note that the retval will be
	1099	* 0 since no piece of kernel is supposed to do a check
	1100	* for a negative retval of schedule_timeout() (since it
	1101	* should never happens anyway). You just have the printk()
	1102	* that will tell you if something is gone wrong and where.
	1103	*/
	1104	if (timeout < 0)
	1105	{
	1106	printk(KERN_ERR "schedule_timeout: wrong timeout "
	1107	"value %lx from %p\n", timeout,
	1108	__builtin_return_address(0));
	1109	current->state = TASK_RUNNING;
	1110	goto out;
	1111	}
	1112	}
	1113
	1114	expire = timeout + jiffies;
	1115
	1116	setup_timer(&timer, process_timeout, (unsigned long)current);
	1117	__mod_timer(&timer, expire);
	1118	schedule();
	1119	del_singleshot_timer_sync(&timer);
	1120
	1121	timeout = expire - jiffies;
	1122
	1123	out:
	1124	return timeout < 0 ? 0 : timeout;
	1125	}
	1126	EXPORT_SYMBOL(schedule_timeout);
	1127
	1128	/*
	1129	* We can use __set_current_state() here because schedule_timeout() calls
	1130	* schedule() unconditionally.
	1131	*/
	1132	signed long __sched schedule_timeout_interruptible(signed long timeout)
	1133	{
	1134	__set_current_state(TASK_INTERRUPTIBLE);
	1135	return schedule_timeout(timeout);
	1136	}
	1137	EXPORT_SYMBOL(schedule_timeout_interruptible);
	1138
	1139	signed long __sched schedule_timeout_uninterruptible(signed long timeout)
	1140	{
	1141	__set_current_state(TASK_UNINTERRUPTIBLE);
	1142	return schedule_timeout(timeout);
	1143	}
	1144	EXPORT_SYMBOL(schedule_timeout_uninterruptible);
	1145
	1146	/* Thread ID - the internal kernel "pid" */
	1147	asmlinkage long sys_gettid(void)
	1148	{
	1149	return current->pid;
	1150	}
	1151
	1152	/*
	1153	* sys_sysinfo - fill in sysinfo struct
	1154	*/
	1155	asmlinkage long sys_sysinfo(struct sysinfo __user *info)
	1156	{
	1157	struct sysinfo val;
	1158	unsigned long mem_total, sav_total;
	1159	unsigned int mem_unit, bitcount;
	1160	unsigned long seq;
	1161
	1162	memset((char *)&val, 0, sizeof(struct sysinfo));
	1163
	1164	do {
	1165	struct timespec tp;
	1166	seq = read_seqbegin(&xtime_lock);
	1167
	1168	/*
	1169	* This is annoying. The below is the same thing
	1170	* posix_get_clock_monotonic() does, but it wants to
	1171	* take the lock which we want to cover the loads stuff
	1172	* too.
	1173	*/
	1174
	1175	getnstimeofday(&tp);
	1176	tp.tv_sec += wall_to_monotonic.tv_sec;
	1177	tp.tv_nsec += wall_to_monotonic.tv_nsec;
	1178	if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
	1179	tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
	1180	tp.tv_sec++;
	1181	}
	1182	val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
	1183
	1184	val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
	1185	val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
	1186	val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
	1187
	1188	val.procs = nr_threads;
	1189	} while (read_seqretry(&xtime_lock, seq));
	1190
	1191	si_meminfo(&val);
	1192	si_swapinfo(&val);
	1193
	1194	/*
	1195	* If the sum of all the available memory (i.e. ram + swap)
	1196	* is less than can be stored in a 32 bit unsigned long then
	1197	* we can be binary compatible with 2.2.x kernels. If not,
	1198	* well, in that case 2.2.x was broken anyways...
	1199	*
	1200	* -Erik Andersen <[email protected]>
	1201	*/
	1202
	1203	mem_total = val.totalram + val.totalswap;
	1204	if (mem_total < val.totalram \|\| mem_total < val.totalswap)
	1205	goto out;
	1206	bitcount = 0;
	1207	mem_unit = val.mem_unit;
	1208	while (mem_unit > 1) {
	1209	bitcount++;
	1210	mem_unit >>= 1;
	1211	sav_total = mem_total;
	1212	mem_total <<= 1;
	1213	if (mem_total < sav_total)
	1214	goto out;
	1215	}
	1216
	1217	/*
	1218	* If mem_total did not overflow, multiply all memory values by
	1219	* val.mem_unit and set it to 1. This leaves things compatible
	1220	* with 2.2.x, and also retains compatibility with earlier 2.4.x
	1221	* kernels...
	1222	*/
	1223
	1224	val.mem_unit = 1;
	1225	val.totalram <<= bitcount;
	1226	val.freeram <<= bitcount;
	1227	val.sharedram <<= bitcount;
	1228	val.bufferram <<= bitcount;
	1229	val.totalswap <<= bitcount;
	1230	val.freeswap <<= bitcount;
	1231	val.totalhigh <<= bitcount;
	1232	val.freehigh <<= bitcount;
	1233
	1234	out:
	1235	if (copy_to_user(info, &val, sizeof(struct sysinfo)))
	1236	return -EFAULT;
	1237
	1238	return 0;
	1239	}
	1240
	1241	static void __devinit init_timers_cpu(int cpu)
	1242	{
	1243	int j;
	1244	tvec_base_t *base;
	1245
	1246	base = &per_cpu(tvec_bases, cpu);
	1247	spin_lock_init(&base->t_base.lock);
	1248	for (j = 0; j < TVN_SIZE; j++) {
	1249	INIT_LIST_HEAD(base->tv5.vec + j);
	1250	INIT_LIST_HEAD(base->tv4.vec + j);
	1251	INIT_LIST_HEAD(base->tv3.vec + j);
	1252	INIT_LIST_HEAD(base->tv2.vec + j);
	1253	}
	1254	for (j = 0; j < TVR_SIZE; j++)
	1255	INIT_LIST_HEAD(base->tv1.vec + j);
	1256
	1257	base->timer_jiffies = jiffies;
	1258	}
	1259
	1260	#ifdef CONFIG_HOTPLUG_CPU
	1261	static void migrate_timer_list(tvec_base_t new_base, struct list_head head)
	1262	{
	1263	struct timer_list *timer;
	1264
	1265	while (!list_empty(head)) {
	1266	timer = list_entry(head->next, struct timer_list, entry);
	1267	detach_timer(timer, 0);
	1268	timer->base = &new_base->t_base;
	1269	internal_add_timer(new_base, timer);
	1270	}
	1271	}
	1272
	1273	static void __devinit migrate_timers(int cpu)
	1274	{
	1275	tvec_base_t *old_base;
	1276	tvec_base_t *new_base;
	1277	int i;
	1278
	1279	BUG_ON(cpu_online(cpu));
	1280	old_base = &per_cpu(tvec_bases, cpu);
	1281	new_base = &get_cpu_var(tvec_bases);
	1282
	1283	local_irq_disable();
	1284	spin_lock(&new_base->t_base.lock);
	1285	spin_lock(&old_base->t_base.lock);
	1286
	1287	if (old_base->t_base.running_timer)
	1288	BUG();
	1289	for (i = 0; i < TVR_SIZE; i++)
	1290	migrate_timer_list(new_base, old_base->tv1.vec + i);
	1291	for (i = 0; i < TVN_SIZE; i++) {
	1292	migrate_timer_list(new_base, old_base->tv2.vec + i);
	1293	migrate_timer_list(new_base, old_base->tv3.vec + i);
	1294	migrate_timer_list(new_base, old_base->tv4.vec + i);
	1295	migrate_timer_list(new_base, old_base->tv5.vec + i);
	1296	}
	1297
	1298	spin_unlock(&old_base->t_base.lock);
	1299	spin_unlock(&new_base->t_base.lock);
	1300	local_irq_enable();
	1301	put_cpu_var(tvec_bases);
	1302	}
	1303	#endif /* CONFIG_HOTPLUG_CPU */
	1304
	1305	static int __devinit timer_cpu_notify(struct notifier_block *self,
	1306	unsigned long action, void *hcpu)
	1307	{
	1308	long cpu = (long)hcpu;
	1309	switch(action) {
	1310	case CPU_UP_PREPARE:
	1311	init_timers_cpu(cpu);
	1312	break;
	1313	#ifdef CONFIG_HOTPLUG_CPU
	1314	case CPU_DEAD:
	1315	migrate_timers(cpu);
	1316	break;
	1317	#endif
	1318	default:
	1319	break;
	1320	}
	1321	return NOTIFY_OK;
	1322	}
	1323
/* Notifier block that routes CPU notifier events to timer_cpu_notify(). */
static struct notifier_block __devinitdata timers_nb = {
	.notifier_call = timer_cpu_notify,
};
	1327
	1328
	1329	void __init init_timers(void)
	1330	{
	1331	timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
	1332	(void *)(long)smp_processor_id());
	1333	register_cpu_notifier(&timers_nb);
	1334	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
	1335	}
	1336
	1337	#ifdef CONFIG_TIME_INTERPOLATION
	1338
/* The interpolator currently in use; NULL when none has been selected. */
struct time_interpolator *time_interpolator;
/* Singly linked list (through ->next) of all registered interpolators. */
static struct time_interpolator *time_interpolator_list;
/* Taken by register/unregister around list and selection updates. */
static DEFINE_SPINLOCK(time_interpolator_lock);
	1342
	1343	static inline u64 time_interpolator_get_cycles(unsigned int src)
	1344	{
	1345	unsigned long (*x)(void);
	1346
	1347	switch (src)
	1348	{
	1349	case TIME_SOURCE_FUNCTION:
	1350	x = time_interpolator->addr;
	1351	return x();
	1352
	1353	case TIME_SOURCE_MMIO64 :
	1354	return readq_relaxed((void __iomem *)time_interpolator->addr);
	1355
	1356	case TIME_SOURCE_MMIO32 :
	1357	return readl_relaxed((void __iomem *)time_interpolator->addr);
	1358
	1359	default: return get_cycles();
	1360	}
	1361	}
	1362
/*
 * Return the current counter value of the active interpolator.
 *
 * @writelock: non-zero when the caller holds the xtime write lock.
 *
 * If the interpolator is flagged as having jitter, the value returned is
 * filtered through ->last_cycle: a reading that lies behind the recorded
 * last cycle is replaced by that last cycle, and otherwise ->last_cycle is
 * updated to the new reading.  Without jitter the raw cycle value is
 * returned directly.
 */
static inline u64 time_interpolator_get_counter(int writelock)
{
	unsigned int src = time_interpolator->source;

	if (time_interpolator->jitter)
	{
		u64 lcycle;
		u64 now;

		do {
			lcycle = time_interpolator->last_cycle;
			now = time_interpolator_get_cycles(src);
			/* New reading is behind the recorded one: reuse the old value. */
			if (lcycle && time_after(lcycle, now))
				return lcycle;

			/* When holding the xtime write lock, there's no need
			 * to add the overhead of the cmpxchg. Readers are
			 * forced to retry until the write lock is released.
			 */
			if (writelock) {
				time_interpolator->last_cycle = now;
				return now;
			}
			/* Keep track of the last timer value returned. The use of cmpxchg here
			 * will cause contention in an SMP environment.
			 * Retry if ->last_cycle changed since it was sampled.
			 */
		} while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
		return now;
	}
	else
		return time_interpolator_get_cycles(src);
}
	1395
	1396	void time_interpolator_reset(void)
	1397	{
	1398	time_interpolator->offset = 0;
	1399	time_interpolator->last_counter = time_interpolator_get_counter(1);
	1400	}
	1401
	1402	#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
	1403
	1404	unsigned long time_interpolator_get_offset(void)
	1405	{
	1406	/* If we do not have a time interpolator set up then just return zero */
	1407	if (!time_interpolator)
	1408	return 0;
	1409
	1410	return time_interpolator->offset +
	1411	GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
	1412	}
	1413
	1414	#define INTERPOLATOR_ADJUST 65536
	1415	#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
	1416
	1417	static void time_interpolator_update(long delta_nsec)
	1418	{
	1419	u64 counter;
	1420	unsigned long offset;
	1421
	1422	/* If there is no time interpolator set up then do nothing */
	1423	if (!time_interpolator)
	1424	return;
	1425
	1426	/*
	1427	* The interpolator compensates for late ticks by accumulating the late
	1428	* time in time_interpolator->offset. A tick earlier than expected will
	1429	* lead to a reset of the offset and a corresponding jump of the clock
	1430	* forward. Again this only works if the interpolator clock is running
	1431	* slightly slower than the regular clock and the tuning logic insures
	1432	* that.
	1433	*/
	1434
	1435	counter = time_interpolator_get_counter(1);
	1436	offset = time_interpolator->offset +
	1437	GET_TI_NSECS(counter, time_interpolator);
	1438
	1439	if (delta_nsec < 0 \|\| (unsigned long) delta_nsec < offset)
	1440	time_interpolator->offset = offset - delta_nsec;
	1441	else {
	1442	time_interpolator->skips++;
	1443	time_interpolator->ns_skipped += delta_nsec - offset;
	1444	time_interpolator->offset = 0;
	1445	}
	1446	time_interpolator->last_counter = counter;
	1447
	1448	/* Tuning logic for time interpolator invoked every minute or so.
	1449	* Decrease interpolator clock speed if no skips occurred and an offset is carried.
	1450	* Increase interpolator clock speed if we skip too much time.
	1451	*/
	1452	if (jiffies % INTERPOLATOR_ADJUST == 0)
	1453	{
	1454	if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
	1455	time_interpolator->nsec_per_cyc--;
	1456	if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
	1457	time_interpolator->nsec_per_cyc++;
	1458	time_interpolator->skips = 0;
	1459	time_interpolator->ns_skipped = 0;
	1460	}
	1461	}
	1462
	1463	static inline int
	1464	is_better_time_interpolator(struct time_interpolator *new)
	1465	{
	1466	if (!time_interpolator)
	1467	return 1;
	1468	return new->frequency > 2*time_interpolator->frequency \|\|
	1469	(unsigned long)new->drift < (unsigned long)time_interpolator->drift;
	1470	}
	1471
	1472	void
	1473	register_time_interpolator(struct time_interpolator *ti)
	1474	{
	1475	unsigned long flags;
	1476
	1477	/* Sanity check */
	1478	if (ti->frequency == 0 \|\| ti->mask == 0)
	1479	BUG();
	1480
	1481	ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
	1482	spin_lock(&time_interpolator_lock);
	1483	write_seqlock_irqsave(&xtime_lock, flags);
	1484	if (is_better_time_interpolator(ti)) {
	1485	time_interpolator = ti;
	1486	time_interpolator_reset();
	1487	}
	1488	write_sequnlock_irqrestore(&xtime_lock, flags);
	1489
	1490	ti->next = time_interpolator_list;
	1491	time_interpolator_list = ti;
	1492	spin_unlock(&time_interpolator_lock);
	1493	}
	1494
	1495	void
	1496	unregister_time_interpolator(struct time_interpolator *ti)
	1497	{
	1498	struct time_interpolator curr, *prev;
	1499	unsigned long flags;
	1500
	1501	spin_lock(&time_interpolator_lock);
	1502	prev = &time_interpolator_list;
	1503	for (curr = *prev; curr; curr = curr->next) {
	1504	if (curr == ti) {
	1505	*prev = curr->next;
	1506	break;
	1507	}
	1508	prev = &curr->next;
	1509	}
	1510
	1511	write_seqlock_irqsave(&xtime_lock, flags);
	1512	if (ti == time_interpolator) {
	1513	/* we lost the best time-interpolator: */
	1514	time_interpolator = NULL;
	1515	/* find the next-best interpolator */
	1516	for (curr = time_interpolator_list; curr; curr = curr->next)
	1517	if (is_better_time_interpolator(curr))
	1518	time_interpolator = curr;
	1519	time_interpolator_reset();
	1520	}
	1521	write_sequnlock_irqrestore(&xtime_lock, flags);
	1522	spin_unlock(&time_interpolator_lock);
	1523	}
	1524	#endif /* CONFIG_TIME_INTERPOLATION */
	1525
	1526	/**
	1527	* msleep - sleep safely even with waitqueue interruptions
	1528	* @msecs: Time in milliseconds to sleep for
	1529	*/
	1530	void msleep(unsigned int msecs)
	1531	{
	1532	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
	1533
	1534	while (timeout)
	1535	timeout = schedule_timeout_uninterruptible(timeout);
	1536	}
	1537
	1538	EXPORT_SYMBOL(msleep);
	1539
	1540	/**
	1541	* msleep_interruptible - sleep waiting for signals
	1542	* @msecs: Time in milliseconds to sleep for
	1543	*/
	1544	unsigned long msleep_interruptible(unsigned int msecs)
	1545	{
	1546	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
	1547
	1548	while (timeout && !signal_pending(current))
	1549	timeout = schedule_timeout_interruptible(timeout);
	1550	return jiffies_to_msecs(timeout);
	1551	}
	1552
	1553	EXPORT_SYMBOL(msleep_interruptible);