Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* CFQ, or complete fairness queueing, disk scheduler.
	3	*
	4	* Based on ideas from a previously unfinished io
	5	* scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
	6	*
	7	* Copyright (C) 2003 Jens Axboe <[email protected]>
	8	*/
	9	#include <linux/module.h>
	10	#include <linux/slab.h>
	11	#include <linux/blkdev.h>
	12	#include <linux/elevator.h>
	13	#include <linux/jiffies.h>
	14	#include <linux/rbtree.h>
	15	#include <linux/ioprio.h>
	16	#include <linux/blktrace_api.h>
	17	#include "blk.h"
	18	#include "blk-cgroup.h"
	19
	20	/*
	21	* tunables
	22	*/
	23	/* max queue in one round of service */
	24	static const int cfq_quantum = 8;
	25	static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
	26	/* maximum backwards seek, in KiB */
	27	static const int cfq_back_max = 16 * 1024;
	28	/* penalty of a backwards seek */
	29	static const int cfq_back_penalty = 2;
	30	static const int cfq_slice_sync = HZ / 10;
	31	static int cfq_slice_async = HZ / 25;
	32	static const int cfq_slice_async_rq = 2;
	33	static int cfq_slice_idle = HZ / 125;
	34	static int cfq_group_idle = HZ / 125;
	35	static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
	36	static const int cfq_hist_divisor = 4;
	37
	38	/*
	39	* offset from end of service tree
	40	*/
	41	#define CFQ_IDLE_DELAY (HZ / 5)
	42
	43	/*
	44	* below this threshold, we consider thinktime immediate
	45	*/
	46	#define CFQ_MIN_TT (2)
	47
	48	#define CFQ_SLICE_SCALE (5)
	49	#define CFQ_HW_QUEUE_MIN (5)
	50	#define CFQ_SERVICE_SHIFT 12
	51
	52	#define CFQQ_SEEK_THR (sector_t)(8 * 100)
	53	#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
	54	#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
	55	#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
	56
	57	#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
	58	#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
	59	#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
	60
	61	static struct kmem_cache *cfq_pool;
	62
	63	#define CFQ_PRIO_LISTS IOPRIO_BE_NR
	64	#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
	65	#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
	66
	67	#define sample_valid(samples) ((samples) > 80)
	68	#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
	69
	70	struct cfq_ttime {
	71	unsigned long last_end_request;
	72
	73	unsigned long ttime_total;
	74	unsigned long ttime_samples;
	75	unsigned long ttime_mean;
	76	};
	77
	78	/*
	79	* Most of our rbtree usage is for sorting with min extraction, so
	80	* if we cache the leftmost node we don't have to walk down the tree
	81	* to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
	82	* move this into the elevator for the rq sorting as well.
	83	*/
	84	struct cfq_rb_root {
	85	struct rb_root rb;
	86	struct rb_node *left;
	87	unsigned count;
	88	u64 min_vdisktime;
	89	struct cfq_ttime ttime;
	90	};
	91	#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \
	92	.ttime = {.last_end_request = jiffies,},}
	93
	94	/*
	95	* Per process-grouping structure
	96	*/
	97	struct cfq_queue {
	98	/* reference count */
	99	int ref;
	100	/* various state flags, see below */
	101	unsigned int flags;
	102	/* parent cfq_data */
	103	struct cfq_data *cfqd;
	104	/* service_tree member */
	105	struct rb_node rb_node;
	106	/* service_tree key */
	107	unsigned long rb_key;
	108	/* prio tree member */
	109	struct rb_node p_node;
	110	/* prio tree root we belong to, if any */
	111	struct rb_root *p_root;
	112	/* sorted list of pending requests */
	113	struct rb_root sort_list;
	114	/* if fifo isn't expired, next request to serve */
	115	struct request *next_rq;
	116	/* requests queued in sort_list */
	117	int queued[2];
	118	/* currently allocated requests */
	119	int allocated[2];
	120	/* fifo list of requests in sort_list */
	121	struct list_head fifo;
	122
	123	/* time when queue got scheduled in to dispatch first request. */
	124	unsigned long dispatch_start;
	125	unsigned int allocated_slice;
	126	unsigned int slice_dispatch;
	127	/* time when first request from queue completed and slice started. */
	128	unsigned long slice_start;
	129	unsigned long slice_end;
	130	long slice_resid;
	131
	132	/* pending priority requests */
	133	int prio_pending;
	134	/* number of requests that are on the dispatch list or inside driver */
	135	int dispatched;
	136
	137	/* io prio of this group */
	138	unsigned short ioprio, org_ioprio;
	139	unsigned short ioprio_class;
	140
	141	pid_t pid;
	142
	143	u32 seek_history;
	144	sector_t last_request_pos;
	145
	146	struct cfq_rb_root *service_tree;
	147	struct cfq_queue *new_cfqq;
	148	struct cfq_group *cfqg;
	149	/* Number of sectors dispatched from queue in single dispatch round */
	150	unsigned long nr_sectors;
	151	};
	152
	153	/*
	154	* First index in the service_trees.
	155	* IDLE is handled separately, so it has negative index
	156	*/
	157	enum wl_class_t {
	158	BE_WORKLOAD = 0,
	159	RT_WORKLOAD = 1,
	160	IDLE_WORKLOAD = 2,
	161	CFQ_PRIO_NR,
	162	};
	163
	164	/*
	165	* Second index in the service_trees.
	166	*/
	167	enum wl_type_t {
	168	ASYNC_WORKLOAD = 0,
	169	SYNC_NOIDLE_WORKLOAD = 1,
	170	SYNC_WORKLOAD = 2
	171	};
	172
	173	struct cfqg_stats {
	174	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	175	/* total bytes transferred */
	176	struct blkg_rwstat service_bytes;
	177	/* total IOs serviced, post merge */
	178	struct blkg_rwstat serviced;
	179	/* number of ios merged */
	180	struct blkg_rwstat merged;
	181	/* total time spent on device in ns, may not be accurate w/ queueing */
	182	struct blkg_rwstat service_time;
	183	/* total time spent waiting in scheduler queue in ns */
	184	struct blkg_rwstat wait_time;
	185	/* number of IOs queued up */
	186	struct blkg_rwstat queued;
	187	/* total sectors transferred */
	188	struct blkg_stat sectors;
	189	/* total disk time and nr sectors dispatched by this group */
	190	struct blkg_stat time;
	191	#ifdef CONFIG_DEBUG_BLK_CGROUP
	192	/* time not charged to this cgroup */
	193	struct blkg_stat unaccounted_time;
	194	/* sum of number of ios queued across all samples */
	195	struct blkg_stat avg_queue_size_sum;
	196	/* count of samples taken for average */
	197	struct blkg_stat avg_queue_size_samples;
	198	/* how many times this group has been removed from service tree */
	199	struct blkg_stat dequeue;
	200	/* total time spent waiting for it to be assigned a timeslice. */
	201	struct blkg_stat group_wait_time;
	202	/* time spent idling for this blkcg_gq */
	203	struct blkg_stat idle_time;
	204	/* total time with empty current active q with other requests queued */
	205	struct blkg_stat empty_time;
	206	/* fields after this shouldn't be cleared on stat reset */
	207	uint64_t start_group_wait_time;
	208	uint64_t start_idle_time;
	209	uint64_t start_empty_time;
	210	uint16_t flags;
	211	#endif /* CONFIG_DEBUG_BLK_CGROUP */
	212	#endif /* CONFIG_CFQ_GROUP_IOSCHED */
	213	};
	214
	215	/* This is per cgroup per device grouping structure */
	216	struct cfq_group {
	217	/* must be the first member */
	218	struct blkg_policy_data pd;
	219
	220	/* group service_tree member */
	221	struct rb_node rb_node;
	222
	223	/* group service_tree key */
	224	u64 vdisktime;
	225
	226	/*
	227	* The number of active cfqgs and sum of their weights under this
	228	* cfqg. This covers this cfqg's leaf_weight and all children's
	229	* weights, but does not cover weights of further descendants.
	230	*
	231	* If a cfqg is on the service tree, it's active. An active cfqg
	232	* also activates its parent and contributes to the children_weight
	233	* of the parent.
	234	*/
	235	int nr_active;
	236	unsigned int children_weight;
	237
	238	/*
	239	* vfraction is the fraction of vdisktime that the tasks in this
	240	* cfqg are entitled to. This is determined by compounding the
	241	* ratios walking up from this cfqg to the root.
	242	*
	243	* It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
	244	* vfractions on a service tree is approximately 1. The sum may
	245	* deviate a bit due to rounding errors and fluctuations caused by
	246	* cfqgs entering and leaving the service tree.
	247	*/
	248	unsigned int vfraction;
	249
	250	/*
	251	* There are two weights - (internal) weight is the weight of this
	252	* cfqg against the sibling cfqgs. leaf_weight is the wight of
	253	* this cfqg against the child cfqgs. For the root cfqg, both
	254	* weights are kept in sync for backward compatibility.
	255	*/
	256	unsigned int weight;
	257	unsigned int new_weight;
	258	unsigned int dev_weight;
	259
	260	unsigned int leaf_weight;
	261	unsigned int new_leaf_weight;
	262	unsigned int dev_leaf_weight;
	263
	264	/* number of cfqq currently on this group */
	265	int nr_cfqq;
	266
	267	/*
	268	* Per group busy queues average. Useful for workload slice calc. We
	269	* create the array for each prio class but at run time it is used
	270	* only for RT and BE class and slot for IDLE class remains unused.
	271	* This is primarily done to avoid confusion and a gcc warning.
	272	*/
	273	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	274	/*
	275	* rr lists of queues with requests. We maintain service trees for
	276	* RT and BE classes. These trees are subdivided in subclasses
	277	* of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
	278	* class there is no subclassification and all the cfq queues go on
	279	* a single tree service_tree_idle.
	280	* Counts are embedded in the cfq_rb_root
	281	*/
	282	struct cfq_rb_root service_trees[2][3];
	283	struct cfq_rb_root service_tree_idle;
	284
	285	unsigned long saved_wl_slice;
	286	enum wl_type_t saved_wl_type;
	287	enum wl_class_t saved_wl_class;
	288
	289	/* number of requests that are on the dispatch list or inside driver */
	290	int dispatched;
	291	struct cfq_ttime ttime;
	292	struct cfqg_stats stats; /* stats for this cfqg */
	293	struct cfqg_stats dead_stats; /* stats pushed from dead children */
	294	};
	295
	296	struct cfq_io_cq {
	297	struct io_cq icq; /* must be the first member */
	298	struct cfq_queue *cfqq[2];
	299	struct cfq_ttime ttime;
	300	int ioprio; /* the current ioprio */
	301	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	302	uint64_t blkcg_id; /* the current blkcg ID */
	303	#endif
	304	};
	305
	306	/*
	307	* Per block device queue structure
	308	*/
	309	struct cfq_data {
	310	struct request_queue *queue;
	311	/* Root service tree for cfq_groups */
	312	struct cfq_rb_root grp_service_tree;
	313	struct cfq_group *root_group;
	314
	315	/*
	316	* The priority currently being served
	317	*/
	318	enum wl_class_t serving_wl_class;
	319	enum wl_type_t serving_wl_type;
	320	unsigned long workload_expires;
	321	struct cfq_group *serving_group;
	322
	323	/*
	324	* Each priority tree is sorted by next_request position. These
	325	* trees are used when determining if two or more queues are
	326	* interleaving requests (see cfq_close_cooperator).
	327	*/
	328	struct rb_root prio_trees[CFQ_PRIO_LISTS];
	329
	330	unsigned int busy_queues;
	331	unsigned int busy_sync_queues;
	332
	333	int rq_in_driver;
	334	int rq_in_flight[2];
	335
	336	/*
	337	* queue-depth detection
	338	*/
	339	int rq_queued;
	340	int hw_tag;
	341	/*
	342	* hw_tag can be
	343	* -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	344	* 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	345	* 0 => no NCQ
	346	*/
	347	int hw_tag_est_depth;
	348	unsigned int hw_tag_samples;
	349
	350	/*
	351	* idle window management
	352	*/
	353	struct timer_list idle_slice_timer;
	354	struct work_struct unplug_work;
	355
	356	struct cfq_queue *active_queue;
	357	struct cfq_io_cq *active_cic;
	358
	359	/*
	360	* async queue for each priority case
	361	*/
	362	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	363	struct cfq_queue *async_idle_cfqq;
	364
	365	sector_t last_position;
	366
	367	/*
	368	* tunables, see top of file
	369	*/
	370	unsigned int cfq_quantum;
	371	unsigned int cfq_fifo_expire[2];
	372	unsigned int cfq_back_penalty;
	373	unsigned int cfq_back_max;
	374	unsigned int cfq_slice[2];
	375	unsigned int cfq_slice_async_rq;
	376	unsigned int cfq_slice_idle;
	377	unsigned int cfq_group_idle;
	378	unsigned int cfq_latency;
	379	unsigned int cfq_target_latency;
	380
	381	/*
	382	* Fallback dummy cfqq for extreme OOM conditions
	383	*/
	384	struct cfq_queue oom_cfqq;
	385
	386	unsigned long last_delayed_sync;
	387	};
	388
	389	static struct cfq_group cfq_get_next_cfqg(struct cfq_data cfqd);
	390
	391	static struct cfq_rb_root st_for(struct cfq_group cfqg,
	392	enum wl_class_t class,
	393	enum wl_type_t type)
	394	{
	395	if (!cfqg)
	396	return NULL;
	397
	398	if (class == IDLE_WORKLOAD)
	399	return &cfqg->service_tree_idle;
	400
	401	return &cfqg->service_trees[class][type];
	402	}
	403
	404	enum cfqq_state_flags {
	405	CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
	406	CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
	407	CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
	408	CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
	409	CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
	410	CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
	411	CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
	412	CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
	413	CFQ_CFQQ_FLAG_sync, /* synchronous queue */
	414	CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
	415	CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */
	416	CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
	417	CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
	418	};
	419
	420	#define CFQ_CFQQ_FNS(name) \
	421	static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
	422	{ \
	423	(cfqq)->flags \|= (1 << CFQ_CFQQ_FLAG_##name); \
	424	} \
	425	static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
	426	{ \
	427	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
	428	} \
	429	static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
	430	{ \
	431	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
	432	}
	433
	434	CFQ_CFQQ_FNS(on_rr);
	435	CFQ_CFQQ_FNS(wait_request);
	436	CFQ_CFQQ_FNS(must_dispatch);
	437	CFQ_CFQQ_FNS(must_alloc_slice);
	438	CFQ_CFQQ_FNS(fifo_expire);
	439	CFQ_CFQQ_FNS(idle_window);
	440	CFQ_CFQQ_FNS(prio_changed);
	441	CFQ_CFQQ_FNS(slice_new);
	442	CFQ_CFQQ_FNS(sync);
	443	CFQ_CFQQ_FNS(coop);
	444	CFQ_CFQQ_FNS(split_coop);
	445	CFQ_CFQQ_FNS(deep);
	446	CFQ_CFQQ_FNS(wait_busy);
	447	#undef CFQ_CFQQ_FNS
	448
	449	static inline struct cfq_group pd_to_cfqg(struct blkg_policy_data pd)
	450	{
	451	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
	452	}
	453
	454	static inline struct blkcg_gq cfqg_to_blkg(struct cfq_group cfqg)
	455	{
	456	return pd_to_blkg(&cfqg->pd);
	457	}
	458
	459	#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
	460
	461	/* cfqg stats flags */
	462	enum cfqg_stats_flags {
	463	CFQG_stats_waiting = 0,
	464	CFQG_stats_idling,
	465	CFQG_stats_empty,
	466	};
	467
	468	#define CFQG_FLAG_FNS(name) \
	469	static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
	470	{ \
	471	stats->flags \|= (1 << CFQG_stats_##name); \
	472	} \
	473	static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
	474	{ \
	475	stats->flags &= ~(1 << CFQG_stats_##name); \
	476	} \
	477	static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
	478	{ \
	479	return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
	480	} \
	481
	482	CFQG_FLAG_FNS(waiting)
	483	CFQG_FLAG_FNS(idling)
	484	CFQG_FLAG_FNS(empty)
	485	#undef CFQG_FLAG_FNS
	486
	487	/* This should be called with the queue_lock held. */
	488	static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
	489	{
	490	unsigned long long now;
	491
	492	if (!cfqg_stats_waiting(stats))
	493	return;
	494
	495	now = sched_clock();
	496	if (time_after64(now, stats->start_group_wait_time))
	497	blkg_stat_add(&stats->group_wait_time,
	498	now - stats->start_group_wait_time);
	499	cfqg_stats_clear_waiting(stats);
	500	}
	501
	502	/* This should be called with the queue_lock held. */
	503	static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
	504	struct cfq_group *curr_cfqg)
	505	{
	506	struct cfqg_stats *stats = &cfqg->stats;
	507
	508	if (cfqg_stats_waiting(stats))
	509	return;
	510	if (cfqg == curr_cfqg)
	511	return;
	512	stats->start_group_wait_time = sched_clock();
	513	cfqg_stats_mark_waiting(stats);
	514	}
	515
	516	/* This should be called with the queue_lock held. */
	517	static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
	518	{
	519	unsigned long long now;
	520
	521	if (!cfqg_stats_empty(stats))
	522	return;
	523
	524	now = sched_clock();
	525	if (time_after64(now, stats->start_empty_time))
	526	blkg_stat_add(&stats->empty_time,
	527	now - stats->start_empty_time);
	528	cfqg_stats_clear_empty(stats);
	529	}
	530
	531	static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
	532	{
	533	blkg_stat_add(&cfqg->stats.dequeue, 1);
	534	}
	535
	536	static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
	537	{
	538	struct cfqg_stats *stats = &cfqg->stats;
	539
	540	if (blkg_rwstat_total(&stats->queued))
	541	return;
	542
	543	/*
	544	* group is already marked empty. This can happen if cfqq got new
	545	* request in parent group and moved to this group while being added
	546	* to service tree. Just ignore the event and move on.
	547	*/
	548	if (cfqg_stats_empty(stats))
	549	return;
	550
	551	stats->start_empty_time = sched_clock();
	552	cfqg_stats_mark_empty(stats);
	553	}
	554
	555	static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
	556	{
	557	struct cfqg_stats *stats = &cfqg->stats;
	558
	559	if (cfqg_stats_idling(stats)) {
	560	unsigned long long now = sched_clock();
	561
	562	if (time_after64(now, stats->start_idle_time))
	563	blkg_stat_add(&stats->idle_time,
	564	now - stats->start_idle_time);
	565	cfqg_stats_clear_idling(stats);
	566	}
	567	}
	568
	569	static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
	570	{
	571	struct cfqg_stats *stats = &cfqg->stats;
	572
	573	BUG_ON(cfqg_stats_idling(stats));
	574
	575	stats->start_idle_time = sched_clock();
	576	cfqg_stats_mark_idling(stats);
	577	}
	578
	579	static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
	580	{
	581	struct cfqg_stats *stats = &cfqg->stats;
	582
	583	blkg_stat_add(&stats->avg_queue_size_sum,
	584	blkg_rwstat_total(&stats->queued));
	585	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	586	cfqg_stats_update_group_wait_time(stats);
	587	}
	588
	589	#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
	590
	591	static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group cfqg, struct cfq_group curr_cfqg) { }
	592	static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
	593	static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
	594	static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
	595	static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
	596	static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
	597	static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
	598
	599	#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
	600
	601	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	602
	603	static struct blkcg_policy blkcg_policy_cfq;
	604
	605	static inline struct cfq_group blkg_to_cfqg(struct blkcg_gq blkg)
	606	{
	607	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
	608	}
	609
	610	static inline struct cfq_group cfqg_parent(struct cfq_group cfqg)
	611	{
	612	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
	613
	614	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
	615	}
	616
	617	static inline void cfqg_get(struct cfq_group *cfqg)
	618	{
	619	return blkg_get(cfqg_to_blkg(cfqg));
	620	}
	621
	622	static inline void cfqg_put(struct cfq_group *cfqg)
	623	{
	624	return blkg_put(cfqg_to_blkg(cfqg));
	625	}
	626
	627	#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
	628	char __pbuf[128]; \
	629	\
	630	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
	631	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
	632	cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
	633	cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
	634	__pbuf, ##args); \
	635	} while (0)
	636
	637	#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
	638	char __pbuf[128]; \
	639	\
	640	blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \
	641	blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \
	642	} while (0)
	643
	644	static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
	645	struct cfq_group *curr_cfqg, int rw)
	646	{
	647	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
	648	cfqg_stats_end_empty_time(&cfqg->stats);
	649	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
	650	}
	651
	652	static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
	653	unsigned long time, unsigned long unaccounted_time)
	654	{
	655	blkg_stat_add(&cfqg->stats.time, time);
	656	#ifdef CONFIG_DEBUG_BLK_CGROUP
	657	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
	658	#endif
	659	}
	660
	661	static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
	662	{
	663	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
	664	}
	665
	666	static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
	667	{
	668	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
	669	}
	670
	671	static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
	672	uint64_t bytes, int rw)
	673	{
	674	blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
	675	blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
	676	blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
	677	}
	678
	679	static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
	680	uint64_t start_time, uint64_t io_start_time, int rw)
	681	{
	682	struct cfqg_stats *stats = &cfqg->stats;
	683	unsigned long long now = sched_clock();
	684
	685	if (time_after64(now, io_start_time))
	686	blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
	687	if (time_after64(io_start_time, start_time))
	688	blkg_rwstat_add(&stats->wait_time, rw,
	689	io_start_time - start_time);
	690	}
	691
	692	/* @stats = 0 */
	693	static void cfqg_stats_reset(struct cfqg_stats *stats)
	694	{
	695	/* queued stats shouldn't be cleared */
	696	blkg_rwstat_reset(&stats->service_bytes);
	697	blkg_rwstat_reset(&stats->serviced);
	698	blkg_rwstat_reset(&stats->merged);
	699	blkg_rwstat_reset(&stats->service_time);
	700	blkg_rwstat_reset(&stats->wait_time);
	701	blkg_stat_reset(&stats->time);
	702	#ifdef CONFIG_DEBUG_BLK_CGROUP
	703	blkg_stat_reset(&stats->unaccounted_time);
	704	blkg_stat_reset(&stats->avg_queue_size_sum);
	705	blkg_stat_reset(&stats->avg_queue_size_samples);
	706	blkg_stat_reset(&stats->dequeue);
	707	blkg_stat_reset(&stats->group_wait_time);
	708	blkg_stat_reset(&stats->idle_time);
	709	blkg_stat_reset(&stats->empty_time);
	710	#endif
	711	}
	712
	713	/* @to += @from */
	714	static void cfqg_stats_merge(struct cfqg_stats to, struct cfqg_stats from)
	715	{
	716	/* queued stats shouldn't be cleared */
	717	blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
	718	blkg_rwstat_merge(&to->serviced, &from->serviced);
	719	blkg_rwstat_merge(&to->merged, &from->merged);
	720	blkg_rwstat_merge(&to->service_time, &from->service_time);
	721	blkg_rwstat_merge(&to->wait_time, &from->wait_time);
	722	blkg_stat_merge(&from->time, &from->time);
	723	#ifdef CONFIG_DEBUG_BLK_CGROUP
	724	blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
	725	blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	726	blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
	727	blkg_stat_merge(&to->dequeue, &from->dequeue);
	728	blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
	729	blkg_stat_merge(&to->idle_time, &from->idle_time);
	730	blkg_stat_merge(&to->empty_time, &from->empty_time);
	731	#endif
	732	}
	733
	734	/*
	735	* Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
	736	* recursive stats can still account for the amount used by this cfqg after
	737	* it's gone.
	738	*/
	739	static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
	740	{
	741	struct cfq_group *parent = cfqg_parent(cfqg);
	742
	743	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
	744
	745	if (unlikely(!parent))
	746	return;
	747
	748	cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
	749	cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
	750	cfqg_stats_reset(&cfqg->stats);
	751	cfqg_stats_reset(&cfqg->dead_stats);
	752	}
	753
	754	#else /* CONFIG_CFQ_GROUP_IOSCHED */
	755
	756	static inline struct cfq_group cfqg_parent(struct cfq_group cfqg) { return NULL; }
	757	static inline void cfqg_get(struct cfq_group *cfqg) { }
	758	static inline void cfqg_put(struct cfq_group *cfqg) { }
	759
	760	#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
	761	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
	762	cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
	763	cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
	764	##args)
	765	#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
	766
	767	static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
	768	struct cfq_group *curr_cfqg, int rw) { }
	769	static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
	770	unsigned long time, unsigned long unaccounted_time) { }
	771	static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
	772	static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
	773	static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
	774	uint64_t bytes, int rw) { }
	775	static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
	776	uint64_t start_time, uint64_t io_start_time, int rw) { }
	777
	778	#endif /* CONFIG_CFQ_GROUP_IOSCHED */
	779
	780	#define cfq_log(cfqd, fmt, args...) \
	781	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
	782
	783	/* Traverses through cfq group service trees */
	784	#define for_each_cfqg_st(cfqg, i, j, st) \
	785	for (i = 0; i <= IDLE_WORKLOAD; i++) \
	786	for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
	787	: &cfqg->service_tree_idle; \
	788	(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) \|\| \
	789	(i == IDLE_WORKLOAD && j == 0); \
	790	j++, st = i < IDLE_WORKLOAD ? \
	791	&cfqg->service_trees[i][j]: NULL) \
	792
	793	static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
	794	struct cfq_ttime *ttime, bool group_idle)
	795	{
	796	unsigned long slice;
	797	if (!sample_valid(ttime->ttime_samples))
	798	return false;
	799	if (group_idle)
	800	slice = cfqd->cfq_group_idle;
	801	else
	802	slice = cfqd->cfq_slice_idle;
	803	return ttime->ttime_mean > slice;
	804	}
	805
	806	static inline bool iops_mode(struct cfq_data *cfqd)
	807	{
	808	/*
	809	* If we are not idling on queues and it is a NCQ drive, parallel
	810	* execution of requests is on and measuring time is not possible
	811	* in most of the cases until and unless we drive shallower queue
	812	* depths and that becomes a performance bottleneck. In such cases
	813	* switch to start providing fairness in terms of number of IOs.
	814	*/
	815	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
	816	return true;
	817	else
	818	return false;
	819	}
	820
	821	static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
	822	{
	823	if (cfq_class_idle(cfqq))
	824	return IDLE_WORKLOAD;
	825	if (cfq_class_rt(cfqq))
	826	return RT_WORKLOAD;
	827	return BE_WORKLOAD;
	828	}
	829
	830
	831	static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
	832	{
	833	if (!cfq_cfqq_sync(cfqq))
	834	return ASYNC_WORKLOAD;
	835	if (!cfq_cfqq_idle_window(cfqq))
	836	return SYNC_NOIDLE_WORKLOAD;
	837	return SYNC_WORKLOAD;
	838	}
	839
	840	static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
	841	struct cfq_data *cfqd,
	842	struct cfq_group *cfqg)
	843	{
	844	if (wl_class == IDLE_WORKLOAD)
	845	return cfqg->service_tree_idle.count;
	846
	847	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
	848	cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
	849	cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
	850	}
	851
	852	static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
	853	struct cfq_group *cfqg)
	854	{
	855	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
	856	cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
	857	}
	858
	859	static void cfq_dispatch_insert(struct request_queue , struct request );
	860	static struct cfq_queue cfq_get_queue(struct cfq_data cfqd, bool is_sync,
	861	struct cfq_io_cq cic, struct bio bio,
	862	gfp_t gfp_mask);
	863
	864	static inline struct cfq_io_cq icq_to_cic(struct io_cq icq)
	865	{
	866	/* cic->icq is the first member, %NULL will convert to %NULL */
	867	return container_of(icq, struct cfq_io_cq, icq);
	868	}
	869
	870	static inline struct cfq_io_cq cfq_cic_lookup(struct cfq_data cfqd,
	871	struct io_context *ioc)
	872	{
	873	if (ioc)
	874	return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
	875	return NULL;
	876	}
	877
	878	static inline struct cfq_queue cic_to_cfqq(struct cfq_io_cq cic, bool is_sync)
	879	{
	880	return cic->cfqq[is_sync];
	881	}
	882
	883	static inline void cic_set_cfqq(struct cfq_io_cq cic, struct cfq_queue cfqq,
	884	bool is_sync)
	885	{
	886	cic->cfqq[is_sync] = cfqq;
	887	}
	888
	889	static inline struct cfq_data cic_to_cfqd(struct cfq_io_cq cic)
	890	{
	891	return cic->icq.q->elevator->elevator_data;
	892	}
	893
	894	/*
	895	* We regard a request as SYNC, if it's either a read or has the SYNC bit
	896	* set (in which case it could also be direct WRITE).
	897	*/
	898	static inline bool cfq_bio_sync(struct bio *bio)
	899	{
	900	return bio_data_dir(bio) == READ \|\| (bio->bi_rw & REQ_SYNC);
	901	}
	902
	903	/*
	904	* scheduler run of queue, if there are requests pending and no one in the
	905	* driver that will restart queueing
	906	*/
	907	static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
	908	{
	909	if (cfqd->busy_queues) {
	910	cfq_log(cfqd, "schedule dispatch");
	911	kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
	912	}
	913	}
	914
	915	/*
	916	* Scale schedule slice based on io priority. Use the sync time slice only
	917	* if a queue is marked sync and has sync io queued. A sync queue with async
	918	* io only, should not get full sync slice length.
	919	*/
	920	static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
	921	unsigned short prio)
	922	{
	923	const int base_slice = cfqd->cfq_slice[sync];
	924
	925	WARN_ON(prio >= IOPRIO_BE_NR);
	926
	927	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
	928	}
	929
	930	static inline int
	931	cfq_prio_to_slice(struct cfq_data cfqd, struct cfq_queue cfqq)
	932	{
	933	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
	934	}
	935
	936	/**
	937	* cfqg_scale_charge - scale disk time charge according to cfqg weight
	938	* @charge: disk time being charged
	939	* @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
	940	*
	941	* Scale @charge according to @vfraction, which is in range (0, 1]. The
	942	* scaling is inversely proportional.
	943	*
	944	* scaled = charge / vfraction
	945	*
	946	* The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
	947	*/
	948	static inline u64 cfqg_scale_charge(unsigned long charge,
	949	unsigned int vfraction)
	950	{
	951	u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
	952
	953	/* charge / vfraction */
	954	c <<= CFQ_SERVICE_SHIFT;
	955	do_div(c, vfraction);
	956	return c;
	957	}
	958
	959	static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
	960	{
	961	s64 delta = (s64)(vdisktime - min_vdisktime);
	962	if (delta > 0)
	963	min_vdisktime = vdisktime;
	964
	965	return min_vdisktime;
	966	}
	967
	968	static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
	969	{
	970	s64 delta = (s64)(vdisktime - min_vdisktime);
	971	if (delta < 0)
	972	min_vdisktime = vdisktime;
	973
	974	return min_vdisktime;
	975	}
	976
	977	static void update_min_vdisktime(struct cfq_rb_root *st)
	978	{
	979	struct cfq_group *cfqg;
	980
	981	if (st->left) {
	982	cfqg = rb_entry_cfqg(st->left);
	983	st->min_vdisktime = max_vdisktime(st->min_vdisktime,
	984	cfqg->vdisktime);
	985	}
	986	}
	987
	988	/*
	989	* get averaged number of queues of RT/BE priority.
	990	* average is updated, with a formula that gives more weight to higher numbers,
	991	* to quickly follows sudden increases and decrease slowly
	992	*/
	993
	994	static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
	995	struct cfq_group *cfqg, bool rt)
	996	{
	997	unsigned min_q, max_q;
	998	unsigned mult = cfq_hist_divisor - 1;
	999	unsigned round = cfq_hist_divisor / 2;
	1000	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
	1001
	1002	min_q = min(cfqg->busy_queues_avg[rt], busy);
	1003	max_q = max(cfqg->busy_queues_avg[rt], busy);
	1004	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
	1005	cfq_hist_divisor;
	1006	return cfqg->busy_queues_avg[rt];
	1007	}
	1008
	1009	static inline unsigned
	1010	cfq_group_slice(struct cfq_data cfqd, struct cfq_group cfqg)
	1011	{
	1012	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
	1013	}
	1014
	1015	static inline unsigned
	1016	cfq_scaled_cfqq_slice(struct cfq_data cfqd, struct cfq_queue cfqq)
	1017	{
	1018	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
	1019	if (cfqd->cfq_latency) {
	1020	/*
	1021	* interested queues (we consider only the ones with the same
	1022	* priority class in the cfq group)
	1023	*/
	1024	unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
	1025	cfq_class_rt(cfqq));
	1026	unsigned sync_slice = cfqd->cfq_slice[1];
	1027	unsigned expect_latency = sync_slice * iq;
	1028	unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
	1029
	1030	if (expect_latency > group_slice) {
	1031	unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
	1032	/* scale low_slice according to IO priority
	1033	* and sync vs async */
	1034	unsigned low_slice =
	1035	min(slice, base_low_slice * slice / sync_slice);
	1036	/* the adapted slice value is scaled to fit all iqs
	1037	* into the target latency */
	1038	slice = max(slice * group_slice / expect_latency,
	1039	low_slice);
	1040	}
	1041	}
	1042	return slice;
	1043	}
	1044
	1045	static inline void
	1046	cfq_set_prio_slice(struct cfq_data cfqd, struct cfq_queue cfqq)
	1047	{
	1048	unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
	1049
	1050	cfqq->slice_start = jiffies;
	1051	cfqq->slice_end = jiffies + slice;
	1052	cfqq->allocated_slice = slice;
	1053	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
	1054	}
	1055
	1056	/*
	1057	* We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
	1058	* isn't valid until the first request from the dispatch is activated
	1059	* and the slice time set.
	1060	*/
	1061	static inline bool cfq_slice_used(struct cfq_queue *cfqq)
	1062	{
	1063	if (cfq_cfqq_slice_new(cfqq))
	1064	return false;
	1065	if (time_before(jiffies, cfqq->slice_end))
	1066	return false;
	1067
	1068	return true;
	1069	}
	1070
	1071	/*
	1072	* Lifted from AS - choose which of rq1 and rq2 that is best served now.
	1073	* We choose the request that is closest to the head right now. Distance
	1074	* behind the head is penalized and only allowed to a certain extent.
	1075	*/
	1076	static struct request *
	1077	cfq_choose_req(struct cfq_data cfqd, struct request rq1, struct request *rq2, sector_t last)
	1078	{
	1079	sector_t s1, s2, d1 = 0, d2 = 0;
	1080	unsigned long back_max;
	1081	#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
	1082	#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
	1083	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
	1084
	1085	if (rq1 == NULL \|\| rq1 == rq2)
	1086	return rq2;
	1087	if (rq2 == NULL)
	1088	return rq1;
	1089
	1090	if (rq_is_sync(rq1) != rq_is_sync(rq2))
	1091	return rq_is_sync(rq1) ? rq1 : rq2;
	1092
	1093	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
	1094	return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
	1095
	1096	s1 = blk_rq_pos(rq1);
	1097	s2 = blk_rq_pos(rq2);
	1098
	1099	/*
	1100	* by definition, 1KiB is 2 sectors
	1101	*/
	1102	back_max = cfqd->cfq_back_max * 2;
	1103
	1104	/*
	1105	* Strict one way elevator _except_ in the case where we allow
	1106	* short backward seeks which are biased as twice the cost of a
	1107	* similar forward seek.
	1108	*/
	1109	if (s1 >= last)
	1110	d1 = s1 - last;
	1111	else if (s1 + back_max >= last)
	1112	d1 = (last - s1) * cfqd->cfq_back_penalty;
	1113	else
	1114	wrap \|= CFQ_RQ1_WRAP;
	1115
	1116	if (s2 >= last)
	1117	d2 = s2 - last;
	1118	else if (s2 + back_max >= last)
	1119	d2 = (last - s2) * cfqd->cfq_back_penalty;
	1120	else
	1121	wrap \|= CFQ_RQ2_WRAP;
	1122
	1123	/* Found required data */
	1124
	1125	/*
	1126	* By doing switch() on the bit mask "wrap" we avoid having to
	1127	* check two variables for all permutations: --> faster!
	1128	*/
	1129	switch (wrap) {
	1130	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
	1131	if (d1 < d2)
	1132	return rq1;
	1133	else if (d2 < d1)
	1134	return rq2;
	1135	else {
	1136	if (s1 >= s2)
	1137	return rq1;
	1138	else
	1139	return rq2;
	1140	}
	1141
	1142	case CFQ_RQ2_WRAP:
	1143	return rq1;
	1144	case CFQ_RQ1_WRAP:
	1145	return rq2;
	1146	case (CFQ_RQ1_WRAP\|CFQ_RQ2_WRAP): /* both rqs wrapped */
	1147	default:
	1148	/*
	1149	* Since both rqs are wrapped,
	1150	* start with the one that's further behind head
	1151	* (--> only one back seek required),
	1152	* since back seek takes more time than forward.
	1153	*/
	1154	if (s1 <= s2)
	1155	return rq1;
	1156	else
	1157	return rq2;
	1158	}
	1159	}
	1160
	1161	/*
	1162	* The below is leftmost cache rbtree addon
	1163	*/
	1164	static struct cfq_queue cfq_rb_first(struct cfq_rb_root root)
	1165	{
	1166	/* Service tree is empty */
	1167	if (!root->count)
	1168	return NULL;
	1169
	1170	if (!root->left)
	1171	root->left = rb_first(&root->rb);
	1172
	1173	if (root->left)
	1174	return rb_entry(root->left, struct cfq_queue, rb_node);
	1175
	1176	return NULL;
	1177	}
	1178
	1179	static struct cfq_group cfq_rb_first_group(struct cfq_rb_root root)
	1180	{
	1181	if (!root->left)
	1182	root->left = rb_first(&root->rb);
	1183
	1184	if (root->left)
	1185	return rb_entry_cfqg(root->left);
	1186
	1187	return NULL;
	1188	}
	1189
	1190	static void rb_erase_init(struct rb_node n, struct rb_root root)
	1191	{
	1192	rb_erase(n, root);
	1193	RB_CLEAR_NODE(n);
	1194	}
	1195
	1196	static void cfq_rb_erase(struct rb_node n, struct cfq_rb_root root)
	1197	{
	1198	if (root->left == n)
	1199	root->left = NULL;
	1200	rb_erase_init(n, &root->rb);
	1201	--root->count;
	1202	}
	1203
	1204	/*
	1205	* would be nice to take fifo expire time into account as well
	1206	*/
	1207	static struct request *
	1208	cfq_find_next_rq(struct cfq_data cfqd, struct cfq_queue cfqq,
	1209	struct request *last)
	1210	{
	1211	struct rb_node *rbnext = rb_next(&last->rb_node);
	1212	struct rb_node *rbprev = rb_prev(&last->rb_node);
	1213	struct request next = NULL, prev = NULL;
	1214
	1215	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
	1216
	1217	if (rbprev)
	1218	prev = rb_entry_rq(rbprev);
	1219
	1220	if (rbnext)
	1221	next = rb_entry_rq(rbnext);
	1222	else {
	1223	rbnext = rb_first(&cfqq->sort_list);
	1224	if (rbnext && rbnext != &last->rb_node)
	1225	next = rb_entry_rq(rbnext);
	1226	}
	1227
	1228	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
	1229	}
	1230
	1231	static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
	1232	struct cfq_queue *cfqq)
	1233	{
	1234	/*
	1235	* just an approximation, should be ok.
	1236	*/
	1237	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
	1238	cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
	1239	}
	1240
	1241	static inline s64
	1242	cfqg_key(struct cfq_rb_root st, struct cfq_group cfqg)
	1243	{
	1244	return cfqg->vdisktime - st->min_vdisktime;
	1245	}
	1246
	1247	static void
	1248	__cfq_group_service_tree_add(struct cfq_rb_root st, struct cfq_group cfqg)
	1249	{
	1250	struct rb_node **node = &st->rb.rb_node;
	1251	struct rb_node *parent = NULL;
	1252	struct cfq_group *__cfqg;
	1253	s64 key = cfqg_key(st, cfqg);
	1254	int left = 1;
	1255
	1256	while (*node != NULL) {
	1257	parent = *node;
	1258	__cfqg = rb_entry_cfqg(parent);
	1259
	1260	if (key < cfqg_key(st, __cfqg))
	1261	node = &parent->rb_left;
	1262	else {
	1263	node = &parent->rb_right;
	1264	left = 0;
	1265	}
	1266	}
	1267
	1268	if (left)
	1269	st->left = &cfqg->rb_node;
	1270
	1271	rb_link_node(&cfqg->rb_node, parent, node);
	1272	rb_insert_color(&cfqg->rb_node, &st->rb);
	1273	}
	1274
	1275	static void
	1276	cfq_update_group_weight(struct cfq_group *cfqg)
	1277	{
	1278	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
	1279
	1280	if (cfqg->new_weight) {
	1281	cfqg->weight = cfqg->new_weight;
	1282	cfqg->new_weight = 0;
	1283	}
	1284
	1285	if (cfqg->new_leaf_weight) {
	1286	cfqg->leaf_weight = cfqg->new_leaf_weight;
	1287	cfqg->new_leaf_weight = 0;
	1288	}
	1289	}
	1290
	1291	static void
	1292	cfq_group_service_tree_add(struct cfq_rb_root st, struct cfq_group cfqg)
	1293	{
	1294	unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
	1295	struct cfq_group *pos = cfqg;
	1296	struct cfq_group *parent;
	1297	bool propagate;
	1298
	1299	/* add to the service tree */
	1300	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
	1301
	1302	cfq_update_group_weight(cfqg);
	1303	__cfq_group_service_tree_add(st, cfqg);
	1304
	1305	/*
	1306	* Activate @cfqg and calculate the portion of vfraction @cfqg is
	1307	* entitled to. vfraction is calculated by walking the tree
	1308	* towards the root calculating the fraction it has at each level.
	1309	* The compounded ratio is how much vfraction @cfqg owns.
	1310	*
	1311	* Start with the proportion tasks in this cfqg has against active
	1312	* children cfqgs - its leaf_weight against children_weight.
	1313	*/
	1314	propagate = !pos->nr_active++;
	1315	pos->children_weight += pos->leaf_weight;
	1316	vfr = vfr * pos->leaf_weight / pos->children_weight;
	1317
	1318	/*
	1319	* Compound ->weight walking up the tree. Both activation and
	1320	* vfraction calculation are done in the same loop. Propagation
	1321	* stops once an already activated node is met. vfraction
	1322	* calculation should always continue to the root.
	1323	*/
	1324	while ((parent = cfqg_parent(pos))) {
	1325	if (propagate) {
	1326	propagate = !parent->nr_active++;
	1327	parent->children_weight += pos->weight;
	1328	}
	1329	vfr = vfr * pos->weight / parent->children_weight;
	1330	pos = parent;
	1331	}
	1332
	1333	cfqg->vfraction = max_t(unsigned, vfr, 1);
	1334	}
	1335
	1336	static void
	1337	cfq_group_notify_queue_add(struct cfq_data cfqd, struct cfq_group cfqg)
	1338	{
	1339	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	1340	struct cfq_group *__cfqg;
	1341	struct rb_node *n;
	1342
	1343	cfqg->nr_cfqq++;
	1344	if (!RB_EMPTY_NODE(&cfqg->rb_node))
	1345	return;
	1346
	1347	/*
	1348	* Currently put the group at the end. Later implement something
	1349	* so that groups get lesser vtime based on their weights, so that
	1350	* if group does not loose all if it was not continuously backlogged.
	1351	*/
	1352	n = rb_last(&st->rb);
	1353	if (n) {
	1354	__cfqg = rb_entry_cfqg(n);
	1355	cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
	1356	} else
	1357	cfqg->vdisktime = st->min_vdisktime;
	1358	cfq_group_service_tree_add(st, cfqg);
	1359	}
	1360
	1361	static void
	1362	cfq_group_service_tree_del(struct cfq_rb_root st, struct cfq_group cfqg)
	1363	{
	1364	struct cfq_group *pos = cfqg;
	1365	bool propagate;
	1366
	1367	/*
	1368	* Undo activation from cfq_group_service_tree_add(). Deactivate
	1369	* @cfqg and propagate deactivation upwards.
	1370	*/
	1371	propagate = !--pos->nr_active;
	1372	pos->children_weight -= pos->leaf_weight;
	1373
	1374	while (propagate) {
	1375	struct cfq_group *parent = cfqg_parent(pos);
	1376
	1377	/* @pos has 0 nr_active at this point */
	1378	WARN_ON_ONCE(pos->children_weight);
	1379	pos->vfraction = 0;
	1380
	1381	if (!parent)
	1382	break;
	1383
	1384	propagate = !--parent->nr_active;
	1385	parent->children_weight -= pos->weight;
	1386	pos = parent;
	1387	}
	1388
	1389	/* remove from the service tree */
	1390	if (!RB_EMPTY_NODE(&cfqg->rb_node))
	1391	cfq_rb_erase(&cfqg->rb_node, st);
	1392	}
	1393
	1394	static void
	1395	cfq_group_notify_queue_del(struct cfq_data cfqd, struct cfq_group cfqg)
	1396	{
	1397	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	1398
	1399	BUG_ON(cfqg->nr_cfqq < 1);
	1400	cfqg->nr_cfqq--;
	1401
	1402	/* If there are other cfq queues under this group, don't delete it */
	1403	if (cfqg->nr_cfqq)
	1404	return;
	1405
	1406	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
	1407	cfq_group_service_tree_del(st, cfqg);
	1408	cfqg->saved_wl_slice = 0;
	1409	cfqg_stats_update_dequeue(cfqg);
	1410	}
	1411
	1412	static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
	1413	unsigned int *unaccounted_time)
	1414	{
	1415	unsigned int slice_used;
	1416
	1417	/*
	1418	* Queue got expired before even a single request completed or
	1419	* got expired immediately after first request completion.
	1420	*/
	1421	if (!cfqq->slice_start \|\| cfqq->slice_start == jiffies) {
	1422	/*
	1423	* Also charge the seek time incurred to the group, otherwise
	1424	* if there are mutiple queues in the group, each can dispatch
	1425	* a single request on seeky media and cause lots of seek time
	1426	* and group will never know it.
	1427	*/
	1428	slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
	1429	1);
	1430	} else {
	1431	slice_used = jiffies - cfqq->slice_start;
	1432	if (slice_used > cfqq->allocated_slice) {
	1433	*unaccounted_time = slice_used - cfqq->allocated_slice;
	1434	slice_used = cfqq->allocated_slice;
	1435	}
	1436	if (time_after(cfqq->slice_start, cfqq->dispatch_start))
	1437	*unaccounted_time += cfqq->slice_start -
	1438	cfqq->dispatch_start;
	1439	}
	1440
	1441	return slice_used;
	1442	}
	1443
	1444	static void cfq_group_served(struct cfq_data cfqd, struct cfq_group cfqg,
	1445	struct cfq_queue *cfqq)
	1446	{
	1447	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	1448	unsigned int used_sl, charge, unaccounted_sl = 0;
	1449	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
	1450	- cfqg->service_tree_idle.count;
	1451	unsigned int vfr;
	1452
	1453	BUG_ON(nr_sync < 0);
	1454	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
	1455
	1456	if (iops_mode(cfqd))
	1457	charge = cfqq->slice_dispatch;
	1458	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
	1459	charge = cfqq->allocated_slice;
	1460
	1461	/*
	1462	* Can't update vdisktime while on service tree and cfqg->vfraction
	1463	* is valid only while on it. Cache vfr, leave the service tree,
	1464	* update vdisktime and go back on. The re-addition to the tree
	1465	* will also update the weights as necessary.
	1466	*/
	1467	vfr = cfqg->vfraction;
	1468	cfq_group_service_tree_del(st, cfqg);
	1469	cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
	1470	cfq_group_service_tree_add(st, cfqg);
	1471
	1472	/* This group is being expired. Save the context */
	1473	if (time_after(cfqd->workload_expires, jiffies)) {
	1474	cfqg->saved_wl_slice = cfqd->workload_expires
	1475	- jiffies;
	1476	cfqg->saved_wl_type = cfqd->serving_wl_type;
	1477	cfqg->saved_wl_class = cfqd->serving_wl_class;
	1478	} else
	1479	cfqg->saved_wl_slice = 0;
	1480
	1481	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
	1482	st->min_vdisktime);
	1483	cfq_log_cfqq(cfqq->cfqd, cfqq,
	1484	"sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
	1485	used_sl, cfqq->slice_dispatch, charge,
	1486	iops_mode(cfqd), cfqq->nr_sectors);
	1487	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
	1488	cfqg_stats_set_start_empty_time(cfqg);
	1489	}
	1490
	1491	/**
	1492	* cfq_init_cfqg_base - initialize base part of a cfq_group
	1493	* @cfqg: cfq_group to initialize
	1494	*
	1495	* Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
	1496	* is enabled or not.
	1497	*/
	1498	static void cfq_init_cfqg_base(struct cfq_group *cfqg)
	1499	{
	1500	struct cfq_rb_root *st;
	1501	int i, j;
	1502
	1503	for_each_cfqg_st(cfqg, i, j, st)
	1504	*st = CFQ_RB_ROOT;
	1505	RB_CLEAR_NODE(&cfqg->rb_node);
	1506
	1507	cfqg->ttime.last_end_request = jiffies;
	1508	}
	1509
	1510	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	1511	static void cfqg_stats_init(struct cfqg_stats *stats)
	1512	{
	1513	blkg_rwstat_init(&stats->service_bytes);
	1514	blkg_rwstat_init(&stats->serviced);
	1515	blkg_rwstat_init(&stats->merged);
	1516	blkg_rwstat_init(&stats->service_time);
	1517	blkg_rwstat_init(&stats->wait_time);
	1518	blkg_rwstat_init(&stats->queued);
	1519
	1520	blkg_stat_init(&stats->sectors);
	1521	blkg_stat_init(&stats->time);
	1522
	1523	#ifdef CONFIG_DEBUG_BLK_CGROUP
	1524	blkg_stat_init(&stats->unaccounted_time);
	1525	blkg_stat_init(&stats->avg_queue_size_sum);
	1526	blkg_stat_init(&stats->avg_queue_size_samples);
	1527	blkg_stat_init(&stats->dequeue);
	1528	blkg_stat_init(&stats->group_wait_time);
	1529	blkg_stat_init(&stats->idle_time);
	1530	blkg_stat_init(&stats->empty_time);
	1531	#endif
	1532	}
	1533
	1534	static void cfq_pd_init(struct blkcg_gq *blkg)
	1535	{
	1536	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
	1537
	1538	cfq_init_cfqg_base(cfqg);
	1539	cfqg->weight = blkg->blkcg->cfq_weight;
	1540	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
	1541	cfqg_stats_init(&cfqg->stats);
	1542	cfqg_stats_init(&cfqg->dead_stats);
	1543	}
	1544
	1545	static void cfq_pd_offline(struct blkcg_gq *blkg)
	1546	{
	1547	/*
	1548	* @blkg is going offline and will be ignored by
	1549	* blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
	1550	* that they don't get lost. If IOs complete after this point, the
	1551	* stats for them will be lost. Oh well...
	1552	*/
	1553	cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
	1554	}
	1555
	1556	/* offset delta from cfqg->stats to cfqg->dead_stats */
	1557	static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
	1558	offsetof(struct cfq_group, stats);
	1559
	1560	/* to be used by recursive prfill, sums live and dead stats recursively */
	1561	static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
	1562	{
	1563	u64 sum = 0;
	1564
	1565	sum += blkg_stat_recursive_sum(pd, off);
	1566	sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
	1567	return sum;
	1568	}
	1569
	1570	/* to be used by recursive prfill, sums live and dead rwstats recursively */
	1571	static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
	1572	int off)
	1573	{
	1574	struct blkg_rwstat a, b;
	1575
	1576	a = blkg_rwstat_recursive_sum(pd, off);
	1577	b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
	1578	blkg_rwstat_merge(&a, &b);
	1579	return a;
	1580	}
	1581
	1582	static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
	1583	{
	1584	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
	1585
	1586	cfqg_stats_reset(&cfqg->stats);
	1587	cfqg_stats_reset(&cfqg->dead_stats);
	1588	}
	1589
	1590	/*
	1591	* Search for the cfq group current task belongs to. request_queue lock must
	1592	* be held.
	1593	*/
	1594	static struct cfq_group cfq_lookup_create_cfqg(struct cfq_data cfqd,
	1595	struct blkcg *blkcg)
	1596	{
	1597	struct request_queue *q = cfqd->queue;
	1598	struct cfq_group *cfqg = NULL;
	1599
	1600	/* avoid lookup for the common case where there's no blkcg */
	1601	if (blkcg == &blkcg_root) {
	1602	cfqg = cfqd->root_group;
	1603	} else {
	1604	struct blkcg_gq *blkg;
	1605
	1606	blkg = blkg_lookup_create(blkcg, q);
	1607	if (!IS_ERR(blkg))
	1608	cfqg = blkg_to_cfqg(blkg);
	1609	}
	1610
	1611	return cfqg;
	1612	}
	1613
	1614	static void cfq_link_cfqq_cfqg(struct cfq_queue cfqq, struct cfq_group cfqg)
	1615	{
	1616	/* Currently, all async queues are mapped to root group */
	1617	if (!cfq_cfqq_sync(cfqq))
	1618	cfqg = cfqq->cfqd->root_group;
	1619
	1620	cfqq->cfqg = cfqg;
	1621	/* cfqq reference on cfqg */
	1622	cfqg_get(cfqg);
	1623	}
	1624
	1625	static u64 cfqg_prfill_weight_device(struct seq_file *sf,
	1626	struct blkg_policy_data *pd, int off)
	1627	{
	1628	struct cfq_group *cfqg = pd_to_cfqg(pd);
	1629
	1630	if (!cfqg->dev_weight)
	1631	return 0;
	1632	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
	1633	}
	1634
	1635	static int cfqg_print_weight_device(struct seq_file sf, void v)
	1636	{
	1637	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
	1638	cfqg_prfill_weight_device, &blkcg_policy_cfq,
	1639	0, false);
	1640	return 0;
	1641	}
	1642
	1643	static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
	1644	struct blkg_policy_data *pd, int off)
	1645	{
	1646	struct cfq_group *cfqg = pd_to_cfqg(pd);
	1647
	1648	if (!cfqg->dev_leaf_weight)
	1649	return 0;
	1650	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
	1651	}
	1652
	1653	static int cfqg_print_leaf_weight_device(struct seq_file sf, void v)
	1654	{
	1655	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
	1656	cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
	1657	0, false);
	1658	return 0;
	1659	}
	1660
	1661	static int cfq_print_weight(struct seq_file sf, void v)
	1662	{
	1663	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
	1664	return 0;
	1665	}
	1666
	1667	static int cfq_print_leaf_weight(struct seq_file sf, void v)
	1668	{
	1669	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
	1670	return 0;
	1671	}
	1672
	1673	static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,
	1674	struct cftype cft, const char buf,
	1675	bool is_leaf_weight)
	1676	{
	1677	struct blkcg *blkcg = css_to_blkcg(css);
	1678	struct blkg_conf_ctx ctx;
	1679	struct cfq_group *cfqg;
	1680	int ret;
	1681
	1682	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
	1683	if (ret)
	1684	return ret;
	1685
	1686	ret = -EINVAL;
	1687	cfqg = blkg_to_cfqg(ctx.blkg);
	1688	if (!ctx.v \|\| (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
	1689	if (!is_leaf_weight) {
	1690	cfqg->dev_weight = ctx.v;
	1691	cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
	1692	} else {
	1693	cfqg->dev_leaf_weight = ctx.v;
	1694	cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
	1695	}
	1696	ret = 0;
	1697	}
	1698
	1699	blkg_conf_finish(&ctx);
	1700	return ret;
	1701	}
	1702
	1703	static int cfqg_set_weight_device(struct cgroup_subsys_state *css,
	1704	struct cftype cft, const char buf)
	1705	{
	1706	return __cfqg_set_weight_device(css, cft, buf, false);
	1707	}
	1708
	1709	static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css,
	1710	struct cftype cft, const char buf)
	1711	{
	1712	return __cfqg_set_weight_device(css, cft, buf, true);
	1713	}
	1714
	1715	static int __cfq_set_weight(struct cgroup_subsys_state css, struct cftype cft,
	1716	u64 val, bool is_leaf_weight)
	1717	{
	1718	struct blkcg *blkcg = css_to_blkcg(css);
	1719	struct blkcg_gq *blkg;
	1720
	1721	if (val < CFQ_WEIGHT_MIN \|\| val > CFQ_WEIGHT_MAX)
	1722	return -EINVAL;
	1723
	1724	spin_lock_irq(&blkcg->lock);
	1725
	1726	if (!is_leaf_weight)
	1727	blkcg->cfq_weight = val;
	1728	else
	1729	blkcg->cfq_leaf_weight = val;
	1730
	1731	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
	1732	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
	1733
	1734	if (!cfqg)
	1735	continue;
	1736
	1737	if (!is_leaf_weight) {
	1738	if (!cfqg->dev_weight)
	1739	cfqg->new_weight = blkcg->cfq_weight;
	1740	} else {
	1741	if (!cfqg->dev_leaf_weight)
	1742	cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
	1743	}
	1744	}
	1745
	1746	spin_unlock_irq(&blkcg->lock);
	1747	return 0;
	1748	}
	1749
	1750	static int cfq_set_weight(struct cgroup_subsys_state css, struct cftype cft,
	1751	u64 val)
	1752	{
	1753	return __cfq_set_weight(css, cft, val, false);
	1754	}
	1755
	1756	static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
	1757	struct cftype *cft, u64 val)
	1758	{
	1759	return __cfq_set_weight(css, cft, val, true);
	1760	}
	1761
	1762	static int cfqg_print_stat(struct seq_file sf, void v)
	1763	{
	1764	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
	1765	&blkcg_policy_cfq, seq_cft(sf)->private, false);
	1766	return 0;
	1767	}
	1768
	1769	static int cfqg_print_rwstat(struct seq_file sf, void v)
	1770	{
	1771	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
	1772	&blkcg_policy_cfq, seq_cft(sf)->private, true);
	1773	return 0;
	1774	}
	1775
	1776	static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
	1777	struct blkg_policy_data *pd, int off)
	1778	{
	1779	u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
	1780
	1781	return __blkg_prfill_u64(sf, pd, sum);
	1782	}
	1783
	1784	static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
	1785	struct blkg_policy_data *pd, int off)
	1786	{
	1787	struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
	1788
	1789	return __blkg_prfill_rwstat(sf, pd, &sum);
	1790	}
	1791
	1792	static int cfqg_print_stat_recursive(struct seq_file sf, void v)
	1793	{
	1794	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
	1795	cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
	1796	seq_cft(sf)->private, false);
	1797	return 0;
	1798	}
	1799
	1800	static int cfqg_print_rwstat_recursive(struct seq_file sf, void v)
	1801	{
	1802	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
	1803	cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
	1804	seq_cft(sf)->private, true);
	1805	return 0;
	1806	}
	1807
	1808	#ifdef CONFIG_DEBUG_BLK_CGROUP
	1809	static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
	1810	struct blkg_policy_data *pd, int off)
	1811	{
	1812	struct cfq_group *cfqg = pd_to_cfqg(pd);
	1813	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
	1814	u64 v = 0;
	1815
	1816	if (samples) {
	1817	v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
	1818	v = div64_u64(v, samples);
	1819	}
	1820	__blkg_prfill_u64(sf, pd, v);
	1821	return 0;
	1822	}
	1823
	1824	/* print avg_queue_size */
	1825	static int cfqg_print_avg_queue_size(struct seq_file sf, void v)
	1826	{
	1827	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
	1828	cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
	1829	0, false);
	1830	return 0;
	1831	}
	1832	#endif /* CONFIG_DEBUG_BLK_CGROUP */
	1833
	1834	static struct cftype cfq_blkcg_files[] = {
	1835	/* on root, weight is mapped to leaf_weight */
	1836	{
	1837	.name = "weight_device",
	1838	.flags = CFTYPE_ONLY_ON_ROOT,
	1839	.seq_show = cfqg_print_leaf_weight_device,
	1840	.write_string = cfqg_set_leaf_weight_device,
	1841	.max_write_len = 256,
	1842	},
	1843	{
	1844	.name = "weight",
	1845	.flags = CFTYPE_ONLY_ON_ROOT,
	1846	.seq_show = cfq_print_leaf_weight,
	1847	.write_u64 = cfq_set_leaf_weight,
	1848	},
	1849
	1850	/* no such mapping necessary for !roots */
	1851	{
	1852	.name = "weight_device",
	1853	.flags = CFTYPE_NOT_ON_ROOT,
	1854	.seq_show = cfqg_print_weight_device,
	1855	.write_string = cfqg_set_weight_device,
	1856	.max_write_len = 256,
	1857	},
	1858	{
	1859	.name = "weight",
	1860	.flags = CFTYPE_NOT_ON_ROOT,
	1861	.seq_show = cfq_print_weight,
	1862	.write_u64 = cfq_set_weight,
	1863	},
	1864
	1865	{
	1866	.name = "leaf_weight_device",
	1867	.seq_show = cfqg_print_leaf_weight_device,
	1868	.write_string = cfqg_set_leaf_weight_device,
	1869	.max_write_len = 256,
	1870	},
	1871	{
	1872	.name = "leaf_weight",
	1873	.seq_show = cfq_print_leaf_weight,
	1874	.write_u64 = cfq_set_leaf_weight,
	1875	},
	1876
	1877	/* statistics, covers only the tasks in the cfqg */
	1878	{
	1879	.name = "time",
	1880	.private = offsetof(struct cfq_group, stats.time),
	1881	.seq_show = cfqg_print_stat,
	1882	},
	1883	{
	1884	.name = "sectors",
	1885	.private = offsetof(struct cfq_group, stats.sectors),
	1886	.seq_show = cfqg_print_stat,
	1887	},
	1888	{
	1889	.name = "io_service_bytes",
	1890	.private = offsetof(struct cfq_group, stats.service_bytes),
	1891	.seq_show = cfqg_print_rwstat,
	1892	},
	1893	{
	1894	.name = "io_serviced",
	1895	.private = offsetof(struct cfq_group, stats.serviced),
	1896	.seq_show = cfqg_print_rwstat,
	1897	},
	1898	{
	1899	.name = "io_service_time",
	1900	.private = offsetof(struct cfq_group, stats.service_time),
	1901	.seq_show = cfqg_print_rwstat,
	1902	},
	1903	{
	1904	.name = "io_wait_time",
	1905	.private = offsetof(struct cfq_group, stats.wait_time),
	1906	.seq_show = cfqg_print_rwstat,
	1907	},
	1908	{
	1909	.name = "io_merged",
	1910	.private = offsetof(struct cfq_group, stats.merged),
	1911	.seq_show = cfqg_print_rwstat,
	1912	},
	1913	{
	1914	.name = "io_queued",
	1915	.private = offsetof(struct cfq_group, stats.queued),
	1916	.seq_show = cfqg_print_rwstat,
	1917	},
	1918
	1919	/* the same statictics which cover the cfqg and its descendants */
	1920	{
	1921	.name = "time_recursive",
	1922	.private = offsetof(struct cfq_group, stats.time),
	1923	.seq_show = cfqg_print_stat_recursive,
	1924	},
	1925	{
	1926	.name = "sectors_recursive",
	1927	.private = offsetof(struct cfq_group, stats.sectors),
	1928	.seq_show = cfqg_print_stat_recursive,
	1929	},
	1930	{
	1931	.name = "io_service_bytes_recursive",
	1932	.private = offsetof(struct cfq_group, stats.service_bytes),
	1933	.seq_show = cfqg_print_rwstat_recursive,
	1934	},
	1935	{
	1936	.name = "io_serviced_recursive",
	1937	.private = offsetof(struct cfq_group, stats.serviced),
	1938	.seq_show = cfqg_print_rwstat_recursive,
	1939	},
	1940	{
	1941	.name = "io_service_time_recursive",
	1942	.private = offsetof(struct cfq_group, stats.service_time),
	1943	.seq_show = cfqg_print_rwstat_recursive,
	1944	},
	1945	{
	1946	.name = "io_wait_time_recursive",
	1947	.private = offsetof(struct cfq_group, stats.wait_time),
	1948	.seq_show = cfqg_print_rwstat_recursive,
	1949	},
	1950	{
	1951	.name = "io_merged_recursive",
	1952	.private = offsetof(struct cfq_group, stats.merged),
	1953	.seq_show = cfqg_print_rwstat_recursive,
	1954	},
	1955	{
	1956	.name = "io_queued_recursive",
	1957	.private = offsetof(struct cfq_group, stats.queued),
	1958	.seq_show = cfqg_print_rwstat_recursive,
	1959	},
	1960	#ifdef CONFIG_DEBUG_BLK_CGROUP
	1961	{
	1962	.name = "avg_queue_size",
	1963	.seq_show = cfqg_print_avg_queue_size,
	1964	},
	1965	{
	1966	.name = "group_wait_time",
	1967	.private = offsetof(struct cfq_group, stats.group_wait_time),
	1968	.seq_show = cfqg_print_stat,
	1969	},
	1970	{
	1971	.name = "idle_time",
	1972	.private = offsetof(struct cfq_group, stats.idle_time),
	1973	.seq_show = cfqg_print_stat,
	1974	},
	1975	{
	1976	.name = "empty_time",
	1977	.private = offsetof(struct cfq_group, stats.empty_time),
	1978	.seq_show = cfqg_print_stat,
	1979	},
	1980	{
	1981	.name = "dequeue",
	1982	.private = offsetof(struct cfq_group, stats.dequeue),
	1983	.seq_show = cfqg_print_stat,
	1984	},
	1985	{
	1986	.name = "unaccounted_time",
	1987	.private = offsetof(struct cfq_group, stats.unaccounted_time),
	1988	.seq_show = cfqg_print_stat,
	1989	},
	1990	#endif /* CONFIG_DEBUG_BLK_CGROUP */
	1991	{ } /* terminate */
	1992	};
	1993	#else /* GROUP_IOSCHED */
	1994	static struct cfq_group cfq_lookup_create_cfqg(struct cfq_data cfqd,
	1995	struct blkcg *blkcg)
	1996	{
	1997	return cfqd->root_group;
	1998	}
	1999
	2000	static inline void
	2001	cfq_link_cfqq_cfqg(struct cfq_queue cfqq, struct cfq_group cfqg) {
	2002	cfqq->cfqg = cfqg;
	2003	}
	2004
	2005	#endif /* GROUP_IOSCHED */
	2006
	2007	/*
	2008	* The cfqd->service_trees holds all pending cfq_queue's that have
	2009	* requests waiting to be processed. It is sorted in the order that
	2010	* we will service the queues.
	2011	*/
	2012	static void cfq_service_tree_add(struct cfq_data cfqd, struct cfq_queue cfqq,
	2013	bool add_front)
	2014	{
	2015	struct rb_node *p, parent;
	2016	struct cfq_queue *__cfqq;
	2017	unsigned long rb_key;
	2018	struct cfq_rb_root *st;
	2019	int left;
	2020	int new_cfqq = 1;
	2021
	2022	st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
	2023	if (cfq_class_idle(cfqq)) {
	2024	rb_key = CFQ_IDLE_DELAY;
	2025	parent = rb_last(&st->rb);
	2026	if (parent && parent != &cfqq->rb_node) {
	2027	__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
	2028	rb_key += __cfqq->rb_key;
	2029	} else
	2030	rb_key += jiffies;
	2031	} else if (!add_front) {
	2032	/*
	2033	* Get our rb key offset. Subtract any residual slice
	2034	* value carried from last service. A negative resid
	2035	* count indicates slice overrun, and this should position
	2036	* the next service time further away in the tree.
	2037	*/
	2038	rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
	2039	rb_key -= cfqq->slice_resid;
	2040	cfqq->slice_resid = 0;
	2041	} else {
	2042	rb_key = -HZ;
	2043	__cfqq = cfq_rb_first(st);
	2044	rb_key += __cfqq ? __cfqq->rb_key : jiffies;
	2045	}
	2046
	2047	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
	2048	new_cfqq = 0;
	2049	/*
	2050	* same position, nothing more to do
	2051	*/
	2052	if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
	2053	return;
	2054
	2055	cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
	2056	cfqq->service_tree = NULL;
	2057	}
	2058
	2059	left = 1;
	2060	parent = NULL;
	2061	cfqq->service_tree = st;
	2062	p = &st->rb.rb_node;
	2063	while (*p) {
	2064	parent = *p;
	2065	__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
	2066
	2067	/*
	2068	* sort by key, that represents service time.
	2069	*/
	2070	if (time_before(rb_key, __cfqq->rb_key))
	2071	p = &parent->rb_left;
	2072	else {
	2073	p = &parent->rb_right;
	2074	left = 0;
	2075	}
	2076	}
	2077
	2078	if (left)
	2079	st->left = &cfqq->rb_node;
	2080
	2081	cfqq->rb_key = rb_key;
	2082	rb_link_node(&cfqq->rb_node, parent, p);
	2083	rb_insert_color(&cfqq->rb_node, &st->rb);
	2084	st->count++;
	2085	if (add_front \|\| !new_cfqq)
	2086	return;
	2087	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
	2088	}
	2089
	2090	static struct cfq_queue *
	2091	cfq_prio_tree_lookup(struct cfq_data cfqd, struct rb_root root,
	2092	sector_t sector, struct rb_node **ret_parent,
	2093	struct rb_node ***rb_link)
	2094	{
	2095	struct rb_node *p, parent;
	2096	struct cfq_queue *cfqq = NULL;
	2097
	2098	parent = NULL;
	2099	p = &root->rb_node;
	2100	while (*p) {
	2101	struct rb_node **n;
	2102
	2103	parent = *p;
	2104	cfqq = rb_entry(parent, struct cfq_queue, p_node);
	2105
	2106	/*
	2107	* Sort strictly based on sector. Smallest to the left,
	2108	* largest to the right.
	2109	*/
	2110	if (sector > blk_rq_pos(cfqq->next_rq))
	2111	n = &(*p)->rb_right;
	2112	else if (sector < blk_rq_pos(cfqq->next_rq))
	2113	n = &(*p)->rb_left;
	2114	else
	2115	break;
	2116	p = n;
	2117	cfqq = NULL;
	2118	}
	2119
	2120	*ret_parent = parent;
	2121	if (rb_link)
	2122	*rb_link = p;
	2123	return cfqq;
	2124	}
	2125
	2126	static void cfq_prio_tree_add(struct cfq_data cfqd, struct cfq_queue cfqq)
	2127	{
	2128	struct rb_node *p, parent;
	2129	struct cfq_queue *__cfqq;
	2130
	2131	if (cfqq->p_root) {
	2132	rb_erase(&cfqq->p_node, cfqq->p_root);
	2133	cfqq->p_root = NULL;
	2134	}
	2135
	2136	if (cfq_class_idle(cfqq))
	2137	return;
	2138	if (!cfqq->next_rq)
	2139	return;
	2140
	2141	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
	2142	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
	2143	blk_rq_pos(cfqq->next_rq), &parent, &p);
	2144	if (!__cfqq) {
	2145	rb_link_node(&cfqq->p_node, parent, p);
	2146	rb_insert_color(&cfqq->p_node, cfqq->p_root);
	2147	} else
	2148	cfqq->p_root = NULL;
	2149	}
	2150
	2151	/*
	2152	* Update cfqq's position in the service tree.
	2153	*/
	2154	static void cfq_resort_rr_list(struct cfq_data cfqd, struct cfq_queue cfqq)
	2155	{
	2156	/*
	2157	* Resorting requires the cfqq to be on the RR list already.
	2158	*/
	2159	if (cfq_cfqq_on_rr(cfqq)) {
	2160	cfq_service_tree_add(cfqd, cfqq, 0);
	2161	cfq_prio_tree_add(cfqd, cfqq);
	2162	}
	2163	}
	2164
	2165	/*
	2166	* add to busy list of queues for service, trying to be fair in ordering
	2167	* the pending list according to last request service
	2168	*/
	2169	static void cfq_add_cfqq_rr(struct cfq_data cfqd, struct cfq_queue cfqq)
	2170	{
	2171	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
	2172	BUG_ON(cfq_cfqq_on_rr(cfqq));
	2173	cfq_mark_cfqq_on_rr(cfqq);
	2174	cfqd->busy_queues++;
	2175	if (cfq_cfqq_sync(cfqq))
	2176	cfqd->busy_sync_queues++;
	2177
	2178	cfq_resort_rr_list(cfqd, cfqq);
	2179	}
	2180
	2181	/*
	2182	* Called when the cfqq no longer has requests pending, remove it from
	2183	* the service tree.
	2184	*/
	2185	static void cfq_del_cfqq_rr(struct cfq_data cfqd, struct cfq_queue cfqq)
	2186	{
	2187	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
	2188	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	2189	cfq_clear_cfqq_on_rr(cfqq);
	2190
	2191	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
	2192	cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
	2193	cfqq->service_tree = NULL;
	2194	}
	2195	if (cfqq->p_root) {
	2196	rb_erase(&cfqq->p_node, cfqq->p_root);
	2197	cfqq->p_root = NULL;
	2198	}
	2199
	2200	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
	2201	BUG_ON(!cfqd->busy_queues);
	2202	cfqd->busy_queues--;
	2203	if (cfq_cfqq_sync(cfqq))
	2204	cfqd->busy_sync_queues--;
	2205	}
	2206
	2207	/*
	2208	* rb tree support functions
	2209	*/
	2210	static void cfq_del_rq_rb(struct request *rq)
	2211	{
	2212	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	2213	const int sync = rq_is_sync(rq);
	2214
	2215	BUG_ON(!cfqq->queued[sync]);
	2216	cfqq->queued[sync]--;
	2217
	2218	elv_rb_del(&cfqq->sort_list, rq);
	2219
	2220	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
	2221	/*
	2222	* Queue will be deleted from service tree when we actually
	2223	* expire it later. Right now just remove it from prio tree
	2224	* as it is empty.
	2225	*/
	2226	if (cfqq->p_root) {
	2227	rb_erase(&cfqq->p_node, cfqq->p_root);
	2228	cfqq->p_root = NULL;
	2229	}
	2230	}
	2231	}
	2232
	2233	static void cfq_add_rq_rb(struct request *rq)
	2234	{
	2235	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	2236	struct cfq_data *cfqd = cfqq->cfqd;
	2237	struct request *prev;
	2238
	2239	cfqq->queued[rq_is_sync(rq)]++;
	2240
	2241	elv_rb_add(&cfqq->sort_list, rq);
	2242
	2243	if (!cfq_cfqq_on_rr(cfqq))
	2244	cfq_add_cfqq_rr(cfqd, cfqq);
	2245
	2246	/*
	2247	* check if this request is a better next-serve candidate
	2248	*/
	2249	prev = cfqq->next_rq;
	2250	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
	2251
	2252	/*
	2253	* adjust priority tree position, if ->next_rq changes
	2254	*/
	2255	if (prev != cfqq->next_rq)
	2256	cfq_prio_tree_add(cfqd, cfqq);
	2257
	2258	BUG_ON(!cfqq->next_rq);
	2259	}
	2260
	2261	static void cfq_reposition_rq_rb(struct cfq_queue cfqq, struct request rq)
	2262	{
	2263	elv_rb_del(&cfqq->sort_list, rq);
	2264	cfqq->queued[rq_is_sync(rq)]--;
	2265	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
	2266	cfq_add_rq_rb(rq);
	2267	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
	2268	rq->cmd_flags);
	2269	}
	2270
	2271	static struct request *
	2272	cfq_find_rq_fmerge(struct cfq_data cfqd, struct bio bio)
	2273	{
	2274	struct task_struct *tsk = current;
	2275	struct cfq_io_cq *cic;
	2276	struct cfq_queue *cfqq;
	2277
	2278	cic = cfq_cic_lookup(cfqd, tsk->io_context);
	2279	if (!cic)
	2280	return NULL;
	2281
	2282	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	2283	if (cfqq)
	2284	return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
	2285
	2286	return NULL;
	2287	}
	2288
	2289	static void cfq_activate_request(struct request_queue q, struct request rq)
	2290	{
	2291	struct cfq_data *cfqd = q->elevator->elevator_data;
	2292
	2293	cfqd->rq_in_driver++;
	2294	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
	2295	cfqd->rq_in_driver);
	2296
	2297	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	2298	}
	2299
	2300	static void cfq_deactivate_request(struct request_queue q, struct request rq)
	2301	{
	2302	struct cfq_data *cfqd = q->elevator->elevator_data;
	2303
	2304	WARN_ON(!cfqd->rq_in_driver);
	2305	cfqd->rq_in_driver--;
	2306	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
	2307	cfqd->rq_in_driver);
	2308	}
	2309
	2310	static void cfq_remove_request(struct request *rq)
	2311	{
	2312	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	2313
	2314	if (cfqq->next_rq == rq)
	2315	cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
	2316
	2317	list_del_init(&rq->queuelist);
	2318	cfq_del_rq_rb(rq);
	2319
	2320	cfqq->cfqd->rq_queued--;
	2321	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
	2322	if (rq->cmd_flags & REQ_PRIO) {
	2323	WARN_ON(!cfqq->prio_pending);
	2324	cfqq->prio_pending--;
	2325	}
	2326	}
	2327
	2328	static int cfq_merge(struct request_queue q, struct request *req,
	2329	struct bio *bio)
	2330	{
	2331	struct cfq_data *cfqd = q->elevator->elevator_data;
	2332	struct request *__rq;
	2333
	2334	__rq = cfq_find_rq_fmerge(cfqd, bio);
	2335	if (__rq && elv_rq_merge_ok(__rq, bio)) {
	2336	*req = __rq;
	2337	return ELEVATOR_FRONT_MERGE;
	2338	}
	2339
	2340	return ELEVATOR_NO_MERGE;
	2341	}
	2342
	2343	static void cfq_merged_request(struct request_queue q, struct request req,
	2344	int type)
	2345	{
	2346	if (type == ELEVATOR_FRONT_MERGE) {
	2347	struct cfq_queue *cfqq = RQ_CFQQ(req);
	2348
	2349	cfq_reposition_rq_rb(cfqq, req);
	2350	}
	2351	}
	2352
	2353	static void cfq_bio_merged(struct request_queue q, struct request req,
	2354	struct bio *bio)
	2355	{
	2356	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
	2357	}
	2358
	2359	static void
	2360	cfq_merged_requests(struct request_queue q, struct request rq,
	2361	struct request *next)
	2362	{
	2363	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	2364	struct cfq_data *cfqd = q->elevator->elevator_data;
	2365
	2366	/*
	2367	* reposition in fifo if next is older than rq
	2368	*/
	2369	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	2370	time_before(rq_fifo_time(next), rq_fifo_time(rq)) &&
	2371	cfqq == RQ_CFQQ(next)) {
	2372	list_move(&rq->queuelist, &next->queuelist);
	2373	rq_set_fifo_time(rq, rq_fifo_time(next));
	2374	}
	2375
	2376	if (cfqq->next_rq == next)
	2377	cfqq->next_rq = rq;
	2378	cfq_remove_request(next);
	2379	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
	2380
	2381	cfqq = RQ_CFQQ(next);
	2382	/*
	2383	* all requests of this queue are merged to other queues, delete it
	2384	* from the service tree. If it's the active_queue,
	2385	* cfq_dispatch_requests() will choose to expire it or do idle
	2386	*/
	2387	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
	2388	cfqq != cfqd->active_queue)
	2389	cfq_del_cfqq_rr(cfqd, cfqq);
	2390	}
	2391
	2392	static int cfq_allow_merge(struct request_queue q, struct request rq,
	2393	struct bio *bio)
	2394	{
	2395	struct cfq_data *cfqd = q->elevator->elevator_data;
	2396	struct cfq_io_cq *cic;
	2397	struct cfq_queue *cfqq;
	2398
	2399	/*
	2400	* Disallow merge of a sync bio into an async request.
	2401	*/
	2402	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
	2403	return false;
	2404
	2405	/*
	2406	* Lookup the cfqq that this bio will be queued with and allow
	2407	* merge only if rq is queued there.
	2408	*/
	2409	cic = cfq_cic_lookup(cfqd, current->io_context);
	2410	if (!cic)
	2411	return false;
	2412
	2413	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	2414	return cfqq == RQ_CFQQ(rq);
	2415	}
	2416
	2417	static inline void cfq_del_timer(struct cfq_data cfqd, struct cfq_queue cfqq)
	2418	{
	2419	del_timer(&cfqd->idle_slice_timer);
	2420	cfqg_stats_update_idle_time(cfqq->cfqg);
	2421	}
	2422
	2423	static void __cfq_set_active_queue(struct cfq_data *cfqd,
	2424	struct cfq_queue *cfqq)
	2425	{
	2426	if (cfqq) {
	2427	cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
	2428	cfqd->serving_wl_class, cfqd->serving_wl_type);
	2429	cfqg_stats_update_avg_queue_size(cfqq->cfqg);
	2430	cfqq->slice_start = 0;
	2431	cfqq->dispatch_start = jiffies;
	2432	cfqq->allocated_slice = 0;
	2433	cfqq->slice_end = 0;
	2434	cfqq->slice_dispatch = 0;
	2435	cfqq->nr_sectors = 0;
	2436
	2437	cfq_clear_cfqq_wait_request(cfqq);
	2438	cfq_clear_cfqq_must_dispatch(cfqq);
	2439	cfq_clear_cfqq_must_alloc_slice(cfqq);
	2440	cfq_clear_cfqq_fifo_expire(cfqq);
	2441	cfq_mark_cfqq_slice_new(cfqq);
	2442
	2443	cfq_del_timer(cfqd, cfqq);
	2444	}
	2445
	2446	cfqd->active_queue = cfqq;
	2447	}
	2448
	2449	/*
	2450	* current cfqq expired its slice (or was too idle), select new one
	2451	*/
	2452	static void
	2453	__cfq_slice_expired(struct cfq_data cfqd, struct cfq_queue cfqq,
	2454	bool timed_out)
	2455	{
	2456	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
	2457
	2458	if (cfq_cfqq_wait_request(cfqq))
	2459	cfq_del_timer(cfqd, cfqq);
	2460
	2461	cfq_clear_cfqq_wait_request(cfqq);
	2462	cfq_clear_cfqq_wait_busy(cfqq);
	2463
	2464	/*
	2465	* If this cfqq is shared between multiple processes, check to
	2466	* make sure that those processes are still issuing I/Os within
	2467	* the mean seek distance. If not, it may be time to break the
	2468	* queues apart again.
	2469	*/
	2470	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
	2471	cfq_mark_cfqq_split_coop(cfqq);
	2472
	2473	/*
	2474	* store what was left of this slice, if the queue idled/timed out
	2475	*/
	2476	if (timed_out) {
	2477	if (cfq_cfqq_slice_new(cfqq))
	2478	cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
	2479	else
	2480	cfqq->slice_resid = cfqq->slice_end - jiffies;
	2481	cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
	2482	}
	2483
	2484	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
	2485
	2486	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
	2487	cfq_del_cfqq_rr(cfqd, cfqq);
	2488
	2489	cfq_resort_rr_list(cfqd, cfqq);
	2490
	2491	if (cfqq == cfqd->active_queue)
	2492	cfqd->active_queue = NULL;
	2493
	2494	if (cfqd->active_cic) {
	2495	put_io_context(cfqd->active_cic->icq.ioc);
	2496	cfqd->active_cic = NULL;
	2497	}
	2498	}
	2499
	2500	static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
	2501	{
	2502	struct cfq_queue *cfqq = cfqd->active_queue;
	2503
	2504	if (cfqq)
	2505	__cfq_slice_expired(cfqd, cfqq, timed_out);
	2506	}
	2507
	2508	/*
	2509	* Get next queue for service. Unless we have a queue preemption,
	2510	* we'll simply select the first cfqq in the service tree.
	2511	*/
	2512	static struct cfq_queue cfq_get_next_queue(struct cfq_data cfqd)
	2513	{
	2514	struct cfq_rb_root *st = st_for(cfqd->serving_group,
	2515	cfqd->serving_wl_class, cfqd->serving_wl_type);
	2516
	2517	if (!cfqd->rq_queued)
	2518	return NULL;
	2519
	2520	/* There is nothing to dispatch */
	2521	if (!st)
	2522	return NULL;
	2523	if (RB_EMPTY_ROOT(&st->rb))
	2524	return NULL;
	2525	return cfq_rb_first(st);
	2526	}
	2527
	2528	static struct cfq_queue cfq_get_next_queue_forced(struct cfq_data cfqd)
	2529	{
	2530	struct cfq_group *cfqg;
	2531	struct cfq_queue *cfqq;
	2532	int i, j;
	2533	struct cfq_rb_root *st;
	2534
	2535	if (!cfqd->rq_queued)
	2536	return NULL;
	2537
	2538	cfqg = cfq_get_next_cfqg(cfqd);
	2539	if (!cfqg)
	2540	return NULL;
	2541
	2542	for_each_cfqg_st(cfqg, i, j, st)
	2543	if ((cfqq = cfq_rb_first(st)) != NULL)
	2544	return cfqq;
	2545	return NULL;
	2546	}
	2547
	2548	/*
	2549	* Get and set a new active queue for service.
	2550	*/
	2551	static struct cfq_queue cfq_set_active_queue(struct cfq_data cfqd,
	2552	struct cfq_queue *cfqq)
	2553	{
	2554	if (!cfqq)
	2555	cfqq = cfq_get_next_queue(cfqd);
	2556
	2557	__cfq_set_active_queue(cfqd, cfqq);
	2558	return cfqq;
	2559	}
	2560
	2561	static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
	2562	struct request *rq)
	2563	{
	2564	if (blk_rq_pos(rq) >= cfqd->last_position)
	2565	return blk_rq_pos(rq) - cfqd->last_position;
	2566	else
	2567	return cfqd->last_position - blk_rq_pos(rq);
	2568	}
	2569
	2570	static inline int cfq_rq_close(struct cfq_data cfqd, struct cfq_queue cfqq,
	2571	struct request *rq)
	2572	{
	2573	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
	2574	}
	2575
	2576	static struct cfq_queue cfqq_close(struct cfq_data cfqd,
	2577	struct cfq_queue *cur_cfqq)
	2578	{
	2579	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
	2580	struct rb_node parent, node;
	2581	struct cfq_queue *__cfqq;
	2582	sector_t sector = cfqd->last_position;
	2583
	2584	if (RB_EMPTY_ROOT(root))
	2585	return NULL;
	2586
	2587	/*
	2588	* First, if we find a request starting at the end of the last
	2589	* request, choose it.
	2590	*/
	2591	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
	2592	if (__cfqq)
	2593	return __cfqq;
	2594
	2595	/*
	2596	* If the exact sector wasn't found, the parent of the NULL leaf
	2597	* will contain the closest sector.
	2598	*/
	2599	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
	2600	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
	2601	return __cfqq;
	2602
	2603	if (blk_rq_pos(__cfqq->next_rq) < sector)
	2604	node = rb_next(&__cfqq->p_node);
	2605	else
	2606	node = rb_prev(&__cfqq->p_node);
	2607	if (!node)
	2608	return NULL;
	2609
	2610	__cfqq = rb_entry(node, struct cfq_queue, p_node);
	2611	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
	2612	return __cfqq;
	2613
	2614	return NULL;
	2615	}
	2616
	2617	/*
	2618	* cfqd - obvious
	2619	* cur_cfqq - passed in so that we don't decide that the current queue is
	2620	* closely cooperating with itself.
	2621	*
	2622	* So, basically we're assuming that that cur_cfqq has dispatched at least
	2623	* one request, and that cfqd->last_position reflects a position on the disk
	2624	* associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
	2625	* assumption.
	2626	*/
	2627	static struct cfq_queue cfq_close_cooperator(struct cfq_data cfqd,
	2628	struct cfq_queue *cur_cfqq)
	2629	{
	2630	struct cfq_queue *cfqq;
	2631
	2632	if (cfq_class_idle(cur_cfqq))
	2633	return NULL;
	2634	if (!cfq_cfqq_sync(cur_cfqq))
	2635	return NULL;
	2636	if (CFQQ_SEEKY(cur_cfqq))
	2637	return NULL;
	2638
	2639	/*
	2640	* Don't search priority tree if it's the only queue in the group.
	2641	*/
	2642	if (cur_cfqq->cfqg->nr_cfqq == 1)
	2643	return NULL;
	2644
	2645	/*
	2646	* We should notice if some of the queues are cooperating, eg
	2647	* working closely on the same area of the disk. In that case,
	2648	* we can group them together and don't waste time idling.
	2649	*/
	2650	cfqq = cfqq_close(cfqd, cur_cfqq);
	2651	if (!cfqq)
	2652	return NULL;
	2653
	2654	/* If new queue belongs to different cfq_group, don't choose it */
	2655	if (cur_cfqq->cfqg != cfqq->cfqg)
	2656	return NULL;
	2657
	2658	/*
	2659	* It only makes sense to merge sync queues.
	2660	*/
	2661	if (!cfq_cfqq_sync(cfqq))
	2662	return NULL;
	2663	if (CFQQ_SEEKY(cfqq))
	2664	return NULL;
	2665
	2666	/*
	2667	* Do not merge queues of different priority classes
	2668	*/
	2669	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
	2670	return NULL;
	2671
	2672	return cfqq;
	2673	}
	2674
	2675	/*
	2676	* Determine whether we should enforce idle window for this queue.
	2677	*/
	2678
	2679	static bool cfq_should_idle(struct cfq_data cfqd, struct cfq_queue cfqq)
	2680	{
	2681	enum wl_class_t wl_class = cfqq_class(cfqq);
	2682	struct cfq_rb_root *st = cfqq->service_tree;
	2683
	2684	BUG_ON(!st);
	2685	BUG_ON(!st->count);
	2686
	2687	if (!cfqd->cfq_slice_idle)
	2688	return false;
	2689
	2690	/* We never do for idle class queues. */
	2691	if (wl_class == IDLE_WORKLOAD)
	2692	return false;
	2693
	2694	/* We do for queues that were marked with idle window flag. */
	2695	if (cfq_cfqq_idle_window(cfqq) &&
	2696	!(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
	2697	return true;
	2698
	2699	/*
	2700	* Otherwise, we do only if they are the last ones
	2701	* in their service tree.
	2702	*/
	2703	if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
	2704	!cfq_io_thinktime_big(cfqd, &st->ttime, false))
	2705	return true;
	2706	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
	2707	return false;
	2708	}
	2709
	2710	static void cfq_arm_slice_timer(struct cfq_data *cfqd)
	2711	{
	2712	struct cfq_queue *cfqq = cfqd->active_queue;
	2713	struct cfq_io_cq *cic;
	2714	unsigned long sl, group_idle = 0;
	2715
	2716	/*
	2717	* SSD device without seek penalty, disable idling. But only do so
	2718	* for devices that support queuing, otherwise we still have a problem
	2719	* with sync vs async workloads.
	2720	*/
	2721	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
	2722	return;
	2723
	2724	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
	2725	WARN_ON(cfq_cfqq_slice_new(cfqq));
	2726
	2727	/*
	2728	* idle is disabled, either manually or by past process history
	2729	*/
	2730	if (!cfq_should_idle(cfqd, cfqq)) {
	2731	/* no queue idling. Check for group idling */
	2732	if (cfqd->cfq_group_idle)
	2733	group_idle = cfqd->cfq_group_idle;
	2734	else
	2735	return;
	2736	}
	2737
	2738	/*
	2739	* still active requests from this queue, don't idle
	2740	*/
	2741	if (cfqq->dispatched)
	2742	return;
	2743
	2744	/*
	2745	* task has exited, don't wait
	2746	*/
	2747	cic = cfqd->active_cic;
	2748	if (!cic \|\| !atomic_read(&cic->icq.ioc->active_ref))
	2749	return;
	2750
	2751	/*
	2752	* If our average think time is larger than the remaining time
	2753	* slice, then don't idle. This avoids overrunning the allotted
	2754	* time slice.
	2755	*/
	2756	if (sample_valid(cic->ttime.ttime_samples) &&
	2757	(cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
	2758	cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
	2759	cic->ttime.ttime_mean);
	2760	return;
	2761	}
	2762
	2763	/* There are other queues in the group, don't do group idle */
	2764	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
	2765	return;
	2766
	2767	cfq_mark_cfqq_wait_request(cfqq);
	2768
	2769	if (group_idle)
	2770	sl = cfqd->cfq_group_idle;
	2771	else
	2772	sl = cfqd->cfq_slice_idle;
	2773
	2774	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
	2775	cfqg_stats_set_start_idle_time(cfqq->cfqg);
	2776	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
	2777	group_idle ? 1 : 0);
	2778	}
	2779
	2780	/*
	2781	* Move request from internal lists to the request queue dispatch list.
	2782	*/
	2783	static void cfq_dispatch_insert(struct request_queue q, struct request rq)
	2784	{
	2785	struct cfq_data *cfqd = q->elevator->elevator_data;
	2786	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	2787
	2788	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
	2789
	2790	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
	2791	cfq_remove_request(rq);
	2792	cfqq->dispatched++;
	2793	(RQ_CFQG(rq))->dispatched++;
	2794	elv_dispatch_sort(q, rq);
	2795
	2796	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
	2797	cfqq->nr_sectors += blk_rq_sectors(rq);
	2798	cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
	2799	}
	2800
	2801	/*
	2802	* return expired entry, or NULL to just start from scratch in rbtree
	2803	*/
	2804	static struct request cfq_check_fifo(struct cfq_queue cfqq)
	2805	{
	2806	struct request *rq = NULL;
	2807
	2808	if (cfq_cfqq_fifo_expire(cfqq))
	2809	return NULL;
	2810
	2811	cfq_mark_cfqq_fifo_expire(cfqq);
	2812
	2813	if (list_empty(&cfqq->fifo))
	2814	return NULL;
	2815
	2816	rq = rq_entry_fifo(cfqq->fifo.next);
	2817	if (time_before(jiffies, rq_fifo_time(rq)))
	2818	rq = NULL;
	2819
	2820	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
	2821	return rq;
	2822	}
	2823
	2824	static inline int
	2825	cfq_prio_to_maxrq(struct cfq_data cfqd, struct cfq_queue cfqq)
	2826	{
	2827	const int base_rq = cfqd->cfq_slice_async_rq;
	2828
	2829	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
	2830
	2831	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
	2832	}
	2833
	2834	/*
	2835	* Must be called with the queue_lock held.
	2836	*/
	2837	static int cfqq_process_refs(struct cfq_queue *cfqq)
	2838	{
	2839	int process_refs, io_refs;
	2840
	2841	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
	2842	process_refs = cfqq->ref - io_refs;
	2843	BUG_ON(process_refs < 0);
	2844	return process_refs;
	2845	}
	2846
	2847	static void cfq_setup_merge(struct cfq_queue cfqq, struct cfq_queue new_cfqq)
	2848	{
	2849	int process_refs, new_process_refs;
	2850	struct cfq_queue *__cfqq;
	2851
	2852	/*
	2853	* If there are no process references on the new_cfqq, then it is
	2854	* unsafe to follow the ->new_cfqq chain as other cfqq's in the
	2855	* chain may have dropped their last reference (not just their
	2856	* last process reference).
	2857	*/
	2858	if (!cfqq_process_refs(new_cfqq))
	2859	return;
	2860
	2861	/* Avoid a circular list and skip interim queue merges */
	2862	while ((__cfqq = new_cfqq->new_cfqq)) {
	2863	if (__cfqq == cfqq)
	2864	return;
	2865	new_cfqq = __cfqq;
	2866	}
	2867
	2868	process_refs = cfqq_process_refs(cfqq);
	2869	new_process_refs = cfqq_process_refs(new_cfqq);
	2870	/*
	2871	* If the process for the cfqq has gone away, there is no
	2872	* sense in merging the queues.
	2873	*/
	2874	if (process_refs == 0 \|\| new_process_refs == 0)
	2875	return;
	2876
	2877	/*
	2878	* Merge in the direction of the lesser amount of work.
	2879	*/
	2880	if (new_process_refs >= process_refs) {
	2881	cfqq->new_cfqq = new_cfqq;
	2882	new_cfqq->ref += process_refs;
	2883	} else {
	2884	new_cfqq->new_cfqq = cfqq;
	2885	cfqq->ref += new_process_refs;
	2886	}
	2887	}
	2888
	2889	static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
	2890	struct cfq_group *cfqg, enum wl_class_t wl_class)
	2891	{
	2892	struct cfq_queue *queue;
	2893	int i;
	2894	bool key_valid = false;
	2895	unsigned long lowest_key = 0;
	2896	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
	2897
	2898	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
	2899	/* select the one with lowest rb_key */
	2900	queue = cfq_rb_first(st_for(cfqg, wl_class, i));
	2901	if (queue &&
	2902	(!key_valid \|\| time_before(queue->rb_key, lowest_key))) {
	2903	lowest_key = queue->rb_key;
	2904	cur_best = i;
	2905	key_valid = true;
	2906	}
	2907	}
	2908
	2909	return cur_best;
	2910	}
	2911
	2912	static void
	2913	choose_wl_class_and_type(struct cfq_data cfqd, struct cfq_group cfqg)
	2914	{
	2915	unsigned slice;
	2916	unsigned count;
	2917	struct cfq_rb_root *st;
	2918	unsigned group_slice;
	2919	enum wl_class_t original_class = cfqd->serving_wl_class;
	2920
	2921	/* Choose next priority. RT > BE > IDLE */
	2922	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
	2923	cfqd->serving_wl_class = RT_WORKLOAD;
	2924	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
	2925	cfqd->serving_wl_class = BE_WORKLOAD;
	2926	else {
	2927	cfqd->serving_wl_class = IDLE_WORKLOAD;
	2928	cfqd->workload_expires = jiffies + 1;
	2929	return;
	2930	}
	2931
	2932	if (original_class != cfqd->serving_wl_class)
	2933	goto new_workload;
	2934
	2935	/*
	2936	* For RT and BE, we have to choose also the type
	2937	* (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
	2938	* expiration time
	2939	*/
	2940	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
	2941	count = st->count;
	2942
	2943	/*
	2944	* check workload expiration, and that we still have other queues ready
	2945	*/
	2946	if (count && !time_after(jiffies, cfqd->workload_expires))
	2947	return;
	2948
	2949	new_workload:
	2950	/* otherwise select new workload type */
	2951	cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
	2952	cfqd->serving_wl_class);
	2953	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
	2954	count = st->count;
	2955
	2956	/*
	2957	* the workload slice is computed as a fraction of target latency
	2958	* proportional to the number of queues in that workload, over
	2959	* all the queues in the same priority class
	2960	*/
	2961	group_slice = cfq_group_slice(cfqd, cfqg);
	2962
	2963	slice = group_slice * count /
	2964	max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
	2965	cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
	2966	cfqg));
	2967
	2968	if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
	2969	unsigned int tmp;
	2970
	2971	/*
	2972	* Async queues are currently system wide. Just taking
	2973	* proportion of queues with-in same group will lead to higher
	2974	* async ratio system wide as generally root group is going
	2975	* to have higher weight. A more accurate thing would be to
	2976	* calculate system wide asnc/sync ratio.
	2977	*/
	2978	tmp = cfqd->cfq_target_latency *
	2979	cfqg_busy_async_queues(cfqd, cfqg);
	2980	tmp = tmp/cfqd->busy_queues;
	2981	slice = min_t(unsigned, slice, tmp);
	2982
	2983	/* async workload slice is scaled down according to
	2984	* the sync/async slice ratio. */
	2985	slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
	2986	} else
	2987	/* sync workload slice is at least 2 * cfq_slice_idle */
	2988	slice = max(slice, 2 * cfqd->cfq_slice_idle);
	2989
	2990	slice = max_t(unsigned, slice, CFQ_MIN_TT);
	2991	cfq_log(cfqd, "workload slice:%d", slice);
	2992	cfqd->workload_expires = jiffies + slice;
	2993	}
	2994
	2995	static struct cfq_group cfq_get_next_cfqg(struct cfq_data cfqd)
	2996	{
	2997	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	2998	struct cfq_group *cfqg;
	2999
	3000	if (RB_EMPTY_ROOT(&st->rb))
	3001	return NULL;
	3002	cfqg = cfq_rb_first_group(st);
	3003	update_min_vdisktime(st);
	3004	return cfqg;
	3005	}
	3006
	3007	static void cfq_choose_cfqg(struct cfq_data *cfqd)
	3008	{
	3009	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
	3010
	3011	cfqd->serving_group = cfqg;
	3012
	3013	/* Restore the workload type data */
	3014	if (cfqg->saved_wl_slice) {
	3015	cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
	3016	cfqd->serving_wl_type = cfqg->saved_wl_type;
	3017	cfqd->serving_wl_class = cfqg->saved_wl_class;
	3018	} else
	3019	cfqd->workload_expires = jiffies - 1;
	3020
	3021	choose_wl_class_and_type(cfqd, cfqg);
	3022	}
	3023
	3024	/*
	3025	* Select a queue for service. If we have a current active queue,
	3026	* check whether to continue servicing it, or retrieve and set a new one.
	3027	*/
	3028	static struct cfq_queue cfq_select_queue(struct cfq_data cfqd)
	3029	{
	3030	struct cfq_queue cfqq, new_cfqq = NULL;
	3031
	3032	cfqq = cfqd->active_queue;
	3033	if (!cfqq)
	3034	goto new_queue;
	3035
	3036	if (!cfqd->rq_queued)
	3037	return NULL;
	3038
	3039	/*
	3040	* We were waiting for group to get backlogged. Expire the queue
	3041	*/
	3042	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
	3043	goto expire;
	3044
	3045	/*
	3046	* The active queue has run out of time, expire it and select new.
	3047	*/
	3048	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
	3049	/*
	3050	* If slice had not expired at the completion of last request
	3051	* we might not have turned on wait_busy flag. Don't expire
	3052	* the queue yet. Allow the group to get backlogged.
	3053	*
	3054	* The very fact that we have used the slice, that means we
	3055	* have been idling all along on this queue and it should be
	3056	* ok to wait for this request to complete.
	3057	*/
	3058	if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
	3059	&& cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
	3060	cfqq = NULL;
	3061	goto keep_queue;
	3062	} else
	3063	goto check_group_idle;
	3064	}
	3065
	3066	/*
	3067	* The active queue has requests and isn't expired, allow it to
	3068	* dispatch.
	3069	*/
	3070	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
	3071	goto keep_queue;
	3072
	3073	/*
	3074	* If another queue has a request waiting within our mean seek
	3075	* distance, let it run. The expire code will check for close
	3076	* cooperators and put the close queue at the front of the service
	3077	* tree. If possible, merge the expiring queue with the new cfqq.
	3078	*/
	3079	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
	3080	if (new_cfqq) {
	3081	if (!cfqq->new_cfqq)
	3082	cfq_setup_merge(cfqq, new_cfqq);
	3083	goto expire;
	3084	}
	3085
	3086	/*
	3087	* No requests pending. If the active queue still has requests in
	3088	* flight or is idling for a new request, allow either of these
	3089	* conditions to happen (or time out) before selecting a new queue.
	3090	*/
	3091	if (timer_pending(&cfqd->idle_slice_timer)) {
	3092	cfqq = NULL;
	3093	goto keep_queue;
	3094	}
	3095
	3096	/*
	3097	* This is a deep seek queue, but the device is much faster than
	3098	* the queue can deliver, don't idle
	3099	**/
	3100	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
	3101	(cfq_cfqq_slice_new(cfqq) \|\|
	3102	(cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
	3103	cfq_clear_cfqq_deep(cfqq);
	3104	cfq_clear_cfqq_idle_window(cfqq);
	3105	}
	3106
	3107	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
	3108	cfqq = NULL;
	3109	goto keep_queue;
	3110	}
	3111
	3112	/*
	3113	* If group idle is enabled and there are requests dispatched from
	3114	* this group, wait for requests to complete.
	3115	*/
	3116	check_group_idle:
	3117	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
	3118	cfqq->cfqg->dispatched &&
	3119	!cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
	3120	cfqq = NULL;
	3121	goto keep_queue;
	3122	}
	3123
	3124	expire:
	3125	cfq_slice_expired(cfqd, 0);
	3126	new_queue:
	3127	/*
	3128	* Current queue expired. Check if we have to switch to a new
	3129	* service tree
	3130	*/
	3131	if (!new_cfqq)
	3132	cfq_choose_cfqg(cfqd);
	3133
	3134	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
	3135	keep_queue:
	3136	return cfqq;
	3137	}
	3138
	3139	static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
	3140	{
	3141	int dispatched = 0;
	3142
	3143	while (cfqq->next_rq) {
	3144	cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
	3145	dispatched++;
	3146	}
	3147
	3148	BUG_ON(!list_empty(&cfqq->fifo));
	3149
	3150	/* By default cfqq is not expired if it is empty. Do it explicitly */
	3151	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
	3152	return dispatched;
	3153	}
	3154
	3155	/*
	3156	* Drain our current requests. Used for barriers and when switching
	3157	* io schedulers on-the-fly.
	3158	*/
	3159	static int cfq_forced_dispatch(struct cfq_data *cfqd)
	3160	{
	3161	struct cfq_queue *cfqq;
	3162	int dispatched = 0;
	3163
	3164	/* Expire the timeslice of the current active queue first */
	3165	cfq_slice_expired(cfqd, 0);
	3166	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
	3167	__cfq_set_active_queue(cfqd, cfqq);
	3168	dispatched += __cfq_forced_dispatch_cfqq(cfqq);
	3169	}
	3170
	3171	BUG_ON(cfqd->busy_queues);
	3172
	3173	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
	3174	return dispatched;
	3175	}
	3176
	3177	static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
	3178	struct cfq_queue *cfqq)
	3179	{
	3180	/* the queue hasn't finished any request, can't estimate */
	3181	if (cfq_cfqq_slice_new(cfqq))
	3182	return true;
	3183	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
	3184	cfqq->slice_end))
	3185	return true;
	3186
	3187	return false;
	3188	}
	3189
	3190	static bool cfq_may_dispatch(struct cfq_data cfqd, struct cfq_queue cfqq)
	3191	{
	3192	unsigned int max_dispatch;
	3193
	3194	/*
	3195	* Drain async requests before we start sync IO
	3196	*/
	3197	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
	3198	return false;
	3199
	3200	/*
	3201	* If this is an async queue and we have sync IO in flight, let it wait
	3202	*/
	3203	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
	3204	return false;
	3205
	3206	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
	3207	if (cfq_class_idle(cfqq))
	3208	max_dispatch = 1;
	3209
	3210	/*
	3211	* Does this cfqq already have too much IO in flight?
	3212	*/
	3213	if (cfqq->dispatched >= max_dispatch) {
	3214	bool promote_sync = false;
	3215	/*
	3216	* idle queue must always only have a single IO in flight
	3217	*/
	3218	if (cfq_class_idle(cfqq))
	3219	return false;
	3220
	3221	/*
	3222	* If there is only one sync queue
	3223	* we can ignore async queue here and give the sync
	3224	* queue no dispatch limit. The reason is a sync queue can
	3225	* preempt async queue, limiting the sync queue doesn't make
	3226	* sense. This is useful for aiostress test.
	3227	*/
	3228	if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
	3229	promote_sync = true;
	3230
	3231	/*
	3232	* We have other queues, don't allow more IO from this one
	3233	*/
	3234	if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
	3235	!promote_sync)
	3236	return false;
	3237
	3238	/*
	3239	* Sole queue user, no limit
	3240	*/
	3241	if (cfqd->busy_queues == 1 \|\| promote_sync)
	3242	max_dispatch = -1;
	3243	else
	3244	/*
	3245	* Normally we start throttling cfqq when cfq_quantum/2
	3246	* requests have been dispatched. But we can drive
	3247	* deeper queue depths at the beginning of slice
	3248	* subjected to upper limit of cfq_quantum.
	3249	* */
	3250	max_dispatch = cfqd->cfq_quantum;
	3251	}
	3252
	3253	/*
	3254	* Async queues must wait a bit before being allowed dispatch.
	3255	* We also ramp up the dispatch depth gradually for async IO,
	3256	* based on the last sync IO we serviced
	3257	*/
	3258	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
	3259	unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
	3260	unsigned int depth;
	3261
	3262	depth = last_sync / cfqd->cfq_slice[1];
	3263	if (!depth && !cfqq->dispatched)
	3264	depth = 1;
	3265	if (depth < max_dispatch)
	3266	max_dispatch = depth;
	3267	}
	3268
	3269	/*
	3270	* If we're below the current max, allow a dispatch
	3271	*/
	3272	return cfqq->dispatched < max_dispatch;
	3273	}
	3274
	3275	/*
	3276	* Dispatch a request from cfqq, moving them to the request queue
	3277	* dispatch list.
	3278	*/
	3279	static bool cfq_dispatch_request(struct cfq_data cfqd, struct cfq_queue cfqq)
	3280	{
	3281	struct request *rq;
	3282
	3283	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
	3284
	3285	if (!cfq_may_dispatch(cfqd, cfqq))
	3286	return false;
	3287
	3288	/*
	3289	* follow expired path, else get first next available
	3290	*/
	3291	rq = cfq_check_fifo(cfqq);
	3292	if (!rq)
	3293	rq = cfqq->next_rq;
	3294
	3295	/*
	3296	* insert request into driver dispatch list
	3297	*/
	3298	cfq_dispatch_insert(cfqd->queue, rq);
	3299
	3300	if (!cfqd->active_cic) {
	3301	struct cfq_io_cq *cic = RQ_CIC(rq);
	3302
	3303	atomic_long_inc(&cic->icq.ioc->refcount);
	3304	cfqd->active_cic = cic;
	3305	}
	3306
	3307	return true;
	3308	}
	3309
	3310	/*
	3311	* Find the cfqq that we need to service and move a request from that to the
	3312	* dispatch list
	3313	*/
	3314	static int cfq_dispatch_requests(struct request_queue *q, int force)
	3315	{
	3316	struct cfq_data *cfqd = q->elevator->elevator_data;
	3317	struct cfq_queue *cfqq;
	3318
	3319	if (!cfqd->busy_queues)
	3320	return 0;
	3321
	3322	if (unlikely(force))
	3323	return cfq_forced_dispatch(cfqd);
	3324
	3325	cfqq = cfq_select_queue(cfqd);
	3326	if (!cfqq)
	3327	return 0;
	3328
	3329	/*
	3330	* Dispatch a request from this cfqq, if it is allowed
	3331	*/
	3332	if (!cfq_dispatch_request(cfqd, cfqq))
	3333	return 0;
	3334
	3335	cfqq->slice_dispatch++;
	3336	cfq_clear_cfqq_must_dispatch(cfqq);
	3337
	3338	/*
	3339	* expire an async queue immediately if it has used up its slice. idle
	3340	* queue always expire after 1 dispatch round.
	3341	*/
	3342	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
	3343	cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) \|\|
	3344	cfq_class_idle(cfqq))) {
	3345	cfqq->slice_end = jiffies + 1;
	3346	cfq_slice_expired(cfqd, 0);
	3347	}
	3348
	3349	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
	3350	return 1;
	3351	}
	3352
	3353	/*
	3354	* task holds one reference to the queue, dropped when task exits. each rq
	3355	* in-flight on this queue also holds a reference, dropped when rq is freed.
	3356	*
	3357	* Each cfq queue took a reference on the parent group. Drop it now.
	3358	* queue lock must be held here.
	3359	*/
	3360	static void cfq_put_queue(struct cfq_queue *cfqq)
	3361	{
	3362	struct cfq_data *cfqd = cfqq->cfqd;
	3363	struct cfq_group *cfqg;
	3364
	3365	BUG_ON(cfqq->ref <= 0);
	3366
	3367	cfqq->ref--;
	3368	if (cfqq->ref)
	3369	return;
	3370
	3371	cfq_log_cfqq(cfqd, cfqq, "put_queue");
	3372	BUG_ON(rb_first(&cfqq->sort_list));
	3373	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
	3374	cfqg = cfqq->cfqg;
	3375
	3376	if (unlikely(cfqd->active_queue == cfqq)) {
	3377	__cfq_slice_expired(cfqd, cfqq, 0);
	3378	cfq_schedule_dispatch(cfqd);
	3379	}
	3380
	3381	BUG_ON(cfq_cfqq_on_rr(cfqq));
	3382	kmem_cache_free(cfq_pool, cfqq);
	3383	cfqg_put(cfqg);
	3384	}
	3385
	3386	static void cfq_put_cooperator(struct cfq_queue *cfqq)
	3387	{
	3388	struct cfq_queue __cfqq, next;
	3389
	3390	/*
	3391	* If this queue was scheduled to merge with another queue, be
	3392	* sure to drop the reference taken on that queue (and others in
	3393	* the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
	3394	*/
	3395	__cfqq = cfqq->new_cfqq;
	3396	while (__cfqq) {
	3397	if (__cfqq == cfqq) {
	3398	WARN(1, "cfqq->new_cfqq loop detected\n");
	3399	break;
	3400	}
	3401	next = __cfqq->new_cfqq;
	3402	cfq_put_queue(__cfqq);
	3403	__cfqq = next;
	3404	}
	3405	}
	3406
	3407	static void cfq_exit_cfqq(struct cfq_data cfqd, struct cfq_queue cfqq)
	3408	{
	3409	if (unlikely(cfqq == cfqd->active_queue)) {
	3410	__cfq_slice_expired(cfqd, cfqq, 0);
	3411	cfq_schedule_dispatch(cfqd);
	3412	}
	3413
	3414	cfq_put_cooperator(cfqq);
	3415
	3416	cfq_put_queue(cfqq);
	3417	}
	3418
	3419	static void cfq_init_icq(struct io_cq *icq)
	3420	{
	3421	struct cfq_io_cq *cic = icq_to_cic(icq);
	3422
	3423	cic->ttime.last_end_request = jiffies;
	3424	}
	3425
	3426	static void cfq_exit_icq(struct io_cq *icq)
	3427	{
	3428	struct cfq_io_cq *cic = icq_to_cic(icq);
	3429	struct cfq_data *cfqd = cic_to_cfqd(cic);
	3430
	3431	if (cic->cfqq[BLK_RW_ASYNC]) {
	3432	cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
	3433	cic->cfqq[BLK_RW_ASYNC] = NULL;
	3434	}
	3435
	3436	if (cic->cfqq[BLK_RW_SYNC]) {
	3437	cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
	3438	cic->cfqq[BLK_RW_SYNC] = NULL;
	3439	}
	3440	}
	3441
	3442	static void cfq_init_prio_data(struct cfq_queue cfqq, struct cfq_io_cq cic)
	3443	{
	3444	struct task_struct *tsk = current;
	3445	int ioprio_class;
	3446
	3447	if (!cfq_cfqq_prio_changed(cfqq))
	3448	return;
	3449
	3450	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
	3451	switch (ioprio_class) {
	3452	default:
	3453	printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
	3454	case IOPRIO_CLASS_NONE:
	3455	/*
	3456	* no prio set, inherit CPU scheduling settings
	3457	*/
	3458	cfqq->ioprio = task_nice_ioprio(tsk);
	3459	cfqq->ioprio_class = task_nice_ioclass(tsk);
	3460	break;
	3461	case IOPRIO_CLASS_RT:
	3462	cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
	3463	cfqq->ioprio_class = IOPRIO_CLASS_RT;
	3464	break;
	3465	case IOPRIO_CLASS_BE:
	3466	cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
	3467	cfqq->ioprio_class = IOPRIO_CLASS_BE;
	3468	break;
	3469	case IOPRIO_CLASS_IDLE:
	3470	cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
	3471	cfqq->ioprio = 7;
	3472	cfq_clear_cfqq_idle_window(cfqq);
	3473	break;
	3474	}
	3475
	3476	/*
	3477	* keep track of original prio settings in case we have to temporarily
	3478	* elevate the priority of this queue
	3479	*/
	3480	cfqq->org_ioprio = cfqq->ioprio;
	3481	cfq_clear_cfqq_prio_changed(cfqq);
	3482	}
	3483
	3484	static void check_ioprio_changed(struct cfq_io_cq cic, struct bio bio)
	3485	{
	3486	int ioprio = cic->icq.ioc->ioprio;
	3487	struct cfq_data *cfqd = cic_to_cfqd(cic);
	3488	struct cfq_queue *cfqq;
	3489
	3490	/*
	3491	* Check whether ioprio has changed. The condition may trigger
	3492	* spuriously on a newly created cic but there's no harm.
	3493	*/
	3494	if (unlikely(!cfqd) \|\| likely(cic->ioprio == ioprio))
	3495	return;
	3496
	3497	cfqq = cic->cfqq[BLK_RW_ASYNC];
	3498	if (cfqq) {
	3499	struct cfq_queue *new_cfqq;
	3500	new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
	3501	GFP_ATOMIC);
	3502	if (new_cfqq) {
	3503	cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
	3504	cfq_put_queue(cfqq);
	3505	}
	3506	}
	3507
	3508	cfqq = cic->cfqq[BLK_RW_SYNC];
	3509	if (cfqq)
	3510	cfq_mark_cfqq_prio_changed(cfqq);
	3511
	3512	cic->ioprio = ioprio;
	3513	}
	3514
	3515	static void cfq_init_cfqq(struct cfq_data cfqd, struct cfq_queue cfqq,
	3516	pid_t pid, bool is_sync)
	3517	{
	3518	RB_CLEAR_NODE(&cfqq->rb_node);
	3519	RB_CLEAR_NODE(&cfqq->p_node);
	3520	INIT_LIST_HEAD(&cfqq->fifo);
	3521
	3522	cfqq->ref = 0;
	3523	cfqq->cfqd = cfqd;
	3524
	3525	cfq_mark_cfqq_prio_changed(cfqq);
	3526
	3527	if (is_sync) {
	3528	if (!cfq_class_idle(cfqq))
	3529	cfq_mark_cfqq_idle_window(cfqq);
	3530	cfq_mark_cfqq_sync(cfqq);
	3531	}
	3532	cfqq->pid = pid;
	3533	}
	3534
	3535	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	3536	static void check_blkcg_changed(struct cfq_io_cq cic, struct bio bio)
	3537	{
	3538	struct cfq_data *cfqd = cic_to_cfqd(cic);
	3539	struct cfq_queue *sync_cfqq;
	3540	uint64_t id;
	3541
	3542	rcu_read_lock();
	3543	id = bio_blkcg(bio)->id;
	3544	rcu_read_unlock();
	3545
	3546	/*
	3547	* Check whether blkcg has changed. The condition may trigger
	3548	* spuriously on a newly created cic but there's no harm.
	3549	*/
	3550	if (unlikely(!cfqd) \|\| likely(cic->blkcg_id == id))
	3551	return;
	3552
	3553	sync_cfqq = cic_to_cfqq(cic, 1);
	3554	if (sync_cfqq) {
	3555	/*
	3556	* Drop reference to sync queue. A new sync queue will be
	3557	* assigned in new group upon arrival of a fresh request.
	3558	*/
	3559	cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
	3560	cic_set_cfqq(cic, NULL, 1);
	3561	cfq_put_queue(sync_cfqq);
	3562	}
	3563
	3564	cic->blkcg_id = id;
	3565	}
	3566	#else
	3567	static inline void check_blkcg_changed(struct cfq_io_cq cic, struct bio bio) { }
	3568	#endif /* CONFIG_CFQ_GROUP_IOSCHED */
	3569
	3570	static struct cfq_queue *
	3571	cfq_find_alloc_queue(struct cfq_data cfqd, bool is_sync, struct cfq_io_cq cic,
	3572	struct bio *bio, gfp_t gfp_mask)
	3573	{
	3574	struct blkcg *blkcg;
	3575	struct cfq_queue cfqq, new_cfqq = NULL;
	3576	struct cfq_group *cfqg;
	3577
	3578	retry:
	3579	rcu_read_lock();
	3580
	3581	blkcg = bio_blkcg(bio);
	3582	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
	3583	cfqq = cic_to_cfqq(cic, is_sync);
	3584
	3585	/*
	3586	* Always try a new alloc if we fell back to the OOM cfqq
	3587	* originally, since it should just be a temporary situation.
	3588	*/
	3589	if (!cfqq \|\| cfqq == &cfqd->oom_cfqq) {
	3590	cfqq = NULL;
	3591	if (new_cfqq) {
	3592	cfqq = new_cfqq;
	3593	new_cfqq = NULL;
	3594	} else if (gfp_mask & __GFP_WAIT) {
	3595	rcu_read_unlock();
	3596	spin_unlock_irq(cfqd->queue->queue_lock);
	3597	new_cfqq = kmem_cache_alloc_node(cfq_pool,
	3598	gfp_mask \| __GFP_ZERO,
	3599	cfqd->queue->node);
	3600	spin_lock_irq(cfqd->queue->queue_lock);
	3601	if (new_cfqq)
	3602	goto retry;
	3603	else
	3604	return &cfqd->oom_cfqq;
	3605	} else {
	3606	cfqq = kmem_cache_alloc_node(cfq_pool,
	3607	gfp_mask \| __GFP_ZERO,
	3608	cfqd->queue->node);
	3609	}
	3610
	3611	if (cfqq) {
	3612	cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
	3613	cfq_init_prio_data(cfqq, cic);
	3614	cfq_link_cfqq_cfqg(cfqq, cfqg);
	3615	cfq_log_cfqq(cfqd, cfqq, "alloced");
	3616	} else
	3617	cfqq = &cfqd->oom_cfqq;
	3618	}
	3619
	3620	if (new_cfqq)
	3621	kmem_cache_free(cfq_pool, new_cfqq);
	3622
	3623	rcu_read_unlock();
	3624	return cfqq;
	3625	}
	3626
	3627	static struct cfq_queue **
	3628	cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
	3629	{
	3630	switch (ioprio_class) {
	3631	case IOPRIO_CLASS_RT:
	3632	return &cfqd->async_cfqq[0][ioprio];
	3633	case IOPRIO_CLASS_NONE:
	3634	ioprio = IOPRIO_NORM;
	3635	/* fall through */
	3636	case IOPRIO_CLASS_BE:
	3637	return &cfqd->async_cfqq[1][ioprio];
	3638	case IOPRIO_CLASS_IDLE:
	3639	return &cfqd->async_idle_cfqq;
	3640	default:
	3641	BUG();
	3642	}
	3643	}
	3644
	3645	static struct cfq_queue *
	3646	cfq_get_queue(struct cfq_data cfqd, bool is_sync, struct cfq_io_cq cic,
	3647	struct bio *bio, gfp_t gfp_mask)
	3648	{
	3649	const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
	3650	const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
	3651	struct cfq_queue **async_cfqq = NULL;
	3652	struct cfq_queue *cfqq = NULL;
	3653
	3654	if (!is_sync) {
	3655	async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
	3656	cfqq = *async_cfqq;
	3657	}
	3658
	3659	if (!cfqq)
	3660	cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
	3661
	3662	/*
	3663	* pin the queue now that it's allocated, scheduler exit will prune it
	3664	*/
	3665	if (!is_sync && !(*async_cfqq)) {
	3666	cfqq->ref++;
	3667	*async_cfqq = cfqq;
	3668	}
	3669
	3670	cfqq->ref++;
	3671	return cfqq;
	3672	}
	3673
	3674	static void
	3675	__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
	3676	{
	3677	unsigned long elapsed = jiffies - ttime->last_end_request;
	3678	elapsed = min(elapsed, 2UL * slice_idle);
	3679
	3680	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
	3681	ttime->ttime_total = (7ttime->ttime_total + 256elapsed) / 8;
	3682	ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
	3683	}
	3684
	3685	static void
	3686	cfq_update_io_thinktime(struct cfq_data cfqd, struct cfq_queue cfqq,
	3687	struct cfq_io_cq *cic)
	3688	{
	3689	if (cfq_cfqq_sync(cfqq)) {
	3690	__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
	3691	__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
	3692	cfqd->cfq_slice_idle);
	3693	}
	3694	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	3695	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
	3696	#endif
	3697	}
	3698
	3699	static void
	3700	cfq_update_io_seektime(struct cfq_data cfqd, struct cfq_queue cfqq,
	3701	struct request *rq)
	3702	{
	3703	sector_t sdist = 0;
	3704	sector_t n_sec = blk_rq_sectors(rq);
	3705	if (cfqq->last_request_pos) {
	3706	if (cfqq->last_request_pos < blk_rq_pos(rq))
	3707	sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
	3708	else
	3709	sdist = cfqq->last_request_pos - blk_rq_pos(rq);
	3710	}
	3711
	3712	cfqq->seek_history <<= 1;
	3713	if (blk_queue_nonrot(cfqd->queue))
	3714	cfqq->seek_history \|= (n_sec < CFQQ_SECT_THR_NONROT);
	3715	else
	3716	cfqq->seek_history \|= (sdist > CFQQ_SEEK_THR);
	3717	}
	3718
	3719	/*
	3720	* Disable idle window if the process thinks too long or seeks so much that
	3721	* it doesn't matter
	3722	*/
	3723	static void
	3724	cfq_update_idle_window(struct cfq_data cfqd, struct cfq_queue cfqq,
	3725	struct cfq_io_cq *cic)
	3726	{
	3727	int old_idle, enable_idle;
	3728
	3729	/*
	3730	* Don't idle for async or idle io prio class
	3731	*/
	3732	if (!cfq_cfqq_sync(cfqq) \|\| cfq_class_idle(cfqq))
	3733	return;
	3734
	3735	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
	3736
	3737	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
	3738	cfq_mark_cfqq_deep(cfqq);
	3739
	3740	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
	3741	enable_idle = 0;
	3742	else if (!atomic_read(&cic->icq.ioc->active_ref) \|\|
	3743	!cfqd->cfq_slice_idle \|\|
	3744	(!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
	3745	enable_idle = 0;
	3746	else if (sample_valid(cic->ttime.ttime_samples)) {
	3747	if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
	3748	enable_idle = 0;
	3749	else
	3750	enable_idle = 1;
	3751	}
	3752
	3753	if (old_idle != enable_idle) {
	3754	cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
	3755	if (enable_idle)
	3756	cfq_mark_cfqq_idle_window(cfqq);
	3757	else
	3758	cfq_clear_cfqq_idle_window(cfqq);
	3759	}
	3760	}
	3761
	3762	/*
	3763	* Check if new_cfqq should preempt the currently active queue. Return 0 for
	3764	* no or if we aren't sure, a 1 will cause a preempt.
	3765	*/
	3766	static bool
	3767	cfq_should_preempt(struct cfq_data cfqd, struct cfq_queue new_cfqq,
	3768	struct request *rq)
	3769	{
	3770	struct cfq_queue *cfqq;
	3771
	3772	cfqq = cfqd->active_queue;
	3773	if (!cfqq)
	3774	return false;
	3775
	3776	if (cfq_class_idle(new_cfqq))
	3777	return false;
	3778
	3779	if (cfq_class_idle(cfqq))
	3780	return true;
	3781
	3782	/*
	3783	* Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
	3784	*/
	3785	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
	3786	return false;
	3787
	3788	/*
	3789	* if the new request is sync, but the currently running queue is
	3790	* not, let the sync request have priority.
	3791	*/
	3792	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
	3793	return true;
	3794
	3795	if (new_cfqq->cfqg != cfqq->cfqg)
	3796	return false;
	3797
	3798	if (cfq_slice_used(cfqq))
	3799	return true;
	3800
	3801	/* Allow preemption only if we are idling on sync-noidle tree */
	3802	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
	3803	cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
	3804	new_cfqq->service_tree->count == 2 &&
	3805	RB_EMPTY_ROOT(&cfqq->sort_list))
	3806	return true;
	3807
	3808	/*
	3809	* So both queues are sync. Let the new request get disk time if
	3810	* it's a metadata request and the current queue is doing regular IO.
	3811	*/
	3812	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
	3813	return true;
	3814
	3815	/*
	3816	* Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
	3817	*/
	3818	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
	3819	return true;
	3820
	3821	/* An idle queue should not be idle now for some reason */
	3822	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
	3823	return true;
	3824
	3825	if (!cfqd->active_cic \|\| !cfq_cfqq_wait_request(cfqq))
	3826	return false;
	3827
	3828	/*
	3829	* if this request is as-good as one we would expect from the
	3830	* current cfqq, let it preempt
	3831	*/
	3832	if (cfq_rq_close(cfqd, cfqq, rq))
	3833	return true;
	3834
	3835	return false;
	3836	}
	3837
	3838	/*
	3839	* cfqq preempts the active queue. if we allowed preempt with no slice left,
	3840	* let it have half of its nominal slice.
	3841	*/
	3842	static void cfq_preempt_queue(struct cfq_data cfqd, struct cfq_queue cfqq)
	3843	{
	3844	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
	3845
	3846	cfq_log_cfqq(cfqd, cfqq, "preempt");
	3847	cfq_slice_expired(cfqd, 1);
	3848
	3849	/*
	3850	* workload type is changed, don't save slice, otherwise preempt
	3851	* doesn't happen
	3852	*/
	3853	if (old_type != cfqq_type(cfqq))
	3854	cfqq->cfqg->saved_wl_slice = 0;
	3855
	3856	/*
	3857	* Put the new queue at the front of the of the current list,
	3858	* so we know that it will be selected next.
	3859	*/
	3860	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	3861
	3862	cfq_service_tree_add(cfqd, cfqq, 1);
	3863
	3864	cfqq->slice_end = 0;
	3865	cfq_mark_cfqq_slice_new(cfqq);
	3866	}
	3867
	3868	/*
	3869	* Called when a new fs request (rq) is added (to cfqq). Check if there's
	3870	* something we should do about it
	3871	*/
	3872	static void
	3873	cfq_rq_enqueued(struct cfq_data cfqd, struct cfq_queue cfqq,
	3874	struct request *rq)
	3875	{
	3876	struct cfq_io_cq *cic = RQ_CIC(rq);
	3877
	3878	cfqd->rq_queued++;
	3879	if (rq->cmd_flags & REQ_PRIO)
	3880	cfqq->prio_pending++;
	3881
	3882	cfq_update_io_thinktime(cfqd, cfqq, cic);
	3883	cfq_update_io_seektime(cfqd, cfqq, rq);
	3884	cfq_update_idle_window(cfqd, cfqq, cic);
	3885
	3886	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
	3887
	3888	if (cfqq == cfqd->active_queue) {
	3889	/*
	3890	* Remember that we saw a request from this process, but
	3891	* don't start queuing just yet. Otherwise we risk seeing lots
	3892	* of tiny requests, because we disrupt the normal plugging
	3893	* and merging. If the request is already larger than a single
	3894	* page, let it rip immediately. For that case we assume that
	3895	* merging is already done. Ditto for a busy system that
	3896	* has other work pending, don't risk delaying until the
	3897	* idle timer unplug to continue working.
	3898	*/
	3899	if (cfq_cfqq_wait_request(cfqq)) {
	3900	if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE \|\|
	3901	cfqd->busy_queues > 1) {
	3902	cfq_del_timer(cfqd, cfqq);
	3903	cfq_clear_cfqq_wait_request(cfqq);
	3904	__blk_run_queue(cfqd->queue);
	3905	} else {
	3906	cfqg_stats_update_idle_time(cfqq->cfqg);
	3907	cfq_mark_cfqq_must_dispatch(cfqq);
	3908	}
	3909	}
	3910	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
	3911	/*
	3912	* not the active queue - expire current slice if it is
	3913	* idle and has expired it's mean thinktime or this new queue
	3914	* has some old slice time left and is of higher priority or
	3915	* this new queue is RT and the current one is BE
	3916	*/
	3917	cfq_preempt_queue(cfqd, cfqq);
	3918	__blk_run_queue(cfqd->queue);
	3919	}
	3920	}
	3921
	3922	static void cfq_insert_request(struct request_queue q, struct request rq)
	3923	{
	3924	struct cfq_data *cfqd = q->elevator->elevator_data;
	3925	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	3926
	3927	cfq_log_cfqq(cfqd, cfqq, "insert_request");
	3928	cfq_init_prio_data(cfqq, RQ_CIC(rq));
	3929
	3930	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
	3931	list_add_tail(&rq->queuelist, &cfqq->fifo);
	3932	cfq_add_rq_rb(rq);
	3933	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
	3934	rq->cmd_flags);
	3935	cfq_rq_enqueued(cfqd, cfqq, rq);
	3936	}
	3937
	3938	/*
	3939	* Update hw_tag based on peak queue depth over 50 samples under
	3940	* sufficient load.
	3941	*/
	3942	static void cfq_update_hw_tag(struct cfq_data *cfqd)
	3943	{
	3944	struct cfq_queue *cfqq = cfqd->active_queue;
	3945
	3946	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
	3947	cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
	3948
	3949	if (cfqd->hw_tag == 1)
	3950	return;
	3951
	3952	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
	3953	cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
	3954	return;
	3955
	3956	/*
	3957	* If active queue hasn't enough requests and can idle, cfq might not
	3958	* dispatch sufficient requests to hardware. Don't zero hw_tag in this
	3959	* case
	3960	*/
	3961	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
	3962	cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
	3963	CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
	3964	return;
	3965
	3966	if (cfqd->hw_tag_samples++ < 50)
	3967	return;
	3968
	3969	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
	3970	cfqd->hw_tag = 1;
	3971	else
	3972	cfqd->hw_tag = 0;
	3973	}
	3974
	3975	static bool cfq_should_wait_busy(struct cfq_data cfqd, struct cfq_queue cfqq)
	3976	{
	3977	struct cfq_io_cq *cic = cfqd->active_cic;
	3978
	3979	/* If the queue already has requests, don't wait */
	3980	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
	3981	return false;
	3982
	3983	/* If there are other queues in the group, don't wait */
	3984	if (cfqq->cfqg->nr_cfqq > 1)
	3985	return false;
	3986
	3987	/* the only queue in the group, but think time is big */
	3988	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
	3989	return false;
	3990
	3991	if (cfq_slice_used(cfqq))
	3992	return true;
	3993
	3994	/* if slice left is less than think time, wait busy */
	3995	if (cic && sample_valid(cic->ttime.ttime_samples)
	3996	&& (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
	3997	return true;
	3998
	3999	/*
	4000	* If think times is less than a jiffy than ttime_mean=0 and above
	4001	* will not be true. It might happen that slice has not expired yet
	4002	* but will expire soon (4-5 ns) during select_queue(). To cover the
	4003	* case where think time is less than a jiffy, mark the queue wait
	4004	* busy if only 1 jiffy is left in the slice.
	4005	*/
	4006	if (cfqq->slice_end - jiffies == 1)
	4007	return true;
	4008
	4009	return false;
	4010	}
	4011
	4012	static void cfq_completed_request(struct request_queue q, struct request rq)
	4013	{
	4014	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	4015	struct cfq_data *cfqd = cfqq->cfqd;
	4016	const int sync = rq_is_sync(rq);
	4017	unsigned long now;
	4018
	4019	now = jiffies;
	4020	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
	4021	!!(rq->cmd_flags & REQ_NOIDLE));
	4022
	4023	cfq_update_hw_tag(cfqd);
	4024
	4025	WARN_ON(!cfqd->rq_in_driver);
	4026	WARN_ON(!cfqq->dispatched);
	4027	cfqd->rq_in_driver--;
	4028	cfqq->dispatched--;
	4029	(RQ_CFQG(rq))->dispatched--;
	4030	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
	4031	rq_io_start_time_ns(rq), rq->cmd_flags);
	4032
	4033	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
	4034
	4035	if (sync) {
	4036	struct cfq_rb_root *st;
	4037
	4038	RQ_CIC(rq)->ttime.last_end_request = now;
	4039
	4040	if (cfq_cfqq_on_rr(cfqq))
	4041	st = cfqq->service_tree;
	4042	else
	4043	st = st_for(cfqq->cfqg, cfqq_class(cfqq),
	4044	cfqq_type(cfqq));
	4045
	4046	st->ttime.last_end_request = now;
	4047	if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
	4048	cfqd->last_delayed_sync = now;
	4049	}
	4050
	4051	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	4052	cfqq->cfqg->ttime.last_end_request = now;
	4053	#endif
	4054
	4055	/*
	4056	* If this is the active queue, check if it needs to be expired,
	4057	* or if we want to idle in case it has no pending requests.
	4058	*/
	4059	if (cfqd->active_queue == cfqq) {
	4060	const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
	4061
	4062	if (cfq_cfqq_slice_new(cfqq)) {
	4063	cfq_set_prio_slice(cfqd, cfqq);
	4064	cfq_clear_cfqq_slice_new(cfqq);
	4065	}
	4066
	4067	/*
	4068	* Should we wait for next request to come in before we expire
	4069	* the queue.
	4070	*/
	4071	if (cfq_should_wait_busy(cfqd, cfqq)) {
	4072	unsigned long extend_sl = cfqd->cfq_slice_idle;
	4073	if (!cfqd->cfq_slice_idle)
	4074	extend_sl = cfqd->cfq_group_idle;
	4075	cfqq->slice_end = jiffies + extend_sl;
	4076	cfq_mark_cfqq_wait_busy(cfqq);
	4077	cfq_log_cfqq(cfqd, cfqq, "will busy wait");
	4078	}
	4079
	4080	/*
	4081	* Idling is not enabled on:
	4082	* - expired queues
	4083	* - idle-priority queues
	4084	* - async queues
	4085	* - queues with still some requests queued
	4086	* - when there is a close cooperator
	4087	*/
	4088	if (cfq_slice_used(cfqq) \|\| cfq_class_idle(cfqq))
	4089	cfq_slice_expired(cfqd, 1);
	4090	else if (sync && cfqq_empty &&
	4091	!cfq_close_cooperator(cfqd, cfqq)) {
	4092	cfq_arm_slice_timer(cfqd);
	4093	}
	4094	}
	4095
	4096	if (!cfqd->rq_in_driver)
	4097	cfq_schedule_dispatch(cfqd);
	4098	}
	4099
	4100	static inline int __cfq_may_queue(struct cfq_queue *cfqq)
	4101	{
	4102	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
	4103	cfq_mark_cfqq_must_alloc_slice(cfqq);
	4104	return ELV_MQUEUE_MUST;
	4105	}
	4106
	4107	return ELV_MQUEUE_MAY;
	4108	}
	4109
	4110	static int cfq_may_queue(struct request_queue *q, int rw)
	4111	{
	4112	struct cfq_data *cfqd = q->elevator->elevator_data;
	4113	struct task_struct *tsk = current;
	4114	struct cfq_io_cq *cic;
	4115	struct cfq_queue *cfqq;
	4116
	4117	/*
	4118	* don't force setup of a queue from here, as a call to may_queue
	4119	* does not necessarily imply that a request actually will be queued.
	4120	* so just lookup a possibly existing queue, or return 'may queue'
	4121	* if that fails
	4122	*/
	4123	cic = cfq_cic_lookup(cfqd, tsk->io_context);
	4124	if (!cic)
	4125	return ELV_MQUEUE_MAY;
	4126
	4127	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
	4128	if (cfqq) {
	4129	cfq_init_prio_data(cfqq, cic);
	4130
	4131	return __cfq_may_queue(cfqq);
	4132	}
	4133
	4134	return ELV_MQUEUE_MAY;
	4135	}
	4136
	4137	/*
	4138	* queue lock held here
	4139	*/
	4140	static void cfq_put_request(struct request *rq)
	4141	{
	4142	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	4143
	4144	if (cfqq) {
	4145	const int rw = rq_data_dir(rq);
	4146
	4147	BUG_ON(!cfqq->allocated[rw]);
	4148	cfqq->allocated[rw]--;
	4149
	4150	/* Put down rq reference on cfqg */
	4151	cfqg_put(RQ_CFQG(rq));
	4152	rq->elv.priv[0] = NULL;
	4153	rq->elv.priv[1] = NULL;
	4154
	4155	cfq_put_queue(cfqq);
	4156	}
	4157	}
	4158
	4159	static struct cfq_queue *
	4160	cfq_merge_cfqqs(struct cfq_data cfqd, struct cfq_io_cq cic,
	4161	struct cfq_queue *cfqq)
	4162	{
	4163	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
	4164	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
	4165	cfq_mark_cfqq_coop(cfqq->new_cfqq);
	4166	cfq_put_queue(cfqq);
	4167	return cic_to_cfqq(cic, 1);
	4168	}
	4169
	4170	/*
	4171	* Returns NULL if a new cfqq should be allocated, or the old cfqq if this
	4172	* was the last process referring to said cfqq.
	4173	*/
	4174	static struct cfq_queue *
	4175	split_cfqq(struct cfq_io_cq cic, struct cfq_queue cfqq)
	4176	{
	4177	if (cfqq_process_refs(cfqq) == 1) {
	4178	cfqq->pid = current->pid;
	4179	cfq_clear_cfqq_coop(cfqq);
	4180	cfq_clear_cfqq_split_coop(cfqq);
	4181	return cfqq;
	4182	}
	4183
	4184	cic_set_cfqq(cic, NULL, 1);
	4185
	4186	cfq_put_cooperator(cfqq);
	4187
	4188	cfq_put_queue(cfqq);
	4189	return NULL;
	4190	}
	4191	/*
	4192	* Allocate cfq data structures associated with this request.
	4193	*/
	4194	static int
	4195	cfq_set_request(struct request_queue q, struct request rq, struct bio *bio,
	4196	gfp_t gfp_mask)
	4197	{
	4198	struct cfq_data *cfqd = q->elevator->elevator_data;
	4199	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
	4200	const int rw = rq_data_dir(rq);
	4201	const bool is_sync = rq_is_sync(rq);
	4202	struct cfq_queue *cfqq;
	4203
	4204	might_sleep_if(gfp_mask & __GFP_WAIT);
	4205
	4206	spin_lock_irq(q->queue_lock);
	4207
	4208	check_ioprio_changed(cic, bio);
	4209	check_blkcg_changed(cic, bio);
	4210	new_queue:
	4211	cfqq = cic_to_cfqq(cic, is_sync);
	4212	if (!cfqq \|\| cfqq == &cfqd->oom_cfqq) {
	4213	cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
	4214	cic_set_cfqq(cic, cfqq, is_sync);
	4215	} else {
	4216	/*
	4217	* If the queue was seeky for too long, break it apart.
	4218	*/
	4219	if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
	4220	cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
	4221	cfqq = split_cfqq(cic, cfqq);
	4222	if (!cfqq)
	4223	goto new_queue;
	4224	}
	4225
	4226	/*
	4227	* Check to see if this queue is scheduled to merge with
	4228	* another, closely cooperating queue. The merging of
	4229	* queues happens here as it must be done in process context.
	4230	* The reference on new_cfqq was taken in merge_cfqqs.
	4231	*/
	4232	if (cfqq->new_cfqq)
	4233	cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
	4234	}
	4235
	4236	cfqq->allocated[rw]++;
	4237
	4238	cfqq->ref++;
	4239	cfqg_get(cfqq->cfqg);
	4240	rq->elv.priv[0] = cfqq;
	4241	rq->elv.priv[1] = cfqq->cfqg;
	4242	spin_unlock_irq(q->queue_lock);
	4243	return 0;
	4244	}
	4245
	4246	static void cfq_kick_queue(struct work_struct *work)
	4247	{
	4248	struct cfq_data *cfqd =
	4249	container_of(work, struct cfq_data, unplug_work);
	4250	struct request_queue *q = cfqd->queue;
	4251
	4252	spin_lock_irq(q->queue_lock);
	4253	__blk_run_queue(cfqd->queue);
	4254	spin_unlock_irq(q->queue_lock);
	4255	}
	4256
	4257	/*
	4258	* Timer running if the active_queue is currently idling inside its time slice
	4259	*/
	4260	static void cfq_idle_slice_timer(unsigned long data)
	4261	{
	4262	struct cfq_data cfqd = (struct cfq_data ) data;
	4263	struct cfq_queue *cfqq;
	4264	unsigned long flags;
	4265	int timed_out = 1;
	4266
	4267	cfq_log(cfqd, "idle timer fired");
	4268
	4269	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
	4270
	4271	cfqq = cfqd->active_queue;
	4272	if (cfqq) {
	4273	timed_out = 0;
	4274
	4275	/*
	4276	* We saw a request before the queue expired, let it through
	4277	*/
	4278	if (cfq_cfqq_must_dispatch(cfqq))
	4279	goto out_kick;
	4280
	4281	/*
	4282	* expired
	4283	*/
	4284	if (cfq_slice_used(cfqq))
	4285	goto expire;
	4286
	4287	/*
	4288	* only expire and reinvoke request handler, if there are
	4289	* other queues with pending requests
	4290	*/
	4291	if (!cfqd->busy_queues)
	4292	goto out_cont;
	4293
	4294	/*
	4295	* not expired and it has a request pending, let it dispatch
	4296	*/
	4297	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
	4298	goto out_kick;
	4299
	4300	/*
	4301	* Queue depth flag is reset only when the idle didn't succeed
	4302	*/
	4303	cfq_clear_cfqq_deep(cfqq);
	4304	}
	4305	expire:
	4306	cfq_slice_expired(cfqd, timed_out);
	4307	out_kick:
	4308	cfq_schedule_dispatch(cfqd);
	4309	out_cont:
	4310	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
	4311	}
	4312
	4313	static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
	4314	{
	4315	del_timer_sync(&cfqd->idle_slice_timer);
	4316	cancel_work_sync(&cfqd->unplug_work);
	4317	}
	4318
	4319	static void cfq_put_async_queues(struct cfq_data *cfqd)
	4320	{
	4321	int i;
	4322
	4323	for (i = 0; i < IOPRIO_BE_NR; i++) {
	4324	if (cfqd->async_cfqq[0][i])
	4325	cfq_put_queue(cfqd->async_cfqq[0][i]);
	4326	if (cfqd->async_cfqq[1][i])
	4327	cfq_put_queue(cfqd->async_cfqq[1][i]);
	4328	}
	4329
	4330	if (cfqd->async_idle_cfqq)
	4331	cfq_put_queue(cfqd->async_idle_cfqq);
	4332	}
	4333
	4334	static void cfq_exit_queue(struct elevator_queue *e)
	4335	{
	4336	struct cfq_data *cfqd = e->elevator_data;
	4337	struct request_queue *q = cfqd->queue;
	4338
	4339	cfq_shutdown_timer_wq(cfqd);
	4340
	4341	spin_lock_irq(q->queue_lock);
	4342
	4343	if (cfqd->active_queue)
	4344	__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
	4345
	4346	cfq_put_async_queues(cfqd);
	4347
	4348	spin_unlock_irq(q->queue_lock);
	4349
	4350	cfq_shutdown_timer_wq(cfqd);
	4351
	4352	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	4353	blkcg_deactivate_policy(q, &blkcg_policy_cfq);
	4354	#else
	4355	kfree(cfqd->root_group);
	4356	#endif
	4357	kfree(cfqd);
	4358	}
	4359
	4360	static int cfq_init_queue(struct request_queue q, struct elevator_type e)
	4361	{
	4362	struct cfq_data *cfqd;
	4363	struct blkcg_gq *blkg __maybe_unused;
	4364	int i, ret;
	4365	struct elevator_queue *eq;
	4366
	4367	eq = elevator_alloc(q, e);
	4368	if (!eq)
	4369	return -ENOMEM;
	4370
	4371	cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
	4372	if (!cfqd) {
	4373	kobject_put(&eq->kobj);
	4374	return -ENOMEM;
	4375	}
	4376	eq->elevator_data = cfqd;
	4377
	4378	cfqd->queue = q;
	4379	spin_lock_irq(q->queue_lock);
	4380	q->elevator = eq;
	4381	spin_unlock_irq(q->queue_lock);
	4382
	4383	/* Init root service tree */
	4384	cfqd->grp_service_tree = CFQ_RB_ROOT;
	4385
	4386	/* Init root group and prefer root group over other groups by default */
	4387	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	4388	ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
	4389	if (ret)
	4390	goto out_free;
	4391
	4392	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
	4393	#else
	4394	ret = -ENOMEM;
	4395	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
	4396	GFP_KERNEL, cfqd->queue->node);
	4397	if (!cfqd->root_group)
	4398	goto out_free;
	4399
	4400	cfq_init_cfqg_base(cfqd->root_group);
	4401	#endif
	4402	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
	4403	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
	4404
	4405	/*
	4406	* Not strictly needed (since RB_ROOT just clears the node and we
	4407	* zeroed cfqd on alloc), but better be safe in case someone decides
	4408	* to add magic to the rb code
	4409	*/
	4410	for (i = 0; i < CFQ_PRIO_LISTS; i++)
	4411	cfqd->prio_trees[i] = RB_ROOT;
	4412
	4413	/*
	4414	* Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
	4415	* Grab a permanent reference to it, so that the normal code flow
	4416	* will not attempt to free it. oom_cfqq is linked to root_group
	4417	* but shouldn't hold a reference as it'll never be unlinked. Lose
	4418	* the reference from linking right away.
	4419	*/
	4420	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
	4421	cfqd->oom_cfqq.ref++;
	4422
	4423	spin_lock_irq(q->queue_lock);
	4424	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
	4425	cfqg_put(cfqd->root_group);
	4426	spin_unlock_irq(q->queue_lock);
	4427
	4428	init_timer(&cfqd->idle_slice_timer);
	4429	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
	4430	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
	4431
	4432	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
	4433
	4434	cfqd->cfq_quantum = cfq_quantum;
	4435	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
	4436	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
	4437	cfqd->cfq_back_max = cfq_back_max;
	4438	cfqd->cfq_back_penalty = cfq_back_penalty;
	4439	cfqd->cfq_slice[0] = cfq_slice_async;
	4440	cfqd->cfq_slice[1] = cfq_slice_sync;
	4441	cfqd->cfq_target_latency = cfq_target_latency;
	4442	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
	4443	cfqd->cfq_slice_idle = cfq_slice_idle;
	4444	cfqd->cfq_group_idle = cfq_group_idle;
	4445	cfqd->cfq_latency = 1;
	4446	cfqd->hw_tag = -1;
	4447	/*
	4448	* we optimistically start assuming sync ops weren't delayed in last
	4449	* second, in order to have larger depth for async operations.
	4450	*/
	4451	cfqd->last_delayed_sync = jiffies - HZ;
	4452	return 0;
	4453
	4454	out_free:
	4455	kfree(cfqd);
	4456	kobject_put(&eq->kobj);
	4457	return ret;
	4458	}
	4459
	4460	/*
	4461	* sysfs parts below -->
	4462	*/
	4463	static ssize_t
	4464	cfq_var_show(unsigned int var, char *page)
	4465	{
	4466	return sprintf(page, "%d\n", var);
	4467	}
	4468
	4469	static ssize_t
	4470	cfq_var_store(unsigned int var, const char page, size_t count)
	4471	{
	4472	char p = (char ) page;
	4473
	4474	*var = simple_strtoul(p, &p, 10);
	4475	return count;
	4476	}
	4477
	4478	#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
	4479	static ssize_t __FUNC(struct elevator_queue e, char page) \
	4480	{ \
	4481	struct cfq_data *cfqd = e->elevator_data; \
	4482	unsigned int __data = __VAR; \
	4483	if (__CONV) \
	4484	__data = jiffies_to_msecs(__data); \
	4485	return cfq_var_show(__data, (page)); \
	4486	}
	4487	SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
	4488	SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
	4489	SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
	4490	SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
	4491	SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
	4492	SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
	4493	SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
	4494	SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
	4495	SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
	4496	SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
	4497	SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
	4498	SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
	4499	#undef SHOW_FUNCTION
	4500
	4501	#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
	4502	static ssize_t __FUNC(struct elevator_queue e, const char page, size_t count) \
	4503	{ \
	4504	struct cfq_data *cfqd = e->elevator_data; \
	4505	unsigned int __data; \
	4506	int ret = cfq_var_store(&__data, (page), count); \
	4507	if (__data < (MIN)) \
	4508	__data = (MIN); \
	4509	else if (__data > (MAX)) \
	4510	__data = (MAX); \
	4511	if (__CONV) \
	4512	*(__PTR) = msecs_to_jiffies(__data); \
	4513	else \
	4514	*(__PTR) = __data; \
	4515	return ret; \
	4516	}
	4517	STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
	4518	STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
	4519	UINT_MAX, 1);
	4520	STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
	4521	UINT_MAX, 1);
	4522	STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
	4523	STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
	4524	UINT_MAX, 0);
	4525	STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
	4526	STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
	4527	STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
	4528	STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
	4529	STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
	4530	UINT_MAX, 0);
	4531	STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
	4532	STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
	4533	#undef STORE_FUNCTION
	4534
	4535	#define CFQ_ATTR(name) \
	4536	__ATTR(name, S_IRUGO\|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
	4537
	4538	static struct elv_fs_entry cfq_attrs[] = {
	4539	CFQ_ATTR(quantum),
	4540	CFQ_ATTR(fifo_expire_sync),
	4541	CFQ_ATTR(fifo_expire_async),
	4542	CFQ_ATTR(back_seek_max),
	4543	CFQ_ATTR(back_seek_penalty),
	4544	CFQ_ATTR(slice_sync),
	4545	CFQ_ATTR(slice_async),
	4546	CFQ_ATTR(slice_async_rq),
	4547	CFQ_ATTR(slice_idle),
	4548	CFQ_ATTR(group_idle),
	4549	CFQ_ATTR(low_latency),
	4550	CFQ_ATTR(target_latency),
	4551	__ATTR_NULL
	4552	};
	4553
	4554	static struct elevator_type iosched_cfq = {
	4555	.ops = {
	4556	.elevator_merge_fn = cfq_merge,
	4557	.elevator_merged_fn = cfq_merged_request,
	4558	.elevator_merge_req_fn = cfq_merged_requests,
	4559	.elevator_allow_merge_fn = cfq_allow_merge,
	4560	.elevator_bio_merged_fn = cfq_bio_merged,
	4561	.elevator_dispatch_fn = cfq_dispatch_requests,
	4562	.elevator_add_req_fn = cfq_insert_request,
	4563	.elevator_activate_req_fn = cfq_activate_request,
	4564	.elevator_deactivate_req_fn = cfq_deactivate_request,
	4565	.elevator_completed_req_fn = cfq_completed_request,
	4566	.elevator_former_req_fn = elv_rb_former_request,
	4567	.elevator_latter_req_fn = elv_rb_latter_request,
	4568	.elevator_init_icq_fn = cfq_init_icq,
	4569	.elevator_exit_icq_fn = cfq_exit_icq,
	4570	.elevator_set_req_fn = cfq_set_request,
	4571	.elevator_put_req_fn = cfq_put_request,
	4572	.elevator_may_queue_fn = cfq_may_queue,
	4573	.elevator_init_fn = cfq_init_queue,
	4574	.elevator_exit_fn = cfq_exit_queue,
	4575	},
	4576	.icq_size = sizeof(struct cfq_io_cq),
	4577	.icq_align = __alignof__(struct cfq_io_cq),
	4578	.elevator_attrs = cfq_attrs,
	4579	.elevator_name = "cfq",
	4580	.elevator_owner = THIS_MODULE,
	4581	};
	4582
	4583	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	4584	static struct blkcg_policy blkcg_policy_cfq = {
	4585	.pd_size = sizeof(struct cfq_group),
	4586	.cftypes = cfq_blkcg_files,
	4587
	4588	.pd_init_fn = cfq_pd_init,
	4589	.pd_offline_fn = cfq_pd_offline,
	4590	.pd_reset_stats_fn = cfq_pd_reset_stats,
	4591	};
	4592	#endif
	4593
	4594	static int __init cfq_init(void)
	4595	{
	4596	int ret;
	4597
	4598	/*
	4599	* could be 0 on HZ < 1000 setups
	4600	*/
	4601	if (!cfq_slice_async)
	4602	cfq_slice_async = 1;
	4603	if (!cfq_slice_idle)
	4604	cfq_slice_idle = 1;
	4605
	4606	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	4607	if (!cfq_group_idle)
	4608	cfq_group_idle = 1;
	4609
	4610	ret = blkcg_policy_register(&blkcg_policy_cfq);
	4611	if (ret)
	4612	return ret;
	4613	#else
	4614	cfq_group_idle = 0;
	4615	#endif
	4616
	4617	ret = -ENOMEM;
	4618	cfq_pool = KMEM_CACHE(cfq_queue, 0);
	4619	if (!cfq_pool)
	4620	goto err_pol_unreg;
	4621
	4622	ret = elv_register(&iosched_cfq);
	4623	if (ret)
	4624	goto err_free_pool;
	4625
	4626	return 0;
	4627
	4628	err_free_pool:
	4629	kmem_cache_destroy(cfq_pool);
	4630	err_pol_unreg:
	4631	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	4632	blkcg_policy_unregister(&blkcg_policy_cfq);
	4633	#endif
	4634	return ret;
	4635	}
	4636
	4637	static void __exit cfq_exit(void)
	4638	{
	4639	#ifdef CONFIG_CFQ_GROUP_IOSCHED
	4640	blkcg_policy_unregister(&blkcg_policy_cfq);
	4641	#endif
	4642	elv_unregister(&iosched_cfq);
	4643	kmem_cache_destroy(cfq_pool);
	4644	}
	4645
	4646	module_init(cfq_init);
	4647	module_exit(cfq_exit);
	4648
	4649	MODULE_AUTHOR("Jens Axboe");
	4650	MODULE_LICENSE("GPL");
	4651	MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");