]>
Commit | Line | Data |
---|---|---|
5ddfffbd BC |
1 | /* |
2 | * QEMU throttling infrastructure | |
3 | * | |
4 | * Copyright (C) Nodalink, SARL. 2013 | |
5 | * | |
6 | * Author: | |
7 | * Benoît Canet <[email protected]> | |
8 | * | |
9 | * This program is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU General Public License as | |
11 | * published by the Free Software Foundation; either version 2 or | |
12 | * (at your option) version 3 of the License. | |
13 | * | |
14 | * This program is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | * GNU General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU General Public License | |
20 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | |
21 | */ | |
22 | ||
23 | #include "qemu/throttle.h" | |
24 | #include "qemu/timer.h" | |
13af91eb | 25 | #include "block/aio.h" |
5ddfffbd BC |
26 | |
27 | /* This function make a bucket leak | |
28 | * | |
29 | * @bkt: the bucket to make leak | |
30 | * @delta_ns: the time delta | |
31 | */ | |
32 | void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns) | |
33 | { | |
34 | double leak; | |
35 | ||
36 | /* compute how much to leak */ | |
37 | leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND; | |
38 | ||
39 | /* make the bucket leak */ | |
40 | bkt->level = MAX(bkt->level - leak, 0); | |
41 | } | |
42 | ||
43 | /* Calculate the time delta since last leak and make proportionals leaks | |
44 | * | |
45 | * @now: the current timestamp in ns | |
46 | */ | |
47 | static void throttle_do_leak(ThrottleState *ts, int64_t now) | |
48 | { | |
49 | /* compute the time elapsed since the last leak */ | |
50 | int64_t delta_ns = now - ts->previous_leak; | |
51 | int i; | |
52 | ||
53 | ts->previous_leak = now; | |
54 | ||
55 | if (delta_ns <= 0) { | |
56 | return; | |
57 | } | |
58 | ||
59 | /* make each bucket leak */ | |
60 | for (i = 0; i < BUCKETS_COUNT; i++) { | |
61 | throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns); | |
62 | } | |
63 | } | |
64 | ||
65 | /* do the real job of computing the time to wait | |
66 | * | |
67 | * @limit: the throttling limit | |
68 | * @extra: the number of operation to delay | |
69 | * @ret: the time to wait in ns | |
70 | */ | |
71 | static int64_t throttle_do_compute_wait(double limit, double extra) | |
72 | { | |
73 | double wait = extra * NANOSECONDS_PER_SECOND; | |
74 | wait /= limit; | |
75 | return wait; | |
76 | } | |
77 | ||
78 | /* This function compute the wait time in ns that a leaky bucket should trigger | |
79 | * | |
80 | * @bkt: the leaky bucket we operate on | |
81 | * @ret: the resulting wait time in ns or 0 if the operation can go through | |
82 | */ | |
83 | int64_t throttle_compute_wait(LeakyBucket *bkt) | |
84 | { | |
85 | double extra; /* the number of extra units blocking the io */ | |
86 | ||
87 | if (!bkt->avg) { | |
88 | return 0; | |
89 | } | |
90 | ||
91 | extra = bkt->level - bkt->max; | |
92 | ||
93 | if (extra <= 0) { | |
94 | return 0; | |
95 | } | |
96 | ||
97 | return throttle_do_compute_wait(bkt->avg, extra); | |
98 | } | |
99 | ||
100 | /* This function compute the time that must be waited while this IO | |
101 | * | |
102 | * @is_write: true if the current IO is a write, false if it's a read | |
103 | * @ret: time to wait | |
104 | */ | |
105 | static int64_t throttle_compute_wait_for(ThrottleState *ts, | |
106 | bool is_write) | |
107 | { | |
108 | BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL, | |
109 | THROTTLE_OPS_TOTAL, | |
110 | THROTTLE_BPS_READ, | |
111 | THROTTLE_OPS_READ}, | |
112 | {THROTTLE_BPS_TOTAL, | |
113 | THROTTLE_OPS_TOTAL, | |
114 | THROTTLE_BPS_WRITE, | |
115 | THROTTLE_OPS_WRITE}, }; | |
116 | int64_t wait, max_wait = 0; | |
117 | int i; | |
118 | ||
119 | for (i = 0; i < 4; i++) { | |
120 | BucketType index = to_check[is_write][i]; | |
121 | wait = throttle_compute_wait(&ts->cfg.buckets[index]); | |
122 | if (wait > max_wait) { | |
123 | max_wait = wait; | |
124 | } | |
125 | } | |
126 | ||
127 | return max_wait; | |
128 | } | |
129 | ||
130 | /* compute the timer for this type of operation | |
131 | * | |
132 | * @is_write: the type of operation | |
133 | * @now: the current clock timestamp | |
134 | * @next_timestamp: the resulting timer | |
135 | * @ret: true if a timer must be set | |
136 | */ | |
137 | bool throttle_compute_timer(ThrottleState *ts, | |
138 | bool is_write, | |
139 | int64_t now, | |
140 | int64_t *next_timestamp) | |
141 | { | |
142 | int64_t wait; | |
143 | ||
144 | /* leak proportionally to the time elapsed */ | |
145 | throttle_do_leak(ts, now); | |
146 | ||
147 | /* compute the wait time if any */ | |
148 | wait = throttle_compute_wait_for(ts, is_write); | |
149 | ||
150 | /* if the code must wait compute when the next timer should fire */ | |
151 | if (wait) { | |
152 | *next_timestamp = now + wait; | |
153 | return true; | |
154 | } | |
155 | ||
156 | /* else no need to wait at all */ | |
157 | *next_timestamp = now; | |
158 | return false; | |
159 | } | |
160 | ||
13af91eb | 161 | /* Add timers to event loop */ |
0e5b0a2d BC |
162 | void throttle_timers_attach_aio_context(ThrottleTimers *tt, |
163 | AioContext *new_context) | |
13af91eb | 164 | { |
0e5b0a2d BC |
165 | tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS, |
166 | tt->read_timer_cb, tt->timer_opaque); | |
167 | tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS, | |
168 | tt->write_timer_cb, tt->timer_opaque); | |
13af91eb SH |
169 | } |
170 | ||
5ddfffbd | 171 | /* To be called first on the ThrottleState */ |
0e5b0a2d | 172 | void throttle_init(ThrottleState *ts) |
5ddfffbd BC |
173 | { |
174 | memset(ts, 0, sizeof(ThrottleState)); | |
0e5b0a2d BC |
175 | } |
176 | ||
177 | /* To be called first on the ThrottleTimers */ | |
178 | void throttle_timers_init(ThrottleTimers *tt, | |
179 | AioContext *aio_context, | |
180 | QEMUClockType clock_type, | |
181 | QEMUTimerCB *read_timer_cb, | |
182 | QEMUTimerCB *write_timer_cb, | |
183 | void *timer_opaque) | |
184 | { | |
185 | memset(tt, 0, sizeof(ThrottleTimers)); | |
5ddfffbd | 186 | |
0e5b0a2d BC |
187 | tt->clock_type = clock_type; |
188 | tt->read_timer_cb = read_timer_cb; | |
189 | tt->write_timer_cb = write_timer_cb; | |
190 | tt->timer_opaque = timer_opaque; | |
191 | throttle_timers_attach_aio_context(tt, aio_context); | |
5ddfffbd BC |
192 | } |
193 | ||
194 | /* destroy a timer */ | |
195 | static void throttle_timer_destroy(QEMUTimer **timer) | |
196 | { | |
197 | assert(*timer != NULL); | |
198 | ||
199 | timer_del(*timer); | |
200 | timer_free(*timer); | |
201 | *timer = NULL; | |
202 | } | |
203 | ||
13af91eb | 204 | /* Remove timers from event loop */ |
0e5b0a2d | 205 | void throttle_timers_detach_aio_context(ThrottleTimers *tt) |
5ddfffbd BC |
206 | { |
207 | int i; | |
208 | ||
209 | for (i = 0; i < 2; i++) { | |
0e5b0a2d | 210 | throttle_timer_destroy(&tt->timers[i]); |
5ddfffbd BC |
211 | } |
212 | } | |
213 | ||
0e5b0a2d BC |
214 | /* To be called last on the ThrottleTimers */ |
215 | void throttle_timers_destroy(ThrottleTimers *tt) | |
13af91eb | 216 | { |
0e5b0a2d | 217 | throttle_timers_detach_aio_context(tt); |
13af91eb SH |
218 | } |
219 | ||
5ddfffbd | 220 | /* is any throttling timer configured */ |
0e5b0a2d | 221 | bool throttle_timers_are_initialized(ThrottleTimers *tt) |
5ddfffbd | 222 | { |
0e5b0a2d | 223 | if (tt->timers[0]) { |
5ddfffbd BC |
224 | return true; |
225 | } | |
226 | ||
227 | return false; | |
228 | } | |
229 | ||
230 | /* Does any throttling must be done | |
231 | * | |
232 | * @cfg: the throttling configuration to inspect | |
233 | * @ret: true if throttling must be done else false | |
234 | */ | |
235 | bool throttle_enabled(ThrottleConfig *cfg) | |
236 | { | |
237 | int i; | |
238 | ||
239 | for (i = 0; i < BUCKETS_COUNT; i++) { | |
240 | if (cfg->buckets[i].avg > 0) { | |
241 | return true; | |
242 | } | |
243 | } | |
244 | ||
245 | return false; | |
246 | } | |
247 | ||
248 | /* return true if any two throttling parameters conflicts | |
249 | * | |
250 | * @cfg: the throttling configuration to inspect | |
251 | * @ret: true if any conflict detected else false | |
252 | */ | |
253 | bool throttle_conflicting(ThrottleConfig *cfg) | |
254 | { | |
255 | bool bps_flag, ops_flag; | |
256 | bool bps_max_flag, ops_max_flag; | |
257 | ||
258 | bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg && | |
259 | (cfg->buckets[THROTTLE_BPS_READ].avg || | |
260 | cfg->buckets[THROTTLE_BPS_WRITE].avg); | |
261 | ||
262 | ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg && | |
263 | (cfg->buckets[THROTTLE_OPS_READ].avg || | |
264 | cfg->buckets[THROTTLE_OPS_WRITE].avg); | |
265 | ||
266 | bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max && | |
267 | (cfg->buckets[THROTTLE_BPS_READ].max || | |
268 | cfg->buckets[THROTTLE_BPS_WRITE].max); | |
269 | ||
270 | ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max && | |
271 | (cfg->buckets[THROTTLE_OPS_READ].max || | |
272 | cfg->buckets[THROTTLE_OPS_WRITE].max); | |
273 | ||
274 | return bps_flag || ops_flag || bps_max_flag || ops_max_flag; | |
275 | } | |
276 | ||
277 | /* check if a throttling configuration is valid | |
278 | * @cfg: the throttling configuration to inspect | |
279 | * @ret: true if valid else false | |
280 | */ | |
281 | bool throttle_is_valid(ThrottleConfig *cfg) | |
282 | { | |
283 | bool invalid = false; | |
284 | int i; | |
285 | ||
286 | for (i = 0; i < BUCKETS_COUNT; i++) { | |
287 | if (cfg->buckets[i].avg < 0) { | |
288 | invalid = true; | |
289 | } | |
290 | } | |
291 | ||
292 | for (i = 0; i < BUCKETS_COUNT; i++) { | |
293 | if (cfg->buckets[i].max < 0) { | |
294 | invalid = true; | |
295 | } | |
296 | } | |
297 | ||
298 | return !invalid; | |
299 | } | |
300 | ||
301 | /* fix bucket parameters */ | |
302 | static void throttle_fix_bucket(LeakyBucket *bkt) | |
303 | { | |
304 | double min; | |
305 | ||
306 | /* zero bucket level */ | |
307 | bkt->level = 0; | |
308 | ||
309 | /* The following is done to cope with the Linux CFQ block scheduler | |
310 | * which regroup reads and writes by block of 100ms in the guest. | |
311 | * When they are two process one making reads and one making writes cfq | |
312 | * make a pattern looking like the following: | |
313 | * WWWWWWWWWWWRRRRRRRRRRRRRRWWWWWWWWWWWWWwRRRRRRRRRRRRRRRRR | |
314 | * Having a max burst value of 100ms of the average will help smooth the | |
315 | * throttling | |
316 | */ | |
317 | min = bkt->avg / 10; | |
318 | if (bkt->avg && !bkt->max) { | |
319 | bkt->max = min; | |
320 | } | |
321 | } | |
322 | ||
323 | /* take care of canceling a timer */ | |
324 | static void throttle_cancel_timer(QEMUTimer *timer) | |
325 | { | |
326 | assert(timer != NULL); | |
327 | ||
328 | timer_del(timer); | |
329 | } | |
330 | ||
331 | /* Used to configure the throttle | |
332 | * | |
333 | * @ts: the throttle state we are working on | |
0e5b0a2d | 334 | * @tt: the throttle timers we use in this aio context |
5ddfffbd BC |
335 | * @cfg: the config to set |
336 | */ | |
0e5b0a2d BC |
337 | void throttle_config(ThrottleState *ts, |
338 | ThrottleTimers *tt, | |
339 | ThrottleConfig *cfg) | |
5ddfffbd BC |
340 | { |
341 | int i; | |
342 | ||
343 | ts->cfg = *cfg; | |
344 | ||
345 | for (i = 0; i < BUCKETS_COUNT; i++) { | |
346 | throttle_fix_bucket(&ts->cfg.buckets[i]); | |
347 | } | |
348 | ||
0e5b0a2d | 349 | ts->previous_leak = qemu_clock_get_ns(tt->clock_type); |
5ddfffbd BC |
350 | |
351 | for (i = 0; i < 2; i++) { | |
0e5b0a2d | 352 | throttle_cancel_timer(tt->timers[i]); |
5ddfffbd BC |
353 | } |
354 | } | |
355 | ||
356 | /* used to get config | |
357 | * | |
358 | * @ts: the throttle state we are working on | |
359 | * @cfg: the config to write | |
360 | */ | |
361 | void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg) | |
362 | { | |
363 | *cfg = ts->cfg; | |
364 | } | |
365 | ||
366 | ||
367 | /* Schedule the read or write timer if needed | |
368 | * | |
369 | * NOTE: this function is not unit tested due to it's usage of timer_mod | |
370 | * | |
0e5b0a2d | 371 | * @tt: the timers structure |
5ddfffbd BC |
372 | * @is_write: the type of operation (read/write) |
373 | * @ret: true if the timer has been scheduled else false | |
374 | */ | |
0e5b0a2d BC |
375 | bool throttle_schedule_timer(ThrottleState *ts, |
376 | ThrottleTimers *tt, | |
377 | bool is_write) | |
5ddfffbd | 378 | { |
0e5b0a2d | 379 | int64_t now = qemu_clock_get_ns(tt->clock_type); |
5ddfffbd BC |
380 | int64_t next_timestamp; |
381 | bool must_wait; | |
382 | ||
383 | must_wait = throttle_compute_timer(ts, | |
384 | is_write, | |
385 | now, | |
386 | &next_timestamp); | |
387 | ||
388 | /* request not throttled */ | |
389 | if (!must_wait) { | |
390 | return false; | |
391 | } | |
392 | ||
393 | /* request throttled and timer pending -> do nothing */ | |
0e5b0a2d | 394 | if (timer_pending(tt->timers[is_write])) { |
5ddfffbd BC |
395 | return true; |
396 | } | |
397 | ||
398 | /* request throttled and timer not pending -> arm timer */ | |
0e5b0a2d | 399 | timer_mod(tt->timers[is_write], next_timestamp); |
5ddfffbd BC |
400 | return true; |
401 | } | |
402 | ||
403 | /* do the accounting for this operation | |
404 | * | |
405 | * @is_write: the type of operation (read/write) | |
406 | * @size: the size of the operation | |
407 | */ | |
408 | void throttle_account(ThrottleState *ts, bool is_write, uint64_t size) | |
409 | { | |
410 | double units = 1.0; | |
411 | ||
412 | /* if cfg.op_size is defined and smaller than size we compute unit count */ | |
413 | if (ts->cfg.op_size && size > ts->cfg.op_size) { | |
414 | units = (double) size / ts->cfg.op_size; | |
415 | } | |
416 | ||
417 | ts->cfg.buckets[THROTTLE_BPS_TOTAL].level += size; | |
418 | ts->cfg.buckets[THROTTLE_OPS_TOTAL].level += units; | |
419 | ||
420 | if (is_write) { | |
421 | ts->cfg.buckets[THROTTLE_BPS_WRITE].level += size; | |
422 | ts->cfg.buckets[THROTTLE_OPS_WRITE].level += units; | |
423 | } else { | |
424 | ts->cfg.buckets[THROTTLE_BPS_READ].level += size; | |
425 | ts->cfg.buckets[THROTTLE_OPS_READ].level += units; | |
426 | } | |
427 | } | |
428 |