/*
 * QEMU throttling infrastructure
 *
 * Copyright (C) Nodalink, EURL. 2013-2014
 * Copyright (C) Igalia, S.L. 2015
 *
 * Authors:
 *   Benoît Canet <[email protected]>
 *   Alberto Garcia <[email protected]>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 or
 * (at your option) version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/throttle.h"
#include "qemu/timer.h"
#include "block/aio.h"

/* This function makes a bucket leak
 *
 * @bkt:      the bucket to leak
 * @delta_ns: the time delta
 */
void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns)
{
    double leak;

    /* compute how much to leak */
    leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND;

    /* make the bucket leak */
    bkt->level = MAX(bkt->level - leak, 0);

    /* if we allow bursts for more than one second we also need to
     * keep track of bkt->burst_level so that the bkt->max goal per
     * second is attained */
    if (bkt->burst_length > 1) {
        leak = (bkt->max * (double) delta_ns) / NANOSECONDS_PER_SECOND;
        bkt->burst_level = MAX(bkt->burst_level - leak, 0);
    }
}

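/* Worked example (illustrative figures, not from any real configuration):
 * with bkt->avg = 100 units/s and delta_ns = 500000000 (0.5 s),
 *
 *     leak = (100 * 5e8) / 1e9 = 50 units
 *
 * so a bucket sitting at level 80 drops to MAX(80 - 50, 0) = 30.
 */
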
/* Calculate the time delta since the last leak and make proportional leaks
 *
 * @ts:  the throttle state
 * @now: the current timestamp in ns
 */
static void throttle_do_leak(ThrottleState *ts, int64_t now)
{
    /* compute the time elapsed since the last leak */
    int64_t delta_ns = now - ts->previous_leak;
    int i;

    ts->previous_leak = now;

    if (delta_ns <= 0) {
        return;
    }

    /* make each bucket leak */
    for (i = 0; i < BUCKETS_COUNT; i++) {
        throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns);
    }
}

/* do the real job of computing the time to wait
 *
 * @limit: the throttling limit
 * @extra: the number of extra units (bytes or operations) to delay
 * @ret:   the time to wait in ns
 */
static int64_t throttle_do_compute_wait(double limit, double extra)
{
    double wait = extra * NANOSECONDS_PER_SECOND;
    wait /= limit;
    return wait;
}

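/* Worked example (illustrative figures): with limit = 100 units/s and
 * extra = 150 units over the threshold,
 *
 *     wait = 150 * 1e9 / 100 = 1.5e9 ns
 *
 * i.e. the request must be delayed by 1.5 seconds.
 */
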
/* This function computes the wait time in ns that a leaky bucket should
 * impose
 *
 * @bkt: the leaky bucket we operate on
 * @ret: the resulting wait time in ns or 0 if the operation can go through
 */
int64_t throttle_compute_wait(LeakyBucket *bkt)
{
    double extra; /* the number of extra units blocking the io */

    if (!bkt->avg) {
        return 0;
    }

    /* If the bucket is full then we have to wait */
    extra = bkt->level - bkt->max * bkt->burst_length;
    if (extra > 0) {
        return throttle_do_compute_wait(bkt->avg, extra);
    }

    /* If the bucket is not full yet we still have to make sure that we
     * fulfill the goal of bkt->max units per second. */
    if (bkt->burst_length > 1) {
        /* We use 1/10 of the max value to smooth the throttling.
         * See throttle_fix_bucket() for more details. */
        extra = bkt->burst_level - bkt->max / 10;
        if (extra > 0) {
            return throttle_do_compute_wait(bkt->max, extra);
        }
    }

    return 0;
}

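/* Worked example (illustrative figures): with avg = 100, max = 200 and
 * burst_length = 2, requests pass freely until the bucket level exceeds
 * max * burst_length = 400 units; past that point the excess is delayed
 * at the avg rate.  The burst bucket additionally delays at the max rate
 * once burst_level exceeds max / 10 = 20 units.
 */
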
/* This function computes the time that must be waited before this I/O
 * can proceed
 *
 * @ts:       the throttle state
 * @is_write: true if the current IO is a write, false if it's a read
 * @ret:      time to wait in ns
 */
static int64_t throttle_compute_wait_for(ThrottleState *ts,
                                         bool is_write)
{
    BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL,
                                   THROTTLE_OPS_TOTAL,
                                   THROTTLE_BPS_READ,
                                   THROTTLE_OPS_READ},
                                  {THROTTLE_BPS_TOTAL,
                                   THROTTLE_OPS_TOTAL,
                                   THROTTLE_BPS_WRITE,
                                   THROTTLE_OPS_WRITE}, };
    int64_t wait, max_wait = 0;
    int i;

    for (i = 0; i < 4; i++) {
        BucketType index = to_check[is_write][i];
        wait = throttle_compute_wait(&ts->cfg.buckets[index]);
        if (wait > max_wait) {
            max_wait = wait;
        }
    }

    return max_wait;
}

/* compute the timer for this type of operation
 *
 * @ts:             the throttle state
 * @is_write:       the type of operation
 * @now:            the current clock timestamp
 * @next_timestamp: the timestamp at which the timer should fire
 * @ret:            true if a timer must be set
 */
static bool throttle_compute_timer(ThrottleState *ts,
                                   bool is_write,
                                   int64_t now,
                                   int64_t *next_timestamp)
{
    int64_t wait;

    /* leak proportionally to the time elapsed */
    throttle_do_leak(ts, now);

    /* compute the wait time if any */
    wait = throttle_compute_wait_for(ts, is_write);

    /* if the code must wait, compute when the next timer should fire */
    if (wait) {
        *next_timestamp = now + wait;
        return true;
    }

    /* else no need to wait at all */
    *next_timestamp = now;
    return false;
}

/* Add timers to event loop */
void throttle_timers_attach_aio_context(ThrottleTimers *tt,
                                        AioContext *new_context)
{
    tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
                                  tt->read_timer_cb, tt->timer_opaque);
    tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
                                  tt->write_timer_cb, tt->timer_opaque);
}

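/* Sketch of moving the timers to another event loop (the "new_ctx" name
 * is an assumption for illustration):
 *
 *     throttle_timers_detach_aio_context(tt);
 *     throttle_timers_attach_aio_context(tt, new_ctx);
 *
 * throttle_timers_detach_aio_context() is defined further below; the old
 * timers must be detached before attaching to a new context, since attach
 * unconditionally creates fresh timers.
 */
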
/*
 * Initialize the ThrottleConfig structure to a valid state
 * @cfg: the config to initialize
 */
void throttle_config_init(ThrottleConfig *cfg)
{
    unsigned i;
    memset(cfg, 0, sizeof(*cfg));
    for (i = 0; i < BUCKETS_COUNT; i++) {
        cfg->buckets[i].burst_length = 1;
    }
}

/* To be called first on the ThrottleState */
void throttle_init(ThrottleState *ts)
{
    memset(ts, 0, sizeof(ThrottleState));
    throttle_config_init(&ts->cfg);
}

/* To be called first on the ThrottleTimers */
void throttle_timers_init(ThrottleTimers *tt,
                          AioContext *aio_context,
                          QEMUClockType clock_type,
                          QEMUTimerCB *read_timer_cb,
                          QEMUTimerCB *write_timer_cb,
                          void *timer_opaque)
{
    memset(tt, 0, sizeof(ThrottleTimers));

    tt->clock_type = clock_type;
    tt->read_timer_cb = read_timer_cb;
    tt->write_timer_cb = write_timer_cb;
    tt->timer_opaque = timer_opaque;
    throttle_timers_attach_aio_context(tt, aio_context);
}

/* destroy a timer */
static void throttle_timer_destroy(QEMUTimer **timer)
{
    assert(*timer != NULL);

    timer_del(*timer);
    timer_free(*timer);
    *timer = NULL;
}

/* Remove timers from event loop */
void throttle_timers_detach_aio_context(ThrottleTimers *tt)
{
    int i;

    for (i = 0; i < 2; i++) {
        throttle_timer_destroy(&tt->timers[i]);
    }
}

/* To be called last on the ThrottleTimers */
void throttle_timers_destroy(ThrottleTimers *tt)
{
    throttle_timers_detach_aio_context(tt);
}

/* is any throttling timer configured */
bool throttle_timers_are_initialized(ThrottleTimers *tt)
{
    return tt->timers[0] != NULL;
}

/* Check whether any throttling must be done
 *
 * @cfg: the throttling configuration to inspect
 * @ret: true if throttling must be done else false
 */
bool throttle_enabled(ThrottleConfig *cfg)
{
    int i;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        if (cfg->buckets[i].avg > 0) {
            return true;
        }
    }

    return false;
}

/* check if a throttling configuration is valid
 * @cfg:  the throttling configuration to inspect
 * @ret:  true if valid else false
 * @errp: error object
 */
bool throttle_is_valid(ThrottleConfig *cfg, Error **errp)
{
    int i;
    bool bps_flag, ops_flag;
    bool bps_max_flag, ops_max_flag;

    bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg &&
               (cfg->buckets[THROTTLE_BPS_READ].avg ||
                cfg->buckets[THROTTLE_BPS_WRITE].avg);

    ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
               (cfg->buckets[THROTTLE_OPS_READ].avg ||
                cfg->buckets[THROTTLE_OPS_WRITE].avg);

    bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max &&
                   (cfg->buckets[THROTTLE_BPS_READ].max ||
                    cfg->buckets[THROTTLE_BPS_WRITE].max);

    ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max &&
                   (cfg->buckets[THROTTLE_OPS_READ].max ||
                    cfg->buckets[THROTTLE_OPS_WRITE].max);

    if (bps_flag || ops_flag || bps_max_flag || ops_max_flag) {
        error_setg(errp, "bps/iops/max total values and read/write values"
                   " cannot be used at the same time");
        return false;
    }

    if (cfg->op_size &&
        !cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
        !cfg->buckets[THROTTLE_OPS_READ].avg &&
        !cfg->buckets[THROTTLE_OPS_WRITE].avg) {
        error_setg(errp, "iops size requires an iops value to be set");
        return false;
    }

    for (i = 0; i < BUCKETS_COUNT; i++) {
        if (cfg->buckets[i].avg < 0 ||
            cfg->buckets[i].max < 0 ||
            cfg->buckets[i].avg > THROTTLE_VALUE_MAX ||
            cfg->buckets[i].max > THROTTLE_VALUE_MAX) {
            error_setg(errp, "bps/iops/max values must be within [0, %lld]",
                       THROTTLE_VALUE_MAX);
            return false;
        }

        if (!cfg->buckets[i].burst_length) {
            error_setg(errp, "the burst length cannot be 0");
            return false;
        }

        if (cfg->buckets[i].burst_length > 1 && !cfg->buckets[i].max) {
            error_setg(errp, "burst length set without burst rate");
            return false;
        }

        if (cfg->buckets[i].max && !cfg->buckets[i].avg) {
            error_setg(errp, "bps_max/iops_max require corresponding"
                       " bps/iops values");
            return false;
        }

        if (cfg->buckets[i].max && cfg->buckets[i].max < cfg->buckets[i].avg) {
            error_setg(errp, "bps_max/iops_max cannot be lower than bps/iops");
            return false;
        }
    }

    return true;
}

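/* Example (illustrative values): a config that sets both a total and a
 * per-direction limit, e.g.
 *
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1000000;
 *     cfg.buckets[THROTTLE_BPS_READ].avg  = 500000;
 *
 * is rejected by the first check above, while total-only or
 * read/write-only settings are accepted.
 */
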
/* fix bucket parameters */
static void throttle_fix_bucket(LeakyBucket *bkt)
{
    double min;

    /* zero bucket level */
    bkt->level = bkt->burst_level = 0;

    /* The following is done to cope with the Linux CFQ block scheduler,
     * which groups reads and writes into blocks of 100ms in the guest.
     * When there are two processes, one making reads and the other making
     * writes, CFQ produces a pattern looking like this:
     * WWWWWWWWWWWRRRRRRRRRRRRRRWWWWWWWWWWWWWwRRRRRRRRRRRRRRRRR
     * Having a max burst value of 100ms of the average will help smooth
     * the throttling
     */
    min = bkt->avg / 10;
    if (bkt->avg && !bkt->max) {
        bkt->max = min;
    }
}

/* undo internal bucket parameter changes (see throttle_fix_bucket()) */
static void throttle_unfix_bucket(LeakyBucket *bkt)
{
    if (bkt->max < bkt->avg) {
        bkt->max = 0;
    }
}

/* take care of canceling a timer */
static void throttle_cancel_timer(QEMUTimer *timer)
{
    assert(timer != NULL);

    timer_del(timer);
}

/* Used to configure the throttle
 *
 * @ts:  the throttle state we are working on
 * @tt:  the throttle timers we use in this aio context
 * @cfg: the config to set
 */
void throttle_config(ThrottleState *ts,
                     ThrottleTimers *tt,
                     ThrottleConfig *cfg)
{
    int i;

    ts->cfg = *cfg;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        throttle_fix_bucket(&ts->cfg.buckets[i]);
    }

    ts->previous_leak = qemu_clock_get_ns(tt->clock_type);

    for (i = 0; i < 2; i++) {
        throttle_cancel_timer(tt->timers[i]);
    }
}

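/* Typical call sequence (sketch; the callback, context and opaque names
 * are assumptions for illustration, not part of this file):
 *
 *     ThrottleState ts;
 *     ThrottleTimers tt;
 *     ThrottleConfig cfg;
 *     Error *local_err = NULL;
 *
 *     throttle_init(&ts);
 *     throttle_timers_init(&tt, ctx, QEMU_CLOCK_REALTIME,
 *                          read_cb, write_cb, opaque);
 *     throttle_config_init(&cfg);
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024;  // 1 MiB/s
 *     if (throttle_is_valid(&cfg, &local_err)) {
 *         throttle_config(&ts, &tt, &cfg);
 *     }
 */
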
/* Used to get the throttle configuration
 *
 * @ts:  the throttle state we are working on
 * @cfg: the config to fill in
 */
void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg)
{
    int i;

    *cfg = ts->cfg;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        throttle_unfix_bucket(&cfg->buckets[i]);
    }
}

/* Schedule the read or write timer if needed
 *
 * NOTE: this function is not unit tested due to its use of timer_mod
 *
 * @ts:       the throttle state
 * @tt:       the timers structure
 * @is_write: the type of operation (read/write)
 * @ret:      true if the timer has been scheduled else false
 */
bool throttle_schedule_timer(ThrottleState *ts,
                             ThrottleTimers *tt,
                             bool is_write)
{
    int64_t now = qemu_clock_get_ns(tt->clock_type);
    int64_t next_timestamp;
    bool must_wait;

    must_wait = throttle_compute_timer(ts,
                                       is_write,
                                       now,
                                       &next_timestamp);

    /* request not throttled */
    if (!must_wait) {
        return false;
    }

    /* request throttled and timer pending -> do nothing */
    if (timer_pending(tt->timers[is_write])) {
        return true;
    }

    /* request throttled and timer not pending -> arm timer */
    timer_mod(tt->timers[is_write], next_timestamp);
    return true;
}

/* do the accounting for this operation
 *
 * @ts:       the throttle state
 * @is_write: the type of operation (read/write)
 * @size:     the size of the operation in bytes
 */
void throttle_account(ThrottleState *ts, bool is_write, uint64_t size)
{
    const BucketType bucket_types_size[2][2] = {
        { THROTTLE_BPS_TOTAL, THROTTLE_BPS_READ },
        { THROTTLE_BPS_TOTAL, THROTTLE_BPS_WRITE }
    };
    const BucketType bucket_types_units[2][2] = {
        { THROTTLE_OPS_TOTAL, THROTTLE_OPS_READ },
        { THROTTLE_OPS_TOTAL, THROTTLE_OPS_WRITE }
    };
    double units = 1.0;
    unsigned i;

    /* if cfg.op_size is defined and smaller than size we compute unit count */
    if (ts->cfg.op_size && size > ts->cfg.op_size) {
        units = (double) size / ts->cfg.op_size;
    }

    for (i = 0; i < 2; i++) {
        LeakyBucket *bkt;

        bkt = &ts->cfg.buckets[bucket_types_size[is_write][i]];
        bkt->level += size;
        if (bkt->burst_length > 1) {
            bkt->burst_level += size;
        }

        bkt = &ts->cfg.buckets[bucket_types_units[is_write][i]];
        bkt->level += units;
        if (bkt->burst_length > 1) {
            bkt->burst_level += units;
        }
    }
}
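
/* Worked example (illustrative figures): with cfg.op_size = 4096 and a
 * 12288-byte write, units = 12288 / 4096 = 3, so the bps buckets grow
 * by 12288 while the iops buckets grow by 3 instead of 1.
 */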