block/blk-cgroup.c

   1 /*
   2  * Common Block IO controller cgroup interface
   3  *
   4  * Based on ideas and code from CFQ, CFS and BFQ:
   5  * Copyright (C) 2003 Jens Axboe <[email protected]>
   6  *
   7  * Copyright (C) 2008 Fabio Checconi <[email protected]>
   8  *                    Paolo Valente <[email protected]>
   9  *
  10  * Copyright (C) 2009 Vivek Goyal <[email protected]>
  11  *                    Nauman Rafique <[email protected]>
  12  */
  13 #include <linux/ioprio.h>
  14 #include <linux/seq_file.h>
  15 #include <linux/kdev_t.h>
  16 #include <linux/module.h>
  17 #include <linux/err.h>
  18 #include <linux/blkdev.h>
  19 #include <linux/slab.h>
  20 #include <linux/genhd.h>
  21 #include <linux/delay.h>
  22 #include "blk-cgroup.h"
  23 #include "blk.h"
  24
  25 #define MAX_KEY_LEN 100
  26
  27 static DEFINE_SPINLOCK(blkio_list_lock);
  28 static LIST_HEAD(blkio_list);
  29
  30 static DEFINE_MUTEX(all_q_mutex);
  31 static LIST_HEAD(all_q_list);
  32
  33 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
  34 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
  35
  36 static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
  37
  38 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
  39                                                   struct cgroup *);
  40 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
  41                               struct cgroup_taskset *);
  42 static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
  43                            struct cgroup_taskset *);
  44 static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
  45 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
  46 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
  47
  48 /* for encoding cft->private value on file */
  49 #define BLKIOFILE_PRIVATE(x, val)       (((x) << 16) | (val))
  50 /* What policy owns the file, proportional or throttle */
  51 #define BLKIOFILE_POLICY(val)           (((val) >> 16) & 0xffff)
  52 #define BLKIOFILE_ATTR(val)             ((val) & 0xffff)
  53
  54 struct cgroup_subsys blkio_subsys = {
  55         .name = "blkio",
  56         .create = blkiocg_create,
  57         .can_attach = blkiocg_can_attach,
  58         .attach = blkiocg_attach,
  59         .pre_destroy = blkiocg_pre_destroy,
  60         .destroy = blkiocg_destroy,
  61         .populate = blkiocg_populate,
  62         .subsys_id = blkio_subsys_id,
  63         .module = THIS_MODULE,
  64 };
  65 EXPORT_SYMBOL_GPL(blkio_subsys);
  66
  67 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
  68 {
  69         return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
  70                             struct blkio_cgroup, css);
  71 }
  72 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
  73
  74 struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
  75 {
  76         return container_of(task_subsys_state(tsk, blkio_subsys_id),
  77                             struct blkio_cgroup, css);
  78 }
  79 EXPORT_SYMBOL_GPL(task_blkio_cgroup);
  80
  81 static inline void blkio_update_group_weight(struct blkio_group *blkg,
  82                                              int plid, unsigned int weight)
  83 {
  84         struct blkio_policy_type *blkiop;
  85
  86         list_for_each_entry(blkiop, &blkio_list, list) {
  87                 /* If this policy does not own the blkg, do not send updates */
  88                 if (blkiop->plid != plid)
  89                         continue;
  90                 if (blkiop->ops.blkio_update_group_weight_fn)
  91                         blkiop->ops.blkio_update_group_weight_fn(blkg->q,
  92                                                         blkg, weight);
  93         }
  94 }
  95
  96 static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
  97                                           u64 bps, int fileid)
  98 {
  99         struct blkio_policy_type *blkiop;
 100
 101         list_for_each_entry(blkiop, &blkio_list, list) {
 102
 103                 /* If this policy does not own the blkg, do not send updates */
 104                 if (blkiop->plid != plid)
 105                         continue;
 106
 107                 if (fileid == BLKIO_THROTL_read_bps_device
 108                     && blkiop->ops.blkio_update_group_read_bps_fn)
 109                         blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
 110                                                                 blkg, bps);
 111
 112                 if (fileid == BLKIO_THROTL_write_bps_device
 113                     && blkiop->ops.blkio_update_group_write_bps_fn)
 114                         blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
 115                                                                 blkg, bps);
 116         }
 117 }
 118
 119 static inline void blkio_update_group_iops(struct blkio_group *blkg,
 120                                            int plid, unsigned int iops,
 121                                            int fileid)
 122 {
 123         struct blkio_policy_type *blkiop;
 124
 125         list_for_each_entry(blkiop, &blkio_list, list) {
 126
 127                 /* If this policy does not own the blkg, do not send updates */
 128                 if (blkiop->plid != plid)
 129                         continue;
 130
 131                 if (fileid == BLKIO_THROTL_read_iops_device
 132                     && blkiop->ops.blkio_update_group_read_iops_fn)
 133                         blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
 134                                                                 blkg, iops);
 135
 136                 if (fileid == BLKIO_THROTL_write_iops_device
 137                     && blkiop->ops.blkio_update_group_write_iops_fn)
 138                         blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
 139                                                                 blkg,iops);
 140         }
 141 }
 142
 143 /*
 144  * Add to the appropriate stat variable depending on the request type.
 145  * This should be called with the blkg->stats_lock held.
 146  */
 147 static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
 148                                 bool sync)
 149 {
 150         if (direction)
 151                 stat[BLKIO_STAT_WRITE] += add;
 152         else
 153                 stat[BLKIO_STAT_READ] += add;
 154         if (sync)
 155                 stat[BLKIO_STAT_SYNC] += add;
 156         else
 157                 stat[BLKIO_STAT_ASYNC] += add;
 158 }
 159
 160 /*
 161  * Decrements the appropriate stat variable if non-zero depending on the
 162  * request type. Panics on value being zero.
 163  * This should be called with the blkg->stats_lock held.
 164  */
 165 static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
 166 {
 167         if (direction) {
 168                 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
 169                 stat[BLKIO_STAT_WRITE]--;
 170         } else {
 171                 BUG_ON(stat[BLKIO_STAT_READ] == 0);
 172                 stat[BLKIO_STAT_READ]--;
 173         }
 174         if (sync) {
 175                 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
 176                 stat[BLKIO_STAT_SYNC]--;
 177         } else {
 178                 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
 179                 stat[BLKIO_STAT_ASYNC]--;
 180         }
 181 }
 182
 183 #ifdef CONFIG_DEBUG_BLK_CGROUP
 184 /* This should be called with the blkg->stats_lock held. */
 185 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
 186                                             struct blkio_policy_type *pol,
 187                                             struct blkio_group *curr_blkg)
 188 {
 189         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 190
 191         if (blkio_blkg_waiting(&pd->stats))
 192                 return;
 193         if (blkg == curr_blkg)
 194                 return;
 195         pd->stats.start_group_wait_time = sched_clock();
 196         blkio_mark_blkg_waiting(&pd->stats);
 197 }
 198
 199 /* This should be called with the blkg->stats_lock held. */
 200 static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
 201 {
 202         unsigned long long now;
 203
 204         if (!blkio_blkg_waiting(stats))
 205                 return;
 206
 207         now = sched_clock();
 208         if (time_after64(now, stats->start_group_wait_time))
 209                 stats->group_wait_time += now - stats->start_group_wait_time;
 210         blkio_clear_blkg_waiting(stats);
 211 }
 212
 213 /* This should be called with the blkg->stats_lock held. */
 214 static void blkio_end_empty_time(struct blkio_group_stats *stats)
 215 {
 216         unsigned long long now;
 217
 218         if (!blkio_blkg_empty(stats))
 219                 return;
 220
 221         now = sched_clock();
 222         if (time_after64(now, stats->start_empty_time))
 223                 stats->empty_time += now - stats->start_empty_time;
 224         blkio_clear_blkg_empty(stats);
 225 }
 226
 227 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
 228                                         struct blkio_policy_type *pol)
 229 {
 230         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 231         unsigned long flags;
 232
 233         spin_lock_irqsave(&blkg->stats_lock, flags);
 234         BUG_ON(blkio_blkg_idling(&pd->stats));
 235         pd->stats.start_idle_time = sched_clock();
 236         blkio_mark_blkg_idling(&pd->stats);
 237         spin_unlock_irqrestore(&blkg->stats_lock, flags);
 238 }
 239 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
 240
 241 void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
 242                                     struct blkio_policy_type *pol)
 243 {
 244         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 245         unsigned long flags;
 246         unsigned long long now;
 247         struct blkio_group_stats *stats;
 248
 249         spin_lock_irqsave(&blkg->stats_lock, flags);
 250         stats = &pd->stats;
 251         if (blkio_blkg_idling(stats)) {
 252                 now = sched_clock();
 253                 if (time_after64(now, stats->start_idle_time))
 254                         stats->idle_time += now - stats->start_idle_time;
 255                 blkio_clear_blkg_idling(stats);
 256         }
 257         spin_unlock_irqrestore(&blkg->stats_lock, flags);
 258 }
 259 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
 260
 261 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
 262                                          struct blkio_policy_type *pol)
 263 {
 264         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 265         unsigned long flags;
 266         struct blkio_group_stats *stats;
 267
 268         spin_lock_irqsave(&blkg->stats_lock, flags);
 269         stats = &pd->stats;
 270         stats->avg_queue_size_sum +=
 271                         stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
 272                         stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
 273         stats->avg_queue_size_samples++;
 274         blkio_update_group_wait_time(stats);
 275         spin_unlock_irqrestore(&blkg->stats_lock, flags);
 276 }
 277 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
 278
 279 void blkiocg_set_start_empty_time(struct blkio_group *blkg,
 280                                   struct blkio_policy_type *pol)
 281 {
 282         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 283         unsigned long flags;
 284         struct blkio_group_stats *stats;
 285
 286         spin_lock_irqsave(&blkg->stats_lock, flags);
 287         stats = &pd->stats;
 288
 289         if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
 290                         stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
 291                 spin_unlock_irqrestore(&blkg->stats_lock, flags);
 292                 return;
 293         }
 294
 295         /*
 296          * group is already marked empty. This can happen if cfqq got new
 297          * request in parent group and moved to this group while being added
 298          * to service tree. Just ignore the event and move on.
 299          */
 300         if(blkio_blkg_empty(stats)) {
 301                 spin_unlock_irqrestore(&blkg->stats_lock, flags);
 302                 return;
 303         }
 304
 305         stats->start_empty_time = sched_clock();
 306         blkio_mark_blkg_empty(stats);
 307         spin_unlock_irqrestore(&blkg->stats_lock, flags);
 308 }
 309 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
 310
 311 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 312                                   struct blkio_policy_type *pol,
 313                                   unsigned long dequeue)
 314 {
 315         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 316
 317         pd->stats.dequeue += dequeue;
 318 }
 319 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
 320 #else
 321 static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
 322                                         struct blkio_policy_type *pol,
 323                                         struct blkio_group *curr_blkg) { }
 324 static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
 325 #endif
 326
 327 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
 328                                  struct blkio_policy_type *pol,
 329                                  struct blkio_group *curr_blkg, bool direction,
 330                                  bool sync)
 331 {
 332         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 333         unsigned long flags;
 334
 335         spin_lock_irqsave(&blkg->stats_lock, flags);
 336         blkio_add_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
 337                         sync);
 338         blkio_end_empty_time(&pd->stats);
 339         blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
 340         spin_unlock_irqrestore(&blkg->stats_lock, flags);
 341 }
 342 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
 343
 344 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 345                                     struct blkio_policy_type *pol,
 346                                     bool direction, bool sync)
 347 {
 348         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 349         unsigned long flags;
 350
 351         spin_lock_irqsave(&blkg->stats_lock, flags);
 352         blkio_check_and_dec_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED],
 353                                         direction, sync);
 354         spin_unlock_irqrestore(&blkg->stats_lock, flags);
 355 }
 356 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 357
 358 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 359                                    struct blkio_policy_type *pol,
 360                                    unsigned long time,
 361                                    unsigned long unaccounted_time)
 362 {
 363         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 364         unsigned long flags;
 365
 366         spin_lock_irqsave(&blkg->stats_lock, flags);
 367         pd->stats.time += time;
 368 #ifdef CONFIG_DEBUG_BLK_CGROUP
 369         pd->stats.unaccounted_time += unaccounted_time;
 370 #endif
 371         spin_unlock_irqrestore(&blkg->stats_lock, flags);
 372 }
 373 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 374
 375 /*
 376  * should be called under rcu read lock or queue lock to make sure blkg pointer
 377  * is valid.
 378  */
 379 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 380                                    struct blkio_policy_type *pol,
 381                                    uint64_t bytes, bool direction, bool sync)
 382 {
 383         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 384         struct blkio_group_stats_cpu *stats_cpu;
 385         unsigned long flags;
 386
 387         /*
 388          * Disabling interrupts to provide mutual exclusion between two
 389          * writes on same cpu. It probably is not needed for 64bit. Not
 390          * optimizing that case yet.
 391          */
 392         local_irq_save(flags);
 393
 394         stats_cpu = this_cpu_ptr(pd->stats_cpu);
 395
 396         u64_stats_update_begin(&stats_cpu->syncp);
 397         stats_cpu->sectors += bytes >> 9;
 398         blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
 399                         1, direction, sync);
 400         blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
 401                         bytes, direction, sync);
 402         u64_stats_update_end(&stats_cpu->syncp);
 403         local_irq_restore(flags);
 404 }
 405 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
 406
 407 void blkiocg_update_completion_stats(struct blkio_group *blkg,
 408                                      struct blkio_policy_type *pol,
 409                                      uint64_t start_time,
 410                                      uint64_t io_start_time, bool direction,
 411                                      bool sync)
 412 {
 413         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 414         struct blkio_group_stats *stats;
 415         unsigned long flags;
 416         unsigned long long now = sched_clock();
 417
 418         spin_lock_irqsave(&blkg->stats_lock, flags);
 419         stats = &pd->stats;
 420         if (time_after64(now, io_start_time))
 421                 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
 422                                 now - io_start_time, direction, sync);
 423         if (time_after64(io_start_time, start_time))
 424                 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
 425                                 io_start_time - start_time, direction, sync);
 426         spin_unlock_irqrestore(&blkg->stats_lock, flags);
 427 }
 428 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 429
 430 /*  Merged stats are per cpu.  */
 431 void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
 432                                     struct blkio_policy_type *pol,
 433                                     bool direction, bool sync)
 434 {
 435         struct blkg_policy_data *pd = blkg->pd[pol->plid];
 436         struct blkio_group_stats_cpu *stats_cpu;
 437         unsigned long flags;
 438
 439         /*
 440          * Disabling interrupts to provide mutual exclusion between two
 441          * writes on same cpu. It probably is not needed for 64bit. Not
 442          * optimizing that case yet.
 443          */
 444         local_irq_save(flags);
 445
 446         stats_cpu = this_cpu_ptr(pd->stats_cpu);
 447
 448         u64_stats_update_begin(&stats_cpu->syncp);
 449         blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
 450                                 direction, sync);
 451         u64_stats_update_end(&stats_cpu->syncp);
 452         local_irq_restore(flags);
 453 }
 454 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 455
 456 /**
 457  * blkg_free - free a blkg
 458  * @blkg: blkg to free
 459  *
 460  * Free @blkg which may be partially allocated.
 461  */
 462 static void blkg_free(struct blkio_group *blkg)
 463 {
 464         struct blkg_policy_data *pd;
 465
 466         if (!blkg)
 467                 return;
 468
 469         pd = blkg->pd[blkg->plid];
 470         if (pd) {
 471                 free_percpu(pd->stats_cpu);
 472                 kfree(pd);
 473         }
 474         kfree(blkg);
 475 }
 476
 477 /**
 478  * blkg_alloc - allocate a blkg
 479  * @blkcg: block cgroup the new blkg is associated with
 480  * @q: request_queue the new blkg is associated with
 481  * @pol: policy the new blkg is associated with
 482  *
 483  * Allocate a new blkg assocating @blkcg and @q for @pol.
 484  *
 485  * FIXME: Should be called with queue locked but currently isn't due to
 486  *        percpu stat breakage.
 487  */
 488 static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
 489                                       struct request_queue *q,
 490                                       struct blkio_policy_type *pol)
 491 {
 492         struct blkio_group *blkg;
 493         struct blkg_policy_data *pd;
 494
 495         /* alloc and init base part */
 496         blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
 497         if (!blkg)
 498                 return NULL;
 499
 500         spin_lock_init(&blkg->stats_lock);
 501         rcu_assign_pointer(blkg->q, q);
 502         blkg->blkcg = blkcg;
 503         blkg->plid = pol->plid;
 504         blkg->refcnt = 1;
 505         cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
 506
 507         /* alloc per-policy data and attach it to blkg */
 508         pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
 509                           q->node);
 510         if (!pd) {
 511                 blkg_free(blkg);
 512                 return NULL;
 513         }
 514
 515         blkg->pd[pol->plid] = pd;
 516         pd->blkg = blkg;
 517
 518         /* broken, read comment in the callsite */
 519
 520         pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
 521         if (!pd->stats_cpu) {
 522                 blkg_free(blkg);
 523                 return NULL;
 524         }
 525
 526         /* invoke per-policy init */
 527         pol->ops.blkio_init_group_fn(blkg);
 528         return blkg;
 529 }
 530
 531 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 532                                        struct request_queue *q,
 533                                        enum blkio_policy_id plid,
 534                                        bool for_root)
 535         __releases(q->queue_lock) __acquires(q->queue_lock)
 536 {
 537         struct blkio_policy_type *pol = blkio_policy[plid];
 538         struct blkio_group *blkg, *new_blkg;
 539
 540         WARN_ON_ONCE(!rcu_read_lock_held());
 541         lockdep_assert_held(q->queue_lock);
 542
 543         /*
 544          * This could be the first entry point of blkcg implementation and
 545          * we shouldn't allow anything to go through for a bypassing queue.
 546          * The following can be removed if blkg lookup is guaranteed to
 547          * fail on a bypassing queue.
 548          */
 549         if (unlikely(blk_queue_bypass(q)) && !for_root)
 550                 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
 551
 552         blkg = blkg_lookup(blkcg, q, plid);
 553         if (blkg)
 554                 return blkg;
 555
 556         /* blkg holds a reference to blkcg */
 557         if (!css_tryget(&blkcg->css))
 558                 return ERR_PTR(-EINVAL);
 559
 560         /*
 561          * Allocate and initialize.
 562          *
 563          * FIXME: The following is broken.  Percpu memory allocation
 564          * requires %GFP_KERNEL context and can't be performed from IO
 565          * path.  Allocation here should inherently be atomic and the
 566          * following lock dancing can be removed once the broken percpu
 567          * allocation is fixed.
 568          */
 569         spin_unlock_irq(q->queue_lock);
 570         rcu_read_unlock();
 571
 572         new_blkg = blkg_alloc(blkcg, q, pol);
 573
 574         rcu_read_lock();
 575         spin_lock_irq(q->queue_lock);
 576
 577         /* did bypass get turned on inbetween? */
 578         if (unlikely(blk_queue_bypass(q)) && !for_root) {
 579                 blkg = ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
 580                 goto out;
 581         }
 582
 583         /* did someone beat us to it? */
 584         blkg = blkg_lookup(blkcg, q, plid);
 585         if (unlikely(blkg))
 586                 goto out;
 587
 588         /* did alloc fail? */
 589         if (unlikely(!new_blkg)) {
 590                 blkg = ERR_PTR(-ENOMEM);
 591                 goto out;
 592         }
 593
 594         /* insert */
 595         spin_lock(&blkcg->lock);
 596         swap(blkg, new_blkg);
 597         hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 598         pol->ops.blkio_link_group_fn(q, blkg);
 599         spin_unlock(&blkcg->lock);
 600 out:
 601         blkg_free(new_blkg);
 602         return blkg;
 603 }
 604 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 605
 606 static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 607 {
 608         hlist_del_init_rcu(&blkg->blkcg_node);
 609 }
 610
 611 /*
 612  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
 613  * indicating that blk_group was unhashed by the time we got to it.
 614  */
 615 int blkiocg_del_blkio_group(struct blkio_group *blkg)
 616 {
 617         struct blkio_cgroup *blkcg = blkg->blkcg;
 618         unsigned long flags;
 619         int ret = 1;
 620
 621         spin_lock_irqsave(&blkcg->lock, flags);
 622         if (!hlist_unhashed(&blkg->blkcg_node)) {
 623                 __blkiocg_del_blkio_group(blkg);
 624                 ret = 0;
 625         }
 626         spin_unlock_irqrestore(&blkcg->lock, flags);
 627
 628         return ret;
 629 }
 630 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
 631
 632 /* called under rcu_read_lock(). */
 633 struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 634                                 struct request_queue *q,
 635                                 enum blkio_policy_id plid)
 636 {
 637         struct blkio_group *blkg;
 638         struct hlist_node *n;
 639
 640         hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
 641                 if (blkg->q == q && blkg->plid == plid)
 642                         return blkg;
 643         return NULL;
 644 }
 645 EXPORT_SYMBOL_GPL(blkg_lookup);
 646
 647 void blkg_destroy_all(struct request_queue *q)
 648 {
 649         struct blkio_policy_type *pol;
 650
 651         while (true) {
 652                 bool done = true;
 653
 654                 spin_lock(&blkio_list_lock);
 655                 spin_lock_irq(q->queue_lock);
 656
 657                 /*
 658                  * clear_queue_fn() might return with non-empty group list
 659                  * if it raced cgroup removal and lost.  cgroup removal is
 660                  * guaranteed to make forward progress and retrying after a
 661                  * while is enough.  This ugliness is scheduled to be
 662                  * removed after locking update.
 663                  */
 664                 list_for_each_entry(pol, &blkio_list, list)
 665                         if (!pol->ops.blkio_clear_queue_fn(q))
 666                                 done = false;
 667
 668                 spin_unlock_irq(q->queue_lock);
 669                 spin_unlock(&blkio_list_lock);
 670
 671                 if (done)
 672                         break;
 673
 674                 msleep(10);     /* just some random duration I like */
 675         }
 676 }
 677
 678 static void blkg_rcu_free(struct rcu_head *rcu_head)
 679 {
 680         blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
 681 }
 682
 683 void __blkg_release(struct blkio_group *blkg)
 684 {
 685         /* release the extra blkcg reference this blkg has been holding */
 686         css_put(&blkg->blkcg->css);
 687
 688         /*
 689          * A group is freed in rcu manner. But having an rcu lock does not
 690          * mean that one can access all the fields of blkg and assume these
 691          * are valid. For example, don't try to follow throtl_data and
 692          * request queue links.
 693          *
 694          * Having a reference to blkg under an rcu allows acess to only
 695          * values local to groups like group stats and group rate limits
 696          */
 697         call_rcu(&blkg->rcu_head, blkg_rcu_free);
 698 }
 699 EXPORT_SYMBOL_GPL(__blkg_release);
 700
 701 static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
 702 {
 703         struct blkg_policy_data *pd = blkg->pd[plid];
 704         struct blkio_group_stats_cpu *stats_cpu;
 705         int i, j, k;
 706         /*
 707          * Note: On 64 bit arch this should not be an issue. This has the
 708          * possibility of returning some inconsistent value on 32bit arch
 709          * as 64bit update on 32bit is non atomic. Taking care of this
 710          * corner case makes code very complicated, like sending IPIs to
 711          * cpus, taking care of stats of offline cpus etc.
 712          *
 713          * reset stats is anyway more of a debug feature and this sounds a
 714          * corner case. So I am not complicating the code yet until and
 715          * unless this becomes a real issue.
 716          */
 717         for_each_possible_cpu(i) {
 718                 stats_cpu = per_cpu_ptr(pd->stats_cpu, i);
 719                 stats_cpu->sectors = 0;
 720                 for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
 721                         for (k = 0; k < BLKIO_STAT_TOTAL; k++)
 722                                 stats_cpu->stat_arr_cpu[j][k] = 0;
 723         }
 724 }
 725
 726 static int
 727 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 728 {
 729         struct blkio_cgroup *blkcg;
 730         struct blkio_group *blkg;
 731         struct blkio_group_stats *stats;
 732         struct hlist_node *n;
 733         uint64_t queued[BLKIO_STAT_TOTAL];
 734         int i;
 735 #ifdef CONFIG_DEBUG_BLK_CGROUP
 736         bool idling, waiting, empty;
 737         unsigned long long now = sched_clock();
 738 #endif
 739
 740         blkcg = cgroup_to_blkio_cgroup(cgroup);
 741         spin_lock_irq(&blkcg->lock);
 742         hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 743                 struct blkg_policy_data *pd = blkg->pd[blkg->plid];
 744
 745                 spin_lock(&blkg->stats_lock);
 746                 stats = &pd->stats;
 747 #ifdef CONFIG_DEBUG_BLK_CGROUP
 748                 idling = blkio_blkg_idling(stats);
 749                 waiting = blkio_blkg_waiting(stats);
 750                 empty = blkio_blkg_empty(stats);
 751 #endif
 752                 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
 753                         queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
 754                 memset(stats, 0, sizeof(struct blkio_group_stats));
 755                 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
 756                         stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
 757 #ifdef CONFIG_DEBUG_BLK_CGROUP
 758                 if (idling) {
 759                         blkio_mark_blkg_idling(stats);
 760                         stats->start_idle_time = now;
 761                 }
 762                 if (waiting) {
 763                         blkio_mark_blkg_waiting(stats);
 764                         stats->start_group_wait_time = now;
 765                 }
 766                 if (empty) {
 767                         blkio_mark_blkg_empty(stats);
 768                         stats->start_empty_time = now;
 769                 }
 770 #endif
 771                 spin_unlock(&blkg->stats_lock);
 772
 773                 /* Reset Per cpu stats which don't take blkg->stats_lock */
 774                 blkio_reset_stats_cpu(blkg, blkg->plid);
 775         }
 776
 777         spin_unlock_irq(&blkcg->lock);
 778         return 0;
 779 }
 780
 781 static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
 782                                char *str, int chars_left, bool diskname_only)
 783 {
 784         snprintf(str, chars_left, "%s", dname);
 785         chars_left -= strlen(str);
 786         if (chars_left <= 0) {
 787                 printk(KERN_WARNING
 788                         "Possibly incorrect cgroup stat display format");
 789                 return;
 790         }
 791         if (diskname_only)
 792                 return;
 793         switch (type) {
 794         case BLKIO_STAT_READ:
 795                 strlcat(str, " Read", chars_left);
 796                 break;
 797         case BLKIO_STAT_WRITE:
 798                 strlcat(str, " Write", chars_left);
 799                 break;
 800         case BLKIO_STAT_SYNC:
 801                 strlcat(str, " Sync", chars_left);
 802                 break;
 803         case BLKIO_STAT_ASYNC:
 804                 strlcat(str, " Async", chars_left);
 805                 break;
 806         case BLKIO_STAT_TOTAL:
 807                 strlcat(str, " Total", chars_left);
 808                 break;
 809         default:
 810                 strlcat(str, " Invalid", chars_left);
 811         }
 812 }
 813
 814 static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
 815                                 struct cgroup_map_cb *cb, const char *dname)
 816 {
 817         blkio_get_key_name(0, dname, str, chars_left, true);
 818         cb->fill(cb, str, val);
 819         return val;
 820 }
 821
 822
 823 static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
 824                         enum stat_type_cpu type, enum stat_sub_type sub_type)
 825 {
 826         struct blkg_policy_data *pd = blkg->pd[plid];
 827         int cpu;
 828         struct blkio_group_stats_cpu *stats_cpu;
 829         u64 val = 0, tval;
 830
 831         for_each_possible_cpu(cpu) {
 832                 unsigned int start;
 833                 stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);
 834
 835                 do {
 836                         start = u64_stats_fetch_begin(&stats_cpu->syncp);
 837                         if (type == BLKIO_STAT_CPU_SECTORS)
 838                                 tval = stats_cpu->sectors;
 839                         else
 840                                 tval = stats_cpu->stat_arr_cpu[type][sub_type];
 841                 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
 842
 843                 val += tval;
 844         }
 845
 846         return val;
 847 }
 848
 849 static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
 850                                    struct cgroup_map_cb *cb, const char *dname,
 851                                    enum stat_type_cpu type)
 852 {
 853         uint64_t disk_total, val;
 854         char key_str[MAX_KEY_LEN];
 855         enum stat_sub_type sub_type;
 856
 857         if (type == BLKIO_STAT_CPU_SECTORS) {
 858                 val = blkio_read_stat_cpu(blkg, plid, type, 0);
 859                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb,
 860                                        dname);
 861         }
 862
 863         for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
 864                         sub_type++) {
 865                 blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
 866                                    false);
 867                 val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
 868                 cb->fill(cb, key_str, val);
 869         }
 870
 871         disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
 872                 blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);
 873
 874         blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
 875                            false);
 876         cb->fill(cb, key_str, disk_total);
 877         return disk_total;
 878 }
 879
 880 /* This should be called with blkg->stats_lock held */
 881 static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
 882                                struct cgroup_map_cb *cb, const char *dname,
 883                                enum stat_type type)
 884 {
 885         struct blkg_policy_data *pd = blkg->pd[plid];
 886         uint64_t disk_total;
 887         char key_str[MAX_KEY_LEN];
 888         enum stat_sub_type sub_type;
 889
 890         if (type == BLKIO_STAT_TIME)
 891                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 892                                         pd->stats.time, cb, dname);
 893 #ifdef CONFIG_DEBUG_BLK_CGROUP
 894         if (type == BLKIO_STAT_UNACCOUNTED_TIME)
 895                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 896                                        pd->stats.unaccounted_time, cb, dname);
 897         if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
 898                 uint64_t sum = pd->stats.avg_queue_size_sum;
 899                 uint64_t samples = pd->stats.avg_queue_size_samples;
 900                 if (samples)
 901                         do_div(sum, samples);
 902                 else
 903                         sum = 0;
 904                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 905                                        sum, cb, dname);
 906         }
 907         if (type == BLKIO_STAT_GROUP_WAIT_TIME)
 908                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 909                                        pd->stats.group_wait_time, cb, dname);
 910         if (type == BLKIO_STAT_IDLE_TIME)
 911                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 912                                        pd->stats.idle_time, cb, dname);
 913         if (type == BLKIO_STAT_EMPTY_TIME)
 914                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 915                                        pd->stats.empty_time, cb, dname);
 916         if (type == BLKIO_STAT_DEQUEUE)
 917                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 918                                        pd->stats.dequeue, cb, dname);
 919 #endif
 920
 921         for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
 922                         sub_type++) {
 923                 blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
 924                                    false);
 925                 cb->fill(cb, key_str, pd->stats.stat_arr[type][sub_type]);
 926         }
 927         disk_total = pd->stats.stat_arr[type][BLKIO_STAT_READ] +
 928                         pd->stats.stat_arr[type][BLKIO_STAT_WRITE];
 929         blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
 930                            false);
 931         cb->fill(cb, key_str, disk_total);
 932         return disk_total;
 933 }
 934
 935 static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
 936                                       int fileid, struct blkio_cgroup *blkcg)
 937 {
 938         struct gendisk *disk = NULL;
 939         struct blkio_group *blkg = NULL;
 940         struct blkg_policy_data *pd;
 941         char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 942         unsigned long major, minor;
 943         int i = 0, ret = -EINVAL;
 944         int part;
 945         dev_t dev;
 946         u64 temp;
 947
 948         memset(s, 0, sizeof(s));
 949
 950         while ((p = strsep(&buf, " ")) != NULL) {
 951                 if (!*p)
 952                         continue;
 953
 954                 s[i++] = p;
 955
 956                 /* Prevent from inputing too many things */
 957                 if (i == 3)
 958                         break;
 959         }
 960
 961         if (i != 2)
 962                 goto out;
 963
 964         p = strsep(&s[0], ":");
 965         if (p != NULL)
 966                 major_s = p;
 967         else
 968                 goto out;
 969
 970         minor_s = s[0];
 971         if (!minor_s)
 972                 goto out;
 973
 974         if (strict_strtoul(major_s, 10, &major))
 975                 goto out;
 976
 977         if (strict_strtoul(minor_s, 10, &minor))
 978                 goto out;
 979
 980         dev = MKDEV(major, minor);
 981
 982         if (strict_strtoull(s[1], 10, &temp))
 983                 goto out;
 984
 985         disk = get_gendisk(dev, &part);
 986         if (!disk || part)
 987                 goto out;
 988
 989         rcu_read_lock();
 990
 991         spin_lock_irq(disk->queue->queue_lock);
 992         blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
 993         spin_unlock_irq(disk->queue->queue_lock);
 994
 995         if (IS_ERR(blkg)) {
 996                 ret = PTR_ERR(blkg);
 997                 goto out_unlock;
 998         }
 999
1000         pd = blkg->pd[plid];
1001
1002         switch (plid) {
1003         case BLKIO_POLICY_PROP:
1004                 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
1005                      temp > BLKIO_WEIGHT_MAX)
1006                         goto out_unlock;
1007
1008                 pd->conf.weight = temp;
1009                 blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
1010                 break;
1011         case BLKIO_POLICY_THROTL:
1012                 switch(fileid) {
1013                 case BLKIO_THROTL_read_bps_device:
1014                         pd->conf.bps[READ] = temp;
1015                         blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
1016                         break;
1017                 case BLKIO_THROTL_write_bps_device:
1018                         pd->conf.bps[WRITE] = temp;
1019                         blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
1020                         break;
1021                 case BLKIO_THROTL_read_iops_device:
1022                         if (temp > THROTL_IOPS_MAX)
1023                                 goto out_unlock;
1024                         pd->conf.iops[READ] = temp;
1025                         blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
1026                         break;
1027                 case BLKIO_THROTL_write_iops_device:
1028                         if (temp > THROTL_IOPS_MAX)
1029                                 goto out_unlock;
1030                         pd->conf.iops[WRITE] = temp;
1031                         blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
1032                         break;
1033                 }
1034                 break;
1035         default:
1036                 BUG();
1037         }
1038         ret = 0;
1039 out_unlock:
1040         rcu_read_unlock();
1041 out:
1042         put_disk(disk);
1043
1044         /*
1045          * If queue was bypassing, we should retry.  Do so after a short
1046          * msleep().  It isn't strictly necessary but queue can be
1047          * bypassing for some time and it's always nice to avoid busy
1048          * looping.
1049          */
1050         if (ret == -EBUSY) {
1051                 msleep(10);
1052                 return restart_syscall();
1053         }
1054         return ret;
1055 }
1056
1057 static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1058                                        const char *buffer)
1059 {
1060         int ret = 0;
1061         char *buf;
1062         struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
1063         enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1064         int fileid = BLKIOFILE_ATTR(cft->private);
1065
1066         buf = kstrdup(buffer, GFP_KERNEL);
1067         if (!buf)
1068                 return -ENOMEM;
1069
1070         ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
1071         kfree(buf);
1072         return ret;
1073 }
1074
1075 static const char *blkg_dev_name(struct blkio_group *blkg)
1076 {
1077         /* some drivers (floppy) instantiate a queue w/o disk registered */
1078         if (blkg->q->backing_dev_info.dev)
1079                 return dev_name(blkg->q->backing_dev_info.dev);
1080         return NULL;
1081 }
1082
1083 static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
1084                                    struct seq_file *m)
1085 {
1086         int plid = BLKIOFILE_POLICY(cft->private);
1087         int fileid = BLKIOFILE_ATTR(cft->private);
1088         struct blkg_policy_data *pd = blkg->pd[plid];
1089         const char *dname = blkg_dev_name(blkg);
1090         int rw = WRITE;
1091
1092         if (!dname)
1093                 return;
1094
1095         switch (plid) {
1096                 case BLKIO_POLICY_PROP:
1097                         if (pd->conf.weight)
1098                                 seq_printf(m, "%s\t%u\n",
1099                                            dname, pd->conf.weight);
1100                         break;
1101                 case BLKIO_POLICY_THROTL:
1102                         switch (fileid) {
1103                         case BLKIO_THROTL_read_bps_device:
1104                                 rw = READ;
1105                         case BLKIO_THROTL_write_bps_device:
1106                                 if (pd->conf.bps[rw])
1107                                         seq_printf(m, "%s\t%llu\n",
1108                                                    dname, pd->conf.bps[rw]);
1109                                 break;
1110                         case BLKIO_THROTL_read_iops_device:
1111                                 rw = READ;
1112                         case BLKIO_THROTL_write_iops_device:
1113                                 if (pd->conf.iops[rw])
1114                                         seq_printf(m, "%s\t%u\n",
1115                                                    dname, pd->conf.iops[rw]);
1116                                 break;
1117                         }
1118                         break;
1119                 default:
1120                         BUG();
1121         }
1122 }
1123
1124 /* cgroup files which read their data from policy nodes end up here */
1125 static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
1126                             struct seq_file *m)
1127 {
1128         struct blkio_group *blkg;
1129         struct hlist_node *n;
1130
1131         spin_lock_irq(&blkcg->lock);
1132         hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
1133                 if (BLKIOFILE_POLICY(cft->private) == blkg->plid)
1134                         blkio_print_group_conf(cft, blkg, m);
1135         spin_unlock_irq(&blkcg->lock);
1136 }
1137
1138 static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1139                                 struct seq_file *m)
1140 {
1141         struct blkio_cgroup *blkcg;
1142         enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1143         int name = BLKIOFILE_ATTR(cft->private);
1144
1145         blkcg = cgroup_to_blkio_cgroup(cgrp);
1146
1147         switch(plid) {
1148         case BLKIO_POLICY_PROP:
1149                 switch(name) {
1150                 case BLKIO_PROP_weight_device:
1151                         blkio_read_conf(cft, blkcg, m);
1152                         return 0;
1153                 default:
1154                         BUG();
1155                 }
1156                 break;
1157         case BLKIO_POLICY_THROTL:
1158                 switch(name){
1159                 case BLKIO_THROTL_read_bps_device:
1160                 case BLKIO_THROTL_write_bps_device:
1161                 case BLKIO_THROTL_read_iops_device:
1162                 case BLKIO_THROTL_write_iops_device:
1163                         blkio_read_conf(cft, blkcg, m);
1164                         return 0;
1165                 default:
1166                         BUG();
1167                 }
1168                 break;
1169         default:
1170                 BUG();
1171         }
1172
1173         return 0;
1174 }
1175
1176 static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1177                 struct cftype *cft, struct cgroup_map_cb *cb,
1178                 enum stat_type type, bool show_total, bool pcpu)
1179 {
1180         struct blkio_group *blkg;
1181         struct hlist_node *n;
1182         uint64_t cgroup_total = 0;
1183
1184         rcu_read_lock();
1185         hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1186                 const char *dname = blkg_dev_name(blkg);
1187                 int plid = BLKIOFILE_POLICY(cft->private);
1188
1189                 if (!dname || plid != blkg->plid)
1190                         continue;
1191                 if (pcpu) {
1192                         cgroup_total += blkio_get_stat_cpu(blkg, plid,
1193                                                            cb, dname, type);
1194                 } else {
1195                         spin_lock_irq(&blkg->stats_lock);
1196                         cgroup_total += blkio_get_stat(blkg, plid,
1197                                                        cb, dname, type);
1198                         spin_unlock_irq(&blkg->stats_lock);
1199                 }
1200         }
1201         if (show_total)
1202                 cb->fill(cb, "Total", cgroup_total);
1203         rcu_read_unlock();
1204         return 0;
1205 }
1206
1207 /* All map kind of cgroup file get serviced by this function */
1208 static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1209                                 struct cgroup_map_cb *cb)
1210 {
1211         struct blkio_cgroup *blkcg;
1212         enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1213         int name = BLKIOFILE_ATTR(cft->private);
1214
1215         blkcg = cgroup_to_blkio_cgroup(cgrp);
1216
1217         switch(plid) {
1218         case BLKIO_POLICY_PROP:
1219                 switch(name) {
1220                 case BLKIO_PROP_time:
1221                         return blkio_read_blkg_stats(blkcg, cft, cb,
1222                                                 BLKIO_STAT_TIME, 0, 0);
1223                 case BLKIO_PROP_sectors:
1224                         return blkio_read_blkg_stats(blkcg, cft, cb,
1225                                                 BLKIO_STAT_CPU_SECTORS, 0, 1);
1226                 case BLKIO_PROP_io_service_bytes:
1227                         return blkio_read_blkg_stats(blkcg, cft, cb,
1228                                         BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1229                 case BLKIO_PROP_io_serviced:
1230                         return blkio_read_blkg_stats(blkcg, cft, cb,
1231                                                 BLKIO_STAT_CPU_SERVICED, 1, 1);
1232                 case BLKIO_PROP_io_service_time:
1233                         return blkio_read_blkg_stats(blkcg, cft, cb,
1234                                                 BLKIO_STAT_SERVICE_TIME, 1, 0);
1235                 case BLKIO_PROP_io_wait_time:
1236                         return blkio_read_blkg_stats(blkcg, cft, cb,
1237                                                 BLKIO_STAT_WAIT_TIME, 1, 0);
1238                 case BLKIO_PROP_io_merged:
1239                         return blkio_read_blkg_stats(blkcg, cft, cb,
1240                                                 BLKIO_STAT_CPU_MERGED, 1, 1);
1241                 case BLKIO_PROP_io_queued:
1242                         return blkio_read_blkg_stats(blkcg, cft, cb,
1243                                                 BLKIO_STAT_QUEUED, 1, 0);
1244 #ifdef CONFIG_DEBUG_BLK_CGROUP
1245                 case BLKIO_PROP_unaccounted_time:
1246                         return blkio_read_blkg_stats(blkcg, cft, cb,
1247                                         BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1248                 case BLKIO_PROP_dequeue:
1249                         return blkio_read_blkg_stats(blkcg, cft, cb,
1250                                                 BLKIO_STAT_DEQUEUE, 0, 0);
1251                 case BLKIO_PROP_avg_queue_size:
1252                         return blkio_read_blkg_stats(blkcg, cft, cb,
1253                                         BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1254                 case BLKIO_PROP_group_wait_time:
1255                         return blkio_read_blkg_stats(blkcg, cft, cb,
1256                                         BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1257                 case BLKIO_PROP_idle_time:
1258                         return blkio_read_blkg_stats(blkcg, cft, cb,
1259                                                 BLKIO_STAT_IDLE_TIME, 0, 0);
1260                 case BLKIO_PROP_empty_time:
1261                         return blkio_read_blkg_stats(blkcg, cft, cb,
1262                                                 BLKIO_STAT_EMPTY_TIME, 0, 0);
1263 #endif
1264                 default:
1265                         BUG();
1266                 }
1267                 break;
1268         case BLKIO_POLICY_THROTL:
1269                 switch(name){
1270                 case BLKIO_THROTL_io_service_bytes:
1271                         return blkio_read_blkg_stats(blkcg, cft, cb,
1272                                                 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1273                 case BLKIO_THROTL_io_serviced:
1274                         return blkio_read_blkg_stats(blkcg, cft, cb,
1275                                                 BLKIO_STAT_CPU_SERVICED, 1, 1);
1276                 default:
1277                         BUG();
1278                 }
1279                 break;
1280         default:
1281                 BUG();
1282         }
1283
1284         return 0;
1285 }
1286
1287 static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
1288 {
1289         struct blkio_group *blkg;
1290         struct hlist_node *n;
1291
1292         if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1293                 return -EINVAL;
1294
1295         spin_lock(&blkio_list_lock);
1296         spin_lock_irq(&blkcg->lock);
1297         blkcg->weight = (unsigned int)val;
1298
1299         hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1300                 struct blkg_policy_data *pd = blkg->pd[blkg->plid];
1301
1302                 if (blkg->plid == plid && !pd->conf.weight)
1303                         blkio_update_group_weight(blkg, plid, blkcg->weight);
1304         }
1305
1306         spin_unlock_irq(&blkcg->lock);
1307         spin_unlock(&blkio_list_lock);
1308         return 0;
1309 }
1310
1311 static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
1312         struct blkio_cgroup *blkcg;
1313         enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1314         int name = BLKIOFILE_ATTR(cft->private);
1315
1316         blkcg = cgroup_to_blkio_cgroup(cgrp);
1317
1318         switch(plid) {
1319         case BLKIO_POLICY_PROP:
1320                 switch(name) {
1321                 case BLKIO_PROP_weight:
1322                         return (u64)blkcg->weight;
1323                 }
1324                 break;
1325         default:
1326                 BUG();
1327         }
1328         return 0;
1329 }
1330
1331 static int
1332 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1333 {
1334         struct blkio_cgroup *blkcg;
1335         enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1336         int name = BLKIOFILE_ATTR(cft->private);
1337
1338         blkcg = cgroup_to_blkio_cgroup(cgrp);
1339
1340         switch(plid) {
1341         case BLKIO_POLICY_PROP:
1342                 switch(name) {
1343                 case BLKIO_PROP_weight:
1344                         return blkio_weight_write(blkcg, plid, val);
1345                 }
1346                 break;
1347         default:
1348                 BUG();
1349         }
1350
1351         return 0;
1352 }
1353
1354 struct cftype blkio_files[] = {
1355         {
1356                 .name = "weight_device",
1357                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1358                                 BLKIO_PROP_weight_device),
1359                 .read_seq_string = blkiocg_file_read,
1360                 .write_string = blkiocg_file_write,
1361                 .max_write_len = 256,
1362         },
1363         {
1364                 .name = "weight",
1365                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1366                                 BLKIO_PROP_weight),
1367                 .read_u64 = blkiocg_file_read_u64,
1368                 .write_u64 = blkiocg_file_write_u64,
1369         },
1370         {
1371                 .name = "time",
1372                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1373                                 BLKIO_PROP_time),
1374                 .read_map = blkiocg_file_read_map,
1375         },
1376         {
1377                 .name = "sectors",
1378                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1379                                 BLKIO_PROP_sectors),
1380                 .read_map = blkiocg_file_read_map,
1381         },
1382         {
1383                 .name = "io_service_bytes",
1384                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1385                                 BLKIO_PROP_io_service_bytes),
1386                 .read_map = blkiocg_file_read_map,
1387         },
1388         {
1389                 .name = "io_serviced",
1390                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1391                                 BLKIO_PROP_io_serviced),
1392                 .read_map = blkiocg_file_read_map,
1393         },
1394         {
1395                 .name = "io_service_time",
1396                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1397                                 BLKIO_PROP_io_service_time),
1398                 .read_map = blkiocg_file_read_map,
1399         },
1400         {
1401                 .name = "io_wait_time",
1402                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1403                                 BLKIO_PROP_io_wait_time),
1404                 .read_map = blkiocg_file_read_map,
1405         },
1406         {
1407                 .name = "io_merged",
1408                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1409                                 BLKIO_PROP_io_merged),
1410                 .read_map = blkiocg_file_read_map,
1411         },
1412         {
1413                 .name = "io_queued",
1414                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1415                                 BLKIO_PROP_io_queued),
1416                 .read_map = blkiocg_file_read_map,
1417         },
1418         {
1419                 .name = "reset_stats",
1420                 .write_u64 = blkiocg_reset_stats,
1421         },
1422 #ifdef CONFIG_BLK_DEV_THROTTLING
1423         {
1424                 .name = "throttle.read_bps_device",
1425                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1426                                 BLKIO_THROTL_read_bps_device),
1427                 .read_seq_string = blkiocg_file_read,
1428                 .write_string = blkiocg_file_write,
1429                 .max_write_len = 256,
1430         },
1431
1432         {
1433                 .name = "throttle.write_bps_device",
1434                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1435                                 BLKIO_THROTL_write_bps_device),
1436                 .read_seq_string = blkiocg_file_read,
1437                 .write_string = blkiocg_file_write,
1438                 .max_write_len = 256,
1439         },
1440
1441         {
1442                 .name = "throttle.read_iops_device",
1443                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1444                                 BLKIO_THROTL_read_iops_device),
1445                 .read_seq_string = blkiocg_file_read,
1446                 .write_string = blkiocg_file_write,
1447                 .max_write_len = 256,
1448         },
1449
1450         {
1451                 .name = "throttle.write_iops_device",
1452                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1453                                 BLKIO_THROTL_write_iops_device),
1454                 .read_seq_string = blkiocg_file_read,
1455                 .write_string = blkiocg_file_write,
1456                 .max_write_len = 256,
1457         },
1458         {
1459                 .name = "throttle.io_service_bytes",
1460                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1461                                 BLKIO_THROTL_io_service_bytes),
1462                 .read_map = blkiocg_file_read_map,
1463         },
1464         {
1465                 .name = "throttle.io_serviced",
1466                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1467                                 BLKIO_THROTL_io_serviced),
1468                 .read_map = blkiocg_file_read_map,
1469         },
1470 #endif /* CONFIG_BLK_DEV_THROTTLING */
1471
1472 #ifdef CONFIG_DEBUG_BLK_CGROUP
1473         {
1474                 .name = "avg_queue_size",
1475                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1476                                 BLKIO_PROP_avg_queue_size),
1477                 .read_map = blkiocg_file_read_map,
1478         },
1479         {
1480                 .name = "group_wait_time",
1481                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1482                                 BLKIO_PROP_group_wait_time),
1483                 .read_map = blkiocg_file_read_map,
1484         },
1485         {
1486                 .name = "idle_time",
1487                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1488                                 BLKIO_PROP_idle_time),
1489                 .read_map = blkiocg_file_read_map,
1490         },
1491         {
1492                 .name = "empty_time",
1493                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1494                                 BLKIO_PROP_empty_time),
1495                 .read_map = blkiocg_file_read_map,
1496         },
1497         {
1498                 .name = "dequeue",
1499                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1500                                 BLKIO_PROP_dequeue),
1501                 .read_map = blkiocg_file_read_map,
1502         },
1503         {
1504                 .name = "unaccounted_time",
1505                 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1506                                 BLKIO_PROP_unaccounted_time),
1507                 .read_map = blkiocg_file_read_map,
1508         },
1509 #endif
1510 };
1511
1512 static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1513 {
1514         return cgroup_add_files(cgroup, subsys, blkio_files,
1515                                 ARRAY_SIZE(blkio_files));
1516 }
1517
1518 static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
1519                                struct cgroup *cgroup)
1520 {
1521         struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1522         unsigned long flags;
1523         struct blkio_group *blkg;
1524         struct request_queue *q;
1525         struct blkio_policy_type *blkiop;
1526
1527         rcu_read_lock();
1528
1529         do {
1530                 spin_lock_irqsave(&blkcg->lock, flags);
1531
1532                 if (hlist_empty(&blkcg->blkg_list)) {
1533                         spin_unlock_irqrestore(&blkcg->lock, flags);
1534                         break;
1535                 }
1536
1537                 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1538                                         blkcg_node);
1539                 q = rcu_dereference(blkg->q);
1540                 __blkiocg_del_blkio_group(blkg);
1541
1542                 spin_unlock_irqrestore(&blkcg->lock, flags);
1543
1544                 /*
1545                  * This blkio_group is being unlinked as associated cgroup is
1546                  * going away. Let all the IO controlling policies know about
1547                  * this event.
1548                  */
1549                 spin_lock(&blkio_list_lock);
1550                 list_for_each_entry(blkiop, &blkio_list, list) {
1551                         if (blkiop->plid != blkg->plid)
1552                                 continue;
1553                         blkiop->ops.blkio_unlink_group_fn(q, blkg);
1554                 }
1555                 spin_unlock(&blkio_list_lock);
1556         } while (1);
1557
1558         rcu_read_unlock();
1559
1560         return 0;
1561 }
1562
1563 static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1564 {
1565         struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1566
1567         if (blkcg != &blkio_root_cgroup)
1568                 kfree(blkcg);
1569 }
1570
1571 static struct cgroup_subsys_state *
1572 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1573 {
1574         struct blkio_cgroup *blkcg;
1575         struct cgroup *parent = cgroup->parent;
1576
1577         if (!parent) {
1578                 blkcg = &blkio_root_cgroup;
1579                 goto done;
1580         }
1581
1582         blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1583         if (!blkcg)
1584                 return ERR_PTR(-ENOMEM);
1585
1586         blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1587 done:
1588         spin_lock_init(&blkcg->lock);
1589         INIT_HLIST_HEAD(&blkcg->blkg_list);
1590
1591         return &blkcg->css;
1592 }
1593
1594 /**
1595  * blkcg_init_queue - initialize blkcg part of request queue
1596  * @q: request_queue to initialize
1597  *
1598  * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1599  * part of new request_queue @q.
1600  *
1601  * RETURNS:
1602  * 0 on success, -errno on failure.
1603  */
1604 int blkcg_init_queue(struct request_queue *q)
1605 {
1606         int ret;
1607
1608         might_sleep();
1609
1610         ret = blk_throtl_init(q);
1611         if (ret)
1612                 return ret;
1613
1614         mutex_lock(&all_q_mutex);
1615         INIT_LIST_HEAD(&q->all_q_node);
1616         list_add_tail(&q->all_q_node, &all_q_list);
1617         mutex_unlock(&all_q_mutex);
1618
1619         return 0;
1620 }
1621
1622 /**
1623  * blkcg_drain_queue - drain blkcg part of request_queue
1624  * @q: request_queue to drain
1625  *
1626  * Called from blk_drain_queue().  Responsible for draining blkcg part.
1627  */
1628 void blkcg_drain_queue(struct request_queue *q)
1629 {
1630         lockdep_assert_held(q->queue_lock);
1631
1632         blk_throtl_drain(q);
1633 }
1634
1635 /**
1636  * blkcg_exit_queue - exit and release blkcg part of request_queue
1637  * @q: request_queue being released
1638  *
1639  * Called from blk_release_queue().  Responsible for exiting blkcg part.
1640  */
1641 void blkcg_exit_queue(struct request_queue *q)
1642 {
1643         mutex_lock(&all_q_mutex);
1644         list_del_init(&q->all_q_node);
1645         mutex_unlock(&all_q_mutex);
1646
1647         blk_throtl_exit(q);
1648 }
1649
1650 /*
1651  * We cannot support shared io contexts, as we have no mean to support
1652  * two tasks with the same ioc in two different groups without major rework
1653  * of the main cic data structures.  For now we allow a task to change
1654  * its cgroup only if it's the only owner of its ioc.
1655  */
1656 static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1657                               struct cgroup_taskset *tset)
1658 {
1659         struct task_struct *task;
1660         struct io_context *ioc;
1661         int ret = 0;
1662
1663         /* task_lock() is needed to avoid races with exit_io_context() */
1664         cgroup_taskset_for_each(task, cgrp, tset) {
1665                 task_lock(task);
1666                 ioc = task->io_context;
1667                 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1668                         ret = -EINVAL;
1669                 task_unlock(task);
1670                 if (ret)
1671                         break;
1672         }
1673         return ret;
1674 }
1675
1676 static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1677                            struct cgroup_taskset *tset)
1678 {
1679         struct task_struct *task;
1680         struct io_context *ioc;
1681
1682         cgroup_taskset_for_each(task, cgrp, tset) {
1683                 /* we don't lose anything even if ioc allocation fails */
1684                 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1685                 if (ioc) {
1686                         ioc_cgroup_changed(ioc);
1687                         put_io_context(ioc);
1688                 }
1689         }
1690 }
1691
1692 static void blkcg_bypass_start(void)
1693         __acquires(&all_q_mutex)
1694 {
1695         struct request_queue *q;
1696
1697         mutex_lock(&all_q_mutex);
1698
1699         list_for_each_entry(q, &all_q_list, all_q_node) {
1700                 blk_queue_bypass_start(q);
1701                 blkg_destroy_all(q);
1702         }
1703 }
1704
1705 static void blkcg_bypass_end(void)
1706         __releases(&all_q_mutex)
1707 {
1708         struct request_queue *q;
1709
1710         list_for_each_entry(q, &all_q_list, all_q_node)
1711                 blk_queue_bypass_end(q);
1712
1713         mutex_unlock(&all_q_mutex);
1714 }
1715
1716 void blkio_policy_register(struct blkio_policy_type *blkiop)
1717 {
1718         blkcg_bypass_start();
1719         spin_lock(&blkio_list_lock);
1720
1721         BUG_ON(blkio_policy[blkiop->plid]);
1722         blkio_policy[blkiop->plid] = blkiop;
1723         list_add_tail(&blkiop->list, &blkio_list);
1724
1725         spin_unlock(&blkio_list_lock);
1726         blkcg_bypass_end();
1727 }
1728 EXPORT_SYMBOL_GPL(blkio_policy_register);
1729
1730 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1731 {
1732         blkcg_bypass_start();
1733         spin_lock(&blkio_list_lock);
1734
1735         BUG_ON(blkio_policy[blkiop->plid] != blkiop);
1736         blkio_policy[blkiop->plid] = NULL;
1737         list_del_init(&blkiop->list);
1738
1739         spin_unlock(&blkio_list_lock);
1740         blkcg_bypass_end();
1741 }
1742 EXPORT_SYMBOL_GPL(blkio_policy_unregister);