]> Git Repo - linux.git/blob - net/sched/sch_api.c
fs: Allow listmount() in foreign mount namespace
[linux.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <[email protected]>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <[email protected]> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <[email protected]> :990222: kmod support
11  * Jamal Hadi Salim <[email protected]>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35
36 #include <trace/events/qdisc.h>
37
38 /*
39
40    Short review.
41    -------------
42
43    This file consists of two interrelated parts:
44
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52
53    Qdiscs are divided into two categories:
54    - "queues", which have no internal structure visible from outside.
55    - "schedulers", which split all the packets into "traffic classes",
56      using "packet classifiers" (see cls_api.c)
57
58    In turn, classes may have child qdiscs (as a rule, queues)
59    attached to them, and so on recursively.
60
61    The goal of the routines in this file is to translate
62    information supplied by the user in the form of handles
63    into a form more intelligible to the kernel, to perform sanity
64    checks and the part of the work that is common to all qdiscs,
65    and to provide rtnetlink notifications.
66
67    All real intelligent work is done inside qdisc modules.
68
69
70
71    Every discipline has two major routines: enqueue and dequeue.
72
73    ---dequeue
74
75    dequeue usually returns an skb to send. It is allowed to return NULL,
76    but that does not mean that the queue is empty; it just means that
77    the discipline does not want to send anything this time.
78    The queue is really empty only if q->q.qlen == 0.
79    For complicated disciplines with multiple queues, q->q is not
80    a real packet queue, but q->q.qlen must nevertheless be valid.
81
82    ---enqueue
83
84    enqueue returns 0 if the packet was enqueued successfully.
85    If a packet (this one or another one) was dropped, it returns
86    a non-zero error code.
87    NET_XMIT_DROP        - this packet was dropped
88      Expected action: do not back off, but wait until the queue clears.
89    NET_XMIT_CN          - this packet was probably enqueued, but another one was dropped.
90      Expected action: back off or ignore
91
92    Auxiliary routines:
93
94    ---peek
95
96    like dequeue but without removing a packet from the queue
97
98    ---reset
99
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102
103    ---init
104
105    initializes newly created qdisc.
106
107    ---destroy
108
109    destroys resources allocated by init and during lifetime of qdisc.
110
111    ---change
112
113    changes qdisc parameters.
114  */
115
116 /* Protects the list of registered TC modules. It is a pure SMP lock:
 * code in this file that modifies the list or the default qdisc takes
 * it with write_lock(), read-only lookups take it with read_lock().
 */
117 static DEFINE_RWLOCK(qdisc_mod_lock);
118
119
120 /************************************************
121  *      Queueing disciplines manipulation.      *
122  ************************************************/
123
124
125 /* Head of the list of all installed queueing disciplines. Entries are
 * chained through their ->next pointer; NULL means the list is empty.
 */
126
127 static struct Qdisc_ops *qdisc_base;
128
129 /* Register/unregister queueing discipline */
130
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133         struct Qdisc_ops *q, **qp;
134         int rc = -EEXIST;
135
136         write_lock(&qdisc_mod_lock);
137         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138                 if (!strcmp(qops->id, q->id))
139                         goto out;
140
141         if (qops->enqueue == NULL)
142                 qops->enqueue = noop_qdisc_ops.enqueue;
143         if (qops->peek == NULL) {
144                 if (qops->dequeue == NULL)
145                         qops->peek = noop_qdisc_ops.peek;
146                 else
147                         goto out_einval;
148         }
149         if (qops->dequeue == NULL)
150                 qops->dequeue = noop_qdisc_ops.dequeue;
151
152         if (qops->cl_ops) {
153                 const struct Qdisc_class_ops *cops = qops->cl_ops;
154
155                 if (!(cops->find && cops->walk && cops->leaf))
156                         goto out_einval;
157
158                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159                         goto out_einval;
160         }
161
162         qops->next = NULL;
163         *qp = qops;
164         rc = 0;
165 out:
166         write_unlock(&qdisc_mod_lock);
167         return rc;
168
169 out_einval:
170         rc = -EINVAL;
171         goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177         struct Qdisc_ops *q, **qp;
178         int err = -ENOENT;
179
180         write_lock(&qdisc_mod_lock);
181         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182                 if (q == qops)
183                         break;
184         if (q) {
185                 *qp = q->next;
186                 q->next = NULL;
187                 err = 0;
188         }
189         write_unlock(&qdisc_mod_lock);
190
191         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strscpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module(NET_SCH_ALIAS_PREFIX "%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named in the kernel config. */
static int __init sch_default_qdisc(void)
{
	int err;

	err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
	return err;
}
late_initcall(sch_default_qdisc);
#endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273                                    lockdep_rtnl_is_held()) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
313                         handle);
314 out:
315         return q;
316 }
317
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320         struct netdev_queue *nq;
321         struct Qdisc *q;
322
323         if (!handle)
324                 return NULL;
325         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326         if (q)
327                 goto out;
328
329         nq = dev_ingress_queue_rcu(dev);
330         if (nq)
331                 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
332                                           handle);
333 out:
334         return q;
335 }
336
337 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
338 {
339         unsigned long cl;
340         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
341
342         if (cops == NULL)
343                 return NULL;
344         cl = cops->find(p, classid);
345
346         if (cl == 0)
347                 return NULL;
348         return cops->leaf(p, cl);
349 }
350
351 /* Find queueing discipline by name */
352
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355         struct Qdisc_ops *q = NULL;
356
357         if (kind) {
358                 read_lock(&qdisc_mod_lock);
359                 for (q = qdisc_base; q; q = q->next) {
360                         if (nla_strcmp(kind, q->id) == 0) {
361                                 if (!try_module_get(q->owner))
362                                         q = NULL;
363                                 break;
364                         }
365                 }
366                 read_unlock(&qdisc_mod_lock);
367         }
368         return q;
369 }
370
371 /* The linklayer setting was not transferred from iproute2 in older
372  * versions, and the rate table lookup systems have been dropped in
373  * the kernel. To stay backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting whether the rate
375  * table was modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, then
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390         int low       = roundup(r->mpu, 48);
391         int high      = roundup(low+1, 48);
392         int cell_low  = low >> r->cell_log;
393         int cell_high = (high >> r->cell_log) - 1;
394
395         /* rtab is too inaccurate at rates > 100Mbit/s */
396         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397                 pr_debug("TC linklayer: Giving up ATM detection\n");
398                 return TC_LINKLAYER_ETHERNET;
399         }
400
401         if ((cell_high > cell_low) && (cell_high < 256)
402             && (rtab[cell_low] == rtab[cell_high])) {
403                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404                          cell_low, cell_high, rtab[cell_high]);
405                 return TC_LINKLAYER_ATM;
406         }
407         return TC_LINKLAYER_ETHERNET;
408 }
409
/* Head of the ->next-linked list of reference-counted rate tables that
 * qdisc_get_rtab() shares between users and qdisc_put_rtab() releases.
 */
410 static struct qdisc_rate_table *qdisc_rtab_list;
411
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413                                         struct nlattr *tab,
414                                         struct netlink_ext_ack *extack)
415 {
416         struct qdisc_rate_table *rtab;
417
418         if (tab == NULL || r->rate == 0 ||
419             r->cell_log == 0 || r->cell_log >= 32 ||
420             nla_len(tab) != TC_RTAB_SIZE) {
421                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
422                 return NULL;
423         }
424
425         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
426                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
427                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
428                         rtab->refcnt++;
429                         return rtab;
430                 }
431         }
432
433         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
434         if (rtab) {
435                 rtab->rate = *r;
436                 rtab->refcnt = 1;
437                 memcpy(rtab->data, nla_data(tab), 1024);
438                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
439                         r->linklayer = __detect_linklayer(r, rtab->data);
440                 rtab->next = qdisc_rtab_list;
441                 qdisc_rtab_list = rtab;
442         } else {
443                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
444         }
445         return rtab;
446 }
447 EXPORT_SYMBOL(qdisc_get_rtab);
448
449 void qdisc_put_rtab(struct qdisc_rate_table *tab)
450 {
451         struct qdisc_rate_table *rtab, **rtabp;
452
453         if (!tab || --tab->refcnt)
454                 return;
455
456         for (rtabp = &qdisc_rtab_list;
457              (rtab = *rtabp) != NULL;
458              rtabp = &rtab->next) {
459                 if (rtab == tab) {
460                         *rtabp = rtab->next;
461                         kfree(rtab);
462                         return;
463                 }
464         }
465 }
466 EXPORT_SYMBOL(qdisc_put_rtab);
467
/* List of size tables currently in use; entries are shared and
 * reference counted by qdisc_get_stab() / qdisc_put_stab().
 */
468 static LIST_HEAD(qdisc_stab_list);
469
/* Netlink policy for the attributes nested inside a size table option:
 * a tc_sizespec header (TCA_STAB_BASE) and the binary table payload
 * (TCA_STAB_DATA).
 */
470 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
471         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
472         [TCA_STAB_DATA] = { .type = NLA_BINARY },
473 };
474
475 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
476                                                struct netlink_ext_ack *extack)
477 {
478         struct nlattr *tb[TCA_STAB_MAX + 1];
479         struct qdisc_size_table *stab;
480         struct tc_sizespec *s;
481         unsigned int tsize = 0;
482         u16 *tab = NULL;
483         int err;
484
485         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
486                                           extack);
487         if (err < 0)
488                 return ERR_PTR(err);
489         if (!tb[TCA_STAB_BASE]) {
490                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491                 return ERR_PTR(-EINVAL);
492         }
493
494         s = nla_data(tb[TCA_STAB_BASE]);
495
496         if (s->tsize > 0) {
497                 if (!tb[TCA_STAB_DATA]) {
498                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499                         return ERR_PTR(-EINVAL);
500                 }
501                 tab = nla_data(tb[TCA_STAB_DATA]);
502                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
503         }
504
505         if (tsize != s->tsize || (!tab && tsize > 0)) {
506                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
507                 return ERR_PTR(-EINVAL);
508         }
509
510         list_for_each_entry(stab, &qdisc_stab_list, list) {
511                 if (memcmp(&stab->szopts, s, sizeof(*s)))
512                         continue;
513                 if (tsize > 0 &&
514                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
515                         continue;
516                 stab->refcnt++;
517                 return stab;
518         }
519
520         if (s->size_log > STAB_SIZE_LOG_MAX ||
521             s->cell_log > STAB_SIZE_LOG_MAX) {
522                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
523                 return ERR_PTR(-EINVAL);
524         }
525
526         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
527         if (!stab)
528                 return ERR_PTR(-ENOMEM);
529
530         stab->refcnt = 1;
531         stab->szopts = *s;
532         if (tsize > 0)
533                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
534
535         list_add_tail(&stab->list, &qdisc_stab_list);
536
537         return stab;
538 }
539
540 void qdisc_put_stab(struct qdisc_size_table *tab)
541 {
542         if (!tab)
543                 return;
544
545         if (--tab->refcnt == 0) {
546                 list_del(&tab->list);
547                 kfree_rcu(tab, rcu);
548         }
549 }
550 EXPORT_SYMBOL(qdisc_put_stab);
551
552 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
553 {
554         struct nlattr *nest;
555
556         nest = nla_nest_start_noflag(skb, TCA_STAB);
557         if (nest == NULL)
558                 goto nla_put_failure;
559         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
560                 goto nla_put_failure;
561         nla_nest_end(skb, nest);
562
563         return skb->len;
564
565 nla_put_failure:
566         return -1;
567 }
568
569 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
570                                const struct qdisc_size_table *stab)
571 {
572         int pkt_len, slot;
573
574         pkt_len = skb->len + stab->szopts.overhead;
575         if (unlikely(!stab->szopts.tsize))
576                 goto out;
577
578         slot = pkt_len + stab->szopts.cell_align;
579         if (unlikely(slot < 0))
580                 slot = 0;
581
582         slot >>= stab->szopts.cell_log;
583         if (likely(slot < stab->szopts.tsize))
584                 pkt_len = stab->data[slot];
585         else
586                 pkt_len = stab->data[stab->szopts.tsize - 1] *
587                                 (slot / stab->szopts.tsize) +
588                                 stab->data[slot % stab->szopts.tsize];
589
590         pkt_len <<= stab->szopts.size_log;
591 out:
592         if (unlikely(pkt_len < 1))
593                 pkt_len = 1;
594         qdisc_skb_cb(skb)->pkt_len = pkt_len;
595 }
596 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
597
598 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
599 {
600         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
601                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
602                         txt, qdisc->ops->id, qdisc->handle >> 16);
603                 qdisc->flags |= TCQ_F_WARN_NONWC;
604         }
605 }
606 EXPORT_SYMBOL(qdisc_warn_nonwc);
607
608 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
609 {
610         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
611                                                  timer);
612
613         rcu_read_lock();
614         __netif_schedule(qdisc_root(wd->qdisc));
615         rcu_read_unlock();
616
617         return HRTIMER_NORESTART;
618 }
619
620 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
621                                  clockid_t clockid)
622 {
623         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
624         wd->timer.function = qdisc_watchdog;
625         wd->qdisc = qdisc;
626 }
627 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
628
629 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
630 {
631         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
632 }
633 EXPORT_SYMBOL(qdisc_watchdog_init);
634
635 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
636                                       u64 delta_ns)
637 {
638         bool deactivated;
639
640         rcu_read_lock();
641         deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
642                                &qdisc_root_sleeping(wd->qdisc)->state);
643         rcu_read_unlock();
644         if (deactivated)
645                 return;
646
647         if (hrtimer_is_queued(&wd->timer)) {
648                 u64 softexpires;
649
650                 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
651                 /* If timer is already set in [expires, expires + delta_ns],
652                  * do not reprogram it.
653                  */
654                 if (softexpires - expires <= delta_ns)
655                         return;
656         }
657
658         hrtimer_start_range_ns(&wd->timer,
659                                ns_to_ktime(expires),
660                                delta_ns,
661                                HRTIMER_MODE_ABS_PINNED);
662 }
663 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
664
665 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
666 {
667         hrtimer_cancel(&wd->timer);
668 }
669 EXPORT_SYMBOL(qdisc_watchdog_cancel);
670
671 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
672 {
673         struct hlist_head *h;
674         unsigned int i;
675
676         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
677
678         if (h != NULL) {
679                 for (i = 0; i < n; i++)
680                         INIT_HLIST_HEAD(&h[i]);
681         }
682         return h;
683 }
684
685 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
686 {
687         struct Qdisc_class_common *cl;
688         struct hlist_node *next;
689         struct hlist_head *nhash, *ohash;
690         unsigned int nsize, nmask, osize;
691         unsigned int i, h;
692
693         /* Rehash when load factor exceeds 0.75 */
694         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
695                 return;
696         nsize = clhash->hashsize * 2;
697         nmask = nsize - 1;
698         nhash = qdisc_class_hash_alloc(nsize);
699         if (nhash == NULL)
700                 return;
701
702         ohash = clhash->hash;
703         osize = clhash->hashsize;
704
705         sch_tree_lock(sch);
706         for (i = 0; i < osize; i++) {
707                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
708                         h = qdisc_class_hash(cl->classid, nmask);
709                         hlist_add_head(&cl->hnode, &nhash[h]);
710                 }
711         }
712         clhash->hash     = nhash;
713         clhash->hashsize = nsize;
714         clhash->hashmask = nmask;
715         sch_tree_unlock(sch);
716
717         kvfree(ohash);
718 }
719 EXPORT_SYMBOL(qdisc_class_hash_grow);
720
721 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
722 {
723         unsigned int size = 4;
724
725         clhash->hash = qdisc_class_hash_alloc(size);
726         if (!clhash->hash)
727                 return -ENOMEM;
728         clhash->hashsize  = size;
729         clhash->hashmask  = size - 1;
730         clhash->hashelems = 0;
731         return 0;
732 }
733 EXPORT_SYMBOL(qdisc_class_hash_init);
734
735 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
736 {
737         kvfree(clhash->hash);
738 }
739 EXPORT_SYMBOL(qdisc_class_hash_destroy);
740
741 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
742                              struct Qdisc_class_common *cl)
743 {
744         unsigned int h;
745
746         INIT_HLIST_NODE(&cl->hnode);
747         h = qdisc_class_hash(cl->classid, clhash->hashmask);
748         hlist_add_head(&cl->hnode, &clhash->hash[h]);
749         clhash->hashelems++;
750 }
751 EXPORT_SYMBOL(qdisc_class_hash_insert);
752
753 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
754                              struct Qdisc_class_common *cl)
755 {
756         hlist_del(&cl->hnode);
757         clhash->hashelems--;
758 }
759 EXPORT_SYMBOL(qdisc_class_hash_remove);
760
761 /* Allocate an unique handle from space managed by kernel
762  * Possible range is [8000-FFFF]:0000 (0x8000 values)
763  */
764 static u32 qdisc_alloc_handle(struct net_device *dev)
765 {
766         int i = 0x8000;
767         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
768
769         do {
770                 autohandle += TC_H_MAKE(0x10000U, 0);
771                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
772                         autohandle = TC_H_MAKE(0x80000000U, 0);
773                 if (!qdisc_lookup(dev, autohandle))
774                         return autohandle;
775                 cond_resched();
776         } while (--i > 0);
777
778         return 0;
779 }
780
781 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
782 {
783         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
784         const struct Qdisc_class_ops *cops;
785         unsigned long cl;
786         u32 parentid;
787         bool notify;
788         int drops;
789
790         if (n == 0 && len == 0)
791                 return;
792         drops = max_t(int, n, 0);
793         rcu_read_lock();
794         while ((parentid = sch->parent)) {
795                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
796                         break;
797
798                 if (sch->flags & TCQ_F_NOPARENT)
799                         break;
800                 /* Notify parent qdisc only if child qdisc becomes empty.
801                  *
802                  * If child was empty even before update then backlog
803                  * counter is screwed and we skip notification because
804                  * parent class is already passive.
805                  *
806                  * If the original child was offloaded then it is allowed
807                  * to be seem as empty, so the parent is notified anyway.
808                  */
809                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
810                                                        !qdisc_is_offloaded);
811                 /* TODO: perform the search on a per txq basis */
812                 sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
813                 if (sch == NULL) {
814                         WARN_ON_ONCE(parentid != TC_H_ROOT);
815                         break;
816                 }
817                 cops = sch->ops->cl_ops;
818                 if (notify && cops->qlen_notify) {
819                         cl = cops->find(sch, parentid);
820                         cops->qlen_notify(sch, cl);
821                 }
822                 sch->q.qlen -= n;
823                 sch->qstats.backlog -= len;
824                 __qdisc_qstats_drop(sch, drops);
825         }
826         rcu_read_unlock();
827 }
828 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
829
830 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
831                               void *type_data)
832 {
833         struct net_device *dev = qdisc_dev(sch);
834         int err;
835
836         sch->flags &= ~TCQ_F_OFFLOADED;
837         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
838                 return 0;
839
840         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
841         if (err == -EOPNOTSUPP)
842                 return 0;
843
844         if (!err)
845                 sch->flags |= TCQ_F_OFFLOADED;
846
847         return err;
848 }
849 EXPORT_SYMBOL(qdisc_offload_dump_helper);
850
851 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
852                                 struct Qdisc *new, struct Qdisc *old,
853                                 enum tc_setup_type type, void *type_data,
854                                 struct netlink_ext_ack *extack)
855 {
856         bool any_qdisc_is_offloaded;
857         int err;
858
859         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
860                 return;
861
862         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
863
864         /* Don't report error if the graft is part of destroy operation. */
865         if (!err || !new || new == &noop_qdisc)
866                 return;
867
868         /* Don't report error if the parent, the old child and the new
869          * one are not offloaded.
870          */
871         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
872         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
873         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
874
875         if (any_qdisc_is_offloaded)
876                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
877 }
878 EXPORT_SYMBOL(qdisc_offload_graft_helper);
879
880 void qdisc_offload_query_caps(struct net_device *dev,
881                               enum tc_setup_type type,
882                               void *caps, size_t caps_len)
883 {
884         const struct net_device_ops *ops = dev->netdev_ops;
885         struct tc_query_caps_base base = {
886                 .type = type,
887                 .caps = caps,
888         };
889
890         memset(caps, 0, caps_len);
891
892         if (ops->ndo_setup_tc)
893                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
894 }
895 EXPORT_SYMBOL(qdisc_offload_query_caps);
896
897 static void qdisc_offload_graft_root(struct net_device *dev,
898                                      struct Qdisc *new, struct Qdisc *old,
899                                      struct netlink_ext_ack *extack)
900 {
901         struct tc_root_qopt_offload graft_offload = {
902                 .command        = TC_ROOT_GRAFT,
903                 .handle         = new ? new->handle : 0,
904                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
905                                   (old && old->flags & TCQ_F_INGRESS),
906         };
907
908         qdisc_offload_graft_helper(dev, NULL, new, old,
909                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
910 }
911
912 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
913                          u32 portid, u32 seq, u16 flags, int event,
914                          struct netlink_ext_ack *extack)
915 {
916         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
917         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
918         struct tcmsg *tcm;
919         struct nlmsghdr  *nlh;
920         unsigned char *b = skb_tail_pointer(skb);
921         struct gnet_dump d;
922         struct qdisc_size_table *stab;
923         u32 block_index;
924         __u32 qlen;
925
926         cond_resched();
927         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
928         if (!nlh)
929                 goto out_nlmsg_trim;
930         tcm = nlmsg_data(nlh);
931         tcm->tcm_family = AF_UNSPEC;
932         tcm->tcm__pad1 = 0;
933         tcm->tcm__pad2 = 0;
934         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
935         tcm->tcm_parent = clid;
936         tcm->tcm_handle = q->handle;
937         tcm->tcm_info = refcount_read(&q->refcnt);
938         if (nla_put_string(skb, TCA_KIND, q->ops->id))
939                 goto nla_put_failure;
940         if (q->ops->ingress_block_get) {
941                 block_index = q->ops->ingress_block_get(q);
942                 if (block_index &&
943                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
944                         goto nla_put_failure;
945         }
946         if (q->ops->egress_block_get) {
947                 block_index = q->ops->egress_block_get(q);
948                 if (block_index &&
949                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
950                         goto nla_put_failure;
951         }
952         if (q->ops->dump && q->ops->dump(q, skb) < 0)
953                 goto nla_put_failure;
954         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
955                 goto nla_put_failure;
956         qlen = qdisc_qlen_sum(q);
957
958         stab = rtnl_dereference(q->stab);
959         if (stab && qdisc_dump_stab(skb, stab) < 0)
960                 goto nla_put_failure;
961
962         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
963                                          NULL, &d, TCA_PAD) < 0)
964                 goto nla_put_failure;
965
966         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
967                 goto nla_put_failure;
968
969         if (qdisc_is_percpu_stats(q)) {
970                 cpu_bstats = q->cpu_bstats;
971                 cpu_qstats = q->cpu_qstats;
972         }
973
974         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
975             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
976             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
977                 goto nla_put_failure;
978
979         if (gnet_stats_finish_copy(&d) < 0)
980                 goto nla_put_failure;
981
982         if (extack && extack->_msg &&
983             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
984                 goto out_nlmsg_trim;
985
986         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
987
988         return skb->len;
989
990 out_nlmsg_trim:
991 nla_put_failure:
992         nlmsg_trim(skb, b);
993         return -1;
994 }
995
996 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
997 {
998         if (q->flags & TCQ_F_BUILTIN)
999                 return true;
1000         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
1001                 return true;
1002
1003         return false;
1004 }
1005
1006 static int qdisc_get_notify(struct net *net, struct sk_buff *oskb,
1007                             struct nlmsghdr *n, u32 clid, struct Qdisc *q,
1008                             struct netlink_ext_ack *extack)
1009 {
1010         struct sk_buff *skb;
1011         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1012
1013         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1014         if (!skb)
1015                 return -ENOBUFS;
1016
1017         if (!tc_qdisc_dump_ignore(q, false)) {
1018                 if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
1019                                   RTM_NEWQDISC, extack) < 0)
1020                         goto err_out;
1021         }
1022
1023         if (skb->len)
1024                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1025                                       n->nlmsg_flags & NLM_F_ECHO);
1026
1027 err_out:
1028         kfree_skb(skb);
1029         return -EINVAL;
1030 }
1031
1032 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1033                         struct nlmsghdr *n, u32 clid,
1034                         struct Qdisc *old, struct Qdisc *new,
1035                         struct netlink_ext_ack *extack)
1036 {
1037         struct sk_buff *skb;
1038         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1039
1040         if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
1041                 return 0;
1042
1043         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1044         if (!skb)
1045                 return -ENOBUFS;
1046
1047         if (old && !tc_qdisc_dump_ignore(old, false)) {
1048                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1049                                   0, RTM_DELQDISC, extack) < 0)
1050                         goto err_out;
1051         }
1052         if (new && !tc_qdisc_dump_ignore(new, false)) {
1053                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1054                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1055                         goto err_out;
1056         }
1057
1058         if (skb->len)
1059                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1060                                       n->nlmsg_flags & NLM_F_ECHO);
1061
1062 err_out:
1063         kfree_skb(skb);
1064         return -EINVAL;
1065 }
1066
1067 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1068                                struct nlmsghdr *n, u32 clid,
1069                                struct Qdisc *old, struct Qdisc *new,
1070                                struct netlink_ext_ack *extack)
1071 {
1072         if (new || old)
1073                 qdisc_notify(net, skb, n, clid, old, new, extack);
1074
1075         if (old)
1076                 qdisc_put(old);
1077 }
1078
1079 static void qdisc_clear_nolock(struct Qdisc *sch)
1080 {
1081         sch->flags &= ~TCQ_F_NOLOCK;
1082         if (!(sch->flags & TCQ_F_CPUSTATS))
1083                 return;
1084
1085         free_percpu(sch->cpu_bstats);
1086         free_percpu(sch->cpu_qstats);
1087         sch->cpu_bstats = NULL;
1088         sch->cpu_qstats = NULL;
1089         sch->flags &= ~TCQ_F_CPUSTATS;
1090 }
1091
1092 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1093  * to device "dev".
1094  *
1095  * When appropriate send a netlink notification using 'skb'
1096  * and "n".
1097  *
1098  * On success, destroy old qdisc.
1099  */
1100
1101 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1102                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1103                        struct Qdisc *new, struct Qdisc *old,
1104                        struct netlink_ext_ack *extack)
1105 {
1106         struct Qdisc *q = old;
1107         struct net *net = dev_net(dev);
1108
1109         if (parent == NULL) {
1110                 unsigned int i, num_q, ingress;
1111                 struct netdev_queue *dev_queue;
1112
1113                 ingress = 0;
1114                 num_q = dev->num_tx_queues;
1115                 if ((q && q->flags & TCQ_F_INGRESS) ||
1116                     (new && new->flags & TCQ_F_INGRESS)) {
1117                         ingress = 1;
1118                         dev_queue = dev_ingress_queue(dev);
1119                         if (!dev_queue) {
1120                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1121                                 return -ENOENT;
1122                         }
1123
1124                         q = rtnl_dereference(dev_queue->qdisc_sleeping);
1125
1126                         /* This is the counterpart of that qdisc_refcount_inc_nz() call in
1127                          * __tcf_qdisc_find() for filter requests.
1128                          */
1129                         if (!qdisc_refcount_dec_if_one(q)) {
1130                                 NL_SET_ERR_MSG(extack,
1131                                                "Current ingress or clsact Qdisc has ongoing filter requests");
1132                                 return -EBUSY;
1133                         }
1134                 }
1135
1136                 if (dev->flags & IFF_UP)
1137                         dev_deactivate(dev);
1138
1139                 qdisc_offload_graft_root(dev, new, old, extack);
1140
1141                 if (new && new->ops->attach && !ingress)
1142                         goto skip;
1143
1144                 if (!ingress) {
1145                         for (i = 0; i < num_q; i++) {
1146                                 dev_queue = netdev_get_tx_queue(dev, i);
1147                                 old = dev_graft_qdisc(dev_queue, new);
1148
1149                                 if (new && i > 0)
1150                                         qdisc_refcount_inc(new);
1151                                 qdisc_put(old);
1152                         }
1153                 } else {
1154                         old = dev_graft_qdisc(dev_queue, NULL);
1155
1156                         /* {ingress,clsact}_destroy() @old before grafting @new to avoid
1157                          * unprotected concurrent accesses to net_device::miniq_{in,e}gress
1158                          * pointer(s) in mini_qdisc_pair_swap().
1159                          */
1160                         qdisc_notify(net, skb, n, classid, old, new, extack);
1161                         qdisc_destroy(old);
1162
1163                         dev_graft_qdisc(dev_queue, new);
1164                 }
1165
1166 skip:
1167                 if (!ingress) {
1168                         old = rtnl_dereference(dev->qdisc);
1169                         if (new && !new->ops->attach)
1170                                 qdisc_refcount_inc(new);
1171                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1172
1173                         notify_and_destroy(net, skb, n, classid, old, new, extack);
1174
1175                         if (new && new->ops->attach)
1176                                 new->ops->attach(new);
1177                 }
1178
1179                 if (dev->flags & IFF_UP)
1180                         dev_activate(dev);
1181         } else {
1182                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1183                 unsigned long cl;
1184                 int err;
1185
1186                 /* Only support running class lockless if parent is lockless */
1187                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1188                         qdisc_clear_nolock(new);
1189
1190                 if (!cops || !cops->graft)
1191                         return -EOPNOTSUPP;
1192
1193                 cl = cops->find(parent, classid);
1194                 if (!cl) {
1195                         NL_SET_ERR_MSG(extack, "Specified class not found");
1196                         return -ENOENT;
1197                 }
1198
1199                 if (new && new->ops == &noqueue_qdisc_ops) {
1200                         NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1201                         return -EINVAL;
1202                 }
1203
1204                 err = cops->graft(parent, cl, new, &old, extack);
1205                 if (err)
1206                         return err;
1207                 notify_and_destroy(net, skb, n, classid, old, new, extack);
1208         }
1209         return 0;
1210 }
1211
1212 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1213                                    struct netlink_ext_ack *extack)
1214 {
1215         u32 block_index;
1216
1217         if (tca[TCA_INGRESS_BLOCK]) {
1218                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1219
1220                 if (!block_index) {
1221                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1222                         return -EINVAL;
1223                 }
1224                 if (!sch->ops->ingress_block_set) {
1225                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1226                         return -EOPNOTSUPP;
1227                 }
1228                 sch->ops->ingress_block_set(sch, block_index);
1229         }
1230         if (tca[TCA_EGRESS_BLOCK]) {
1231                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1232
1233                 if (!block_index) {
1234                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1235                         return -EINVAL;
1236                 }
1237                 if (!sch->ops->egress_block_set) {
1238                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1239                         return -EOPNOTSUPP;
1240                 }
1241                 sch->ops->egress_block_set(sch, block_index);
1242         }
1243         return 0;
1244 }
1245
1246 /*
1247    Allocate and initialize new qdisc.
1248
1249    Parameters are passed via opt.
1250  */
1251
1252 static struct Qdisc *qdisc_create(struct net_device *dev,
1253                                   struct netdev_queue *dev_queue,
1254                                   u32 parent, u32 handle,
1255                                   struct nlattr **tca, int *errp,
1256                                   struct netlink_ext_ack *extack)
1257 {
1258         int err;
1259         struct nlattr *kind = tca[TCA_KIND];
1260         struct Qdisc *sch;
1261         struct Qdisc_ops *ops;
1262         struct qdisc_size_table *stab;
1263
1264         ops = qdisc_lookup_ops(kind);
1265 #ifdef CONFIG_MODULES
1266         if (ops == NULL && kind != NULL) {
1267                 char name[IFNAMSIZ];
1268                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1269                         /* We dropped the RTNL semaphore in order to
1270                          * perform the module load.  So, even if we
1271                          * succeeded in loading the module we have to
1272                          * tell the caller to replay the request.  We
1273                          * indicate this using -EAGAIN.
1274                          * We replay the request because the device may
1275                          * go away in the mean time.
1276                          */
1277                         rtnl_unlock();
1278                         request_module(NET_SCH_ALIAS_PREFIX "%s", name);
1279                         rtnl_lock();
1280                         ops = qdisc_lookup_ops(kind);
1281                         if (ops != NULL) {
1282                                 /* We will try again qdisc_lookup_ops,
1283                                  * so don't keep a reference.
1284                                  */
1285                                 module_put(ops->owner);
1286                                 err = -EAGAIN;
1287                                 goto err_out;
1288                         }
1289                 }
1290         }
1291 #endif
1292
1293         err = -ENOENT;
1294         if (!ops) {
1295                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1296                 goto err_out;
1297         }
1298
1299         sch = qdisc_alloc(dev_queue, ops, extack);
1300         if (IS_ERR(sch)) {
1301                 err = PTR_ERR(sch);
1302                 goto err_out2;
1303         }
1304
1305         sch->parent = parent;
1306
1307         if (handle == TC_H_INGRESS) {
1308                 if (!(sch->flags & TCQ_F_INGRESS)) {
1309                         NL_SET_ERR_MSG(extack,
1310                                        "Specified parent ID is reserved for ingress and clsact Qdiscs");
1311                         err = -EINVAL;
1312                         goto err_out3;
1313                 }
1314                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1315         } else {
1316                 if (handle == 0) {
1317                         handle = qdisc_alloc_handle(dev);
1318                         if (handle == 0) {
1319                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1320                                 err = -ENOSPC;
1321                                 goto err_out3;
1322                         }
1323                 }
1324                 if (!netif_is_multiqueue(dev))
1325                         sch->flags |= TCQ_F_ONETXQUEUE;
1326         }
1327
1328         sch->handle = handle;
1329
1330         /* This exist to keep backward compatible with a userspace
1331          * loophole, what allowed userspace to get IFF_NO_QUEUE
1332          * facility on older kernels by setting tx_queue_len=0 (prior
1333          * to qdisc init), and then forgot to reinit tx_queue_len
1334          * before again attaching a qdisc.
1335          */
1336         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1337                 WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN);
1338                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1339         }
1340
1341         err = qdisc_block_indexes_set(sch, tca, extack);
1342         if (err)
1343                 goto err_out3;
1344
1345         if (tca[TCA_STAB]) {
1346                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1347                 if (IS_ERR(stab)) {
1348                         err = PTR_ERR(stab);
1349                         goto err_out3;
1350                 }
1351                 rcu_assign_pointer(sch->stab, stab);
1352         }
1353
1354         if (ops->init) {
1355                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1356                 if (err != 0)
1357                         goto err_out4;
1358         }
1359
1360         if (tca[TCA_RATE]) {
1361                 err = -EOPNOTSUPP;
1362                 if (sch->flags & TCQ_F_MQROOT) {
1363                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1364                         goto err_out4;
1365                 }
1366
1367                 err = gen_new_estimator(&sch->bstats,
1368                                         sch->cpu_bstats,
1369                                         &sch->rate_est,
1370                                         NULL,
1371                                         true,
1372                                         tca[TCA_RATE]);
1373                 if (err) {
1374                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1375                         goto err_out4;
1376                 }
1377         }
1378
1379         qdisc_hash_add(sch, false);
1380         trace_qdisc_create(ops, dev, parent);
1381
1382         return sch;
1383
1384 err_out4:
1385         /* Even if ops->init() failed, we call ops->destroy()
1386          * like qdisc_create_dflt().
1387          */
1388         if (ops->destroy)
1389                 ops->destroy(sch);
1390         qdisc_put_stab(rtnl_dereference(sch->stab));
1391 err_out3:
1392         lockdep_unregister_key(&sch->root_lock_key);
1393         netdev_put(dev, &sch->dev_tracker);
1394         qdisc_free(sch);
1395 err_out2:
1396         module_put(ops->owner);
1397 err_out:
1398         *errp = err;
1399         return NULL;
1400 }
1401
1402 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1403                         struct netlink_ext_ack *extack)
1404 {
1405         struct qdisc_size_table *ostab, *stab = NULL;
1406         int err = 0;
1407
1408         if (tca[TCA_OPTIONS]) {
1409                 if (!sch->ops->change) {
1410                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1411                         return -EINVAL;
1412                 }
1413                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1414                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1415                         return -EOPNOTSUPP;
1416                 }
1417                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1418                 if (err)
1419                         return err;
1420         }
1421
1422         if (tca[TCA_STAB]) {
1423                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1424                 if (IS_ERR(stab))
1425                         return PTR_ERR(stab);
1426         }
1427
1428         ostab = rtnl_dereference(sch->stab);
1429         rcu_assign_pointer(sch->stab, stab);
1430         qdisc_put_stab(ostab);
1431
1432         if (tca[TCA_RATE]) {
1433                 /* NB: ignores errors from replace_estimator
1434                    because change can't be undone. */
1435                 if (sch->flags & TCQ_F_MQROOT)
1436                         goto out;
1437                 gen_replace_estimator(&sch->bstats,
1438                                       sch->cpu_bstats,
1439                                       &sch->rate_est,
1440                                       NULL,
1441                                       true,
1442                                       tca[TCA_RATE]);
1443         }
1444 out:
1445         return 0;
1446 }
1447
1448 struct check_loop_arg {
1449         struct qdisc_walker     w;
1450         struct Qdisc            *p;
1451         int                     depth;
1452 };
1453
1454 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1455                          struct qdisc_walker *w);
1456
1457 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1458 {
1459         struct check_loop_arg   arg;
1460
1461         if (q->ops->cl_ops == NULL)
1462                 return 0;
1463
1464         arg.w.stop = arg.w.skip = arg.w.count = 0;
1465         arg.w.fn = check_loop_fn;
1466         arg.depth = depth;
1467         arg.p = p;
1468         q->ops->cl_ops->walk(q, &arg.w);
1469         return arg.w.stop ? -ELOOP : 0;
1470 }
1471
1472 static int
1473 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1474 {
1475         struct Qdisc *leaf;
1476         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1477         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1478
1479         leaf = cops->leaf(q, cl);
1480         if (leaf) {
1481                 if (leaf == arg->p || arg->depth > 7)
1482                         return -ELOOP;
1483                 return check_loop(leaf, arg->p, arg->depth + 1);
1484         }
1485         return 0;
1486 }
1487
1488 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1489         [TCA_KIND]              = { .type = NLA_STRING },
1490         [TCA_RATE]              = { .type = NLA_BINARY,
1491                                     .len = sizeof(struct tc_estimator) },
1492         [TCA_STAB]              = { .type = NLA_NESTED },
1493         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1494         [TCA_CHAIN]             = { .type = NLA_U32 },
1495         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1496         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1497 };
1498
1499 /*
1500  * Delete/get qdisc.
1501  */
1502
1503 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1504                         struct netlink_ext_ack *extack)
1505 {
1506         struct net *net = sock_net(skb->sk);
1507         struct tcmsg *tcm = nlmsg_data(n);
1508         struct nlattr *tca[TCA_MAX + 1];
1509         struct net_device *dev;
1510         u32 clid;
1511         struct Qdisc *q = NULL;
1512         struct Qdisc *p = NULL;
1513         int err;
1514
1515         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1516                                      rtm_tca_policy, extack);
1517         if (err < 0)
1518                 return err;
1519
1520         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1521         if (!dev)
1522                 return -ENODEV;
1523
1524         clid = tcm->tcm_parent;
1525         if (clid) {
1526                 if (clid != TC_H_ROOT) {
1527                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1528                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1529                                 if (!p) {
1530                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1531                                         return -ENOENT;
1532                                 }
1533                                 q = qdisc_leaf(p, clid);
1534                         } else if (dev_ingress_queue(dev)) {
1535                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1536                         }
1537                 } else {
1538                         q = rtnl_dereference(dev->qdisc);
1539                 }
1540                 if (!q) {
1541                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1542                         return -ENOENT;
1543                 }
1544
1545                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1546                         NL_SET_ERR_MSG(extack, "Invalid handle");
1547                         return -EINVAL;
1548                 }
1549         } else {
1550                 q = qdisc_lookup(dev, tcm->tcm_handle);
1551                 if (!q) {
1552                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1553                         return -ENOENT;
1554                 }
1555         }
1556
1557         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1558                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1559                 return -EINVAL;
1560         }
1561
1562         if (n->nlmsg_type == RTM_DELQDISC) {
1563                 if (!clid) {
1564                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1565                         return -EINVAL;
1566                 }
1567                 if (q->handle == 0) {
1568                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1569                         return -ENOENT;
1570                 }
1571                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1572                 if (err != 0)
1573                         return err;
1574         } else {
1575                 qdisc_get_notify(net, skb, n, clid, q, NULL);
1576         }
1577         return 0;
1578 }
1579
1580 static bool req_create_or_replace(struct nlmsghdr *n)
1581 {
1582         return (n->nlmsg_flags & NLM_F_CREATE &&
1583                 n->nlmsg_flags & NLM_F_REPLACE);
1584 }
1585
1586 static bool req_create_exclusive(struct nlmsghdr *n)
1587 {
1588         return (n->nlmsg_flags & NLM_F_CREATE &&
1589                 n->nlmsg_flags & NLM_F_EXCL);
1590 }
1591
1592 static bool req_change(struct nlmsghdr *n)
1593 {
1594         return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1595                 !(n->nlmsg_flags & NLM_F_REPLACE) &&
1596                 !(n->nlmsg_flags & NLM_F_EXCL));
1597 }
1598
1599 /*
1600  * Create/change qdisc.
1601  */
1602 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1603                            struct netlink_ext_ack *extack)
1604 {
1605         struct net *net = sock_net(skb->sk);
1606         struct tcmsg *tcm;
1607         struct nlattr *tca[TCA_MAX + 1];
1608         struct net_device *dev;
1609         u32 clid;
1610         struct Qdisc *q, *p;
1611         int err;
1612
1613 replay:
1614         /* Reinit, just in case something touches this. */
1615         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1616                                      rtm_tca_policy, extack);
1617         if (err < 0)
1618                 return err;
1619
1620         tcm = nlmsg_data(n);
1621         clid = tcm->tcm_parent;
1622         q = p = NULL;
1623
1624         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1625         if (!dev)
1626                 return -ENODEV;
1627
1628
1629         if (clid) {
1630                 if (clid != TC_H_ROOT) {
1631                         if (clid != TC_H_INGRESS) {
1632                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1633                                 if (!p) {
1634                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1635                                         return -ENOENT;
1636                                 }
1637                                 q = qdisc_leaf(p, clid);
1638                         } else if (dev_ingress_queue_create(dev)) {
1639                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1640                         }
1641                 } else {
1642                         q = rtnl_dereference(dev->qdisc);
1643                 }
1644
1645                 /* It may be default qdisc, ignore it */
1646                 if (q && q->handle == 0)
1647                         q = NULL;
1648
1649                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1650                         if (tcm->tcm_handle) {
1651                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1652                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1653                                         return -EEXIST;
1654                                 }
1655                                 if (TC_H_MIN(tcm->tcm_handle)) {
1656                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1657                                         return -EINVAL;
1658                                 }
1659                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1660                                 if (!q)
1661                                         goto create_n_graft;
1662                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1663                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1664                                         return -EEXIST;
1665                                 }
1666                                 if (tca[TCA_KIND] &&
1667                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1668                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1669                                         return -EINVAL;
1670                                 }
1671                                 if (q->flags & TCQ_F_INGRESS) {
1672                                         NL_SET_ERR_MSG(extack,
1673                                                        "Cannot regraft ingress or clsact Qdiscs");
1674                                         return -EINVAL;
1675                                 }
1676                                 if (q == p ||
1677                                     (p && check_loop(q, p, 0))) {
1678                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1679                                         return -ELOOP;
1680                                 }
1681                                 if (clid == TC_H_INGRESS) {
1682                                         NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1683                                         return -EINVAL;
1684                                 }
1685                                 qdisc_refcount_inc(q);
1686                                 goto graft;
1687                         } else {
1688                                 if (!q)
1689                                         goto create_n_graft;
1690
1691                                 /* This magic test requires explanation.
1692                                  *
1693                                  *   We know, that some child q is already
1694                                  *   attached to this parent and have choice:
1695                                  *   1) change it or 2) create/graft new one.
1696                                  *   If the requested qdisc kind is different
1697                                  *   than the existing one, then we choose graft.
1698                                  *   If they are the same then this is "change"
1699                                  *   operation - just let it fallthrough..
1700                                  *
1701                                  *   1. We are allowed to create/graft only
1702                                  *   if the request is explicitly stating
1703                                  *   "please create if it doesn't exist".
1704                                  *
1705                                  *   2. If the request is to exclusive create
1706                                  *   then the qdisc tcm_handle is not expected
1707                                  *   to exist, so that we choose create/graft too.
1708                                  *
1709                                  *   3. The last case is when no flags are set.
1710                                  *   This will happen when for example tc
1711                                  *   utility issues a "change" command.
1712                                  *   Alas, it is sort of hole in API, we
1713                                  *   cannot decide what to do unambiguously.
1714                                  *   For now we select create/graft.
1715                                  */
1716                                 if (tca[TCA_KIND] &&
1717                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1718                                         if (req_create_or_replace(n) ||
1719                                             req_create_exclusive(n))
1720                                                 goto create_n_graft;
1721                                         else if (req_change(n))
1722                                                 goto create_n_graft2;
1723                                 }
1724                         }
1725                 }
1726         } else {
1727                 if (!tcm->tcm_handle) {
1728                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1729                         return -EINVAL;
1730                 }
1731                 q = qdisc_lookup(dev, tcm->tcm_handle);
1732         }
1733
1734         /* Change qdisc parameters */
1735         if (!q) {
1736                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1737                 return -ENOENT;
1738         }
1739         if (n->nlmsg_flags & NLM_F_EXCL) {
1740                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1741                 return -EEXIST;
1742         }
1743         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1744                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1745                 return -EINVAL;
1746         }
1747         err = qdisc_change(q, tca, extack);
1748         if (err == 0)
1749                 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1750         return err;
1751
1752 create_n_graft:
1753         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1754                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1755                 return -ENOENT;
1756         }
1757 create_n_graft2:
1758         if (clid == TC_H_INGRESS) {
1759                 if (dev_ingress_queue(dev)) {
1760                         q = qdisc_create(dev, dev_ingress_queue(dev),
1761                                          tcm->tcm_parent, tcm->tcm_parent,
1762                                          tca, &err, extack);
1763                 } else {
1764                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1765                         err = -ENOENT;
1766                 }
1767         } else {
1768                 struct netdev_queue *dev_queue;
1769
1770                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1771                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1772                 else if (p)
1773                         dev_queue = p->dev_queue;
1774                 else
1775                         dev_queue = netdev_get_tx_queue(dev, 0);
1776
1777                 q = qdisc_create(dev, dev_queue,
1778                                  tcm->tcm_parent, tcm->tcm_handle,
1779                                  tca, &err, extack);
1780         }
1781         if (q == NULL) {
1782                 if (err == -EAGAIN)
1783                         goto replay;
1784                 return err;
1785         }
1786
1787 graft:
1788         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1789         if (err) {
1790                 if (q)
1791                         qdisc_put(q);
1792                 return err;
1793         }
1794
1795         return 0;
1796 }
1797
1798 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1799                               struct netlink_callback *cb,
1800                               int *q_idx_p, int s_q_idx, bool recur,
1801                               bool dump_invisible)
1802 {
1803         int ret = 0, q_idx = *q_idx_p;
1804         struct Qdisc *q;
1805         int b;
1806
1807         if (!root)
1808                 return 0;
1809
1810         q = root;
1811         if (q_idx < s_q_idx) {
1812                 q_idx++;
1813         } else {
1814                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1815                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1816                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1817                                   RTM_NEWQDISC, NULL) <= 0)
1818                         goto done;
1819                 q_idx++;
1820         }
1821
1822         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1823          * itself has already been dumped.
1824          *
1825          * If we've already dumped the top-level (ingress) qdisc above and the global
1826          * qdisc hashtable, we don't want to hit it again
1827          */
1828         if (!qdisc_dev(root) || !recur)
1829                 goto out;
1830
1831         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1832                 if (q_idx < s_q_idx) {
1833                         q_idx++;
1834                         continue;
1835                 }
1836                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1837                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1838                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1839                                   RTM_NEWQDISC, NULL) <= 0)
1840                         goto done;
1841                 q_idx++;
1842         }
1843
1844 out:
1845         *q_idx_p = q_idx;
1846         return ret;
1847 done:
1848         ret = -1;
1849         goto out;
1850 }
1851
1852 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1853 {
1854         struct net *net = sock_net(skb->sk);
1855         int idx, q_idx;
1856         int s_idx, s_q_idx;
1857         struct net_device *dev;
1858         const struct nlmsghdr *nlh = cb->nlh;
1859         struct nlattr *tca[TCA_MAX + 1];
1860         int err;
1861
1862         s_idx = cb->args[0];
1863         s_q_idx = q_idx = cb->args[1];
1864
1865         idx = 0;
1866         ASSERT_RTNL();
1867
1868         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1869                                      rtm_tca_policy, cb->extack);
1870         if (err < 0)
1871                 return err;
1872
1873         for_each_netdev(net, dev) {
1874                 struct netdev_queue *dev_queue;
1875
1876                 if (idx < s_idx)
1877                         goto cont;
1878                 if (idx > s_idx)
1879                         s_q_idx = 0;
1880                 q_idx = 0;
1881
1882                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1883                                        skb, cb, &q_idx, s_q_idx,
1884                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1885                         goto done;
1886
1887                 dev_queue = dev_ingress_queue(dev);
1888                 if (dev_queue &&
1889                     tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1890                                        skb, cb, &q_idx, s_q_idx, false,
1891                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1892                         goto done;
1893
1894 cont:
1895                 idx++;
1896         }
1897
1898 done:
1899         cb->args[0] = idx;
1900         cb->args[1] = q_idx;
1901
1902         return skb->len;
1903 }
1904
1905
1906
1907 /************************************************
1908  *      Traffic classes manipulation.           *
1909  ************************************************/
1910
1911 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1912                           unsigned long cl, u32 portid, u32 seq, u16 flags,
1913                           int event, struct netlink_ext_ack *extack)
1914 {
1915         struct tcmsg *tcm;
1916         struct nlmsghdr  *nlh;
1917         unsigned char *b = skb_tail_pointer(skb);
1918         struct gnet_dump d;
1919         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1920
1921         cond_resched();
1922         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1923         if (!nlh)
1924                 goto out_nlmsg_trim;
1925         tcm = nlmsg_data(nlh);
1926         tcm->tcm_family = AF_UNSPEC;
1927         tcm->tcm__pad1 = 0;
1928         tcm->tcm__pad2 = 0;
1929         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1930         tcm->tcm_parent = q->handle;
1931         tcm->tcm_handle = q->handle;
1932         tcm->tcm_info = 0;
1933         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1934                 goto nla_put_failure;
1935         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1936                 goto nla_put_failure;
1937
1938         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1939                                          NULL, &d, TCA_PAD) < 0)
1940                 goto nla_put_failure;
1941
1942         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1943                 goto nla_put_failure;
1944
1945         if (gnet_stats_finish_copy(&d) < 0)
1946                 goto nla_put_failure;
1947
1948         if (extack && extack->_msg &&
1949             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1950                 goto out_nlmsg_trim;
1951
1952         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1953
1954         return skb->len;
1955
1956 out_nlmsg_trim:
1957 nla_put_failure:
1958         nlmsg_trim(skb, b);
1959         return -1;
1960 }
1961
1962 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1963                          struct nlmsghdr *n, struct Qdisc *q,
1964                          unsigned long cl, int event, struct netlink_ext_ack *extack)
1965 {
1966         struct sk_buff *skb;
1967         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1968
1969         if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
1970                 return 0;
1971
1972         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1973         if (!skb)
1974                 return -ENOBUFS;
1975
1976         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1977                 kfree_skb(skb);
1978                 return -EINVAL;
1979         }
1980
1981         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1982                               n->nlmsg_flags & NLM_F_ECHO);
1983 }
1984
1985 static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
1986                              struct nlmsghdr *n, struct Qdisc *q,
1987                              unsigned long cl, struct netlink_ext_ack *extack)
1988 {
1989         struct sk_buff *skb;
1990         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1991
1992         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1993         if (!skb)
1994                 return -ENOBUFS;
1995
1996         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
1997                            extack) < 0) {
1998                 kfree_skb(skb);
1999                 return -EINVAL;
2000         }
2001
2002         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
2003                               n->nlmsg_flags & NLM_F_ECHO);
2004 }
2005
2006 static int tclass_del_notify(struct net *net,
2007                              const struct Qdisc_class_ops *cops,
2008                              struct sk_buff *oskb, struct nlmsghdr *n,
2009                              struct Qdisc *q, unsigned long cl,
2010                              struct netlink_ext_ack *extack)
2011 {
2012         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
2013         struct sk_buff *skb;
2014         int err = 0;
2015
2016         if (!cops->delete)
2017                 return -EOPNOTSUPP;
2018
2019         if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
2020                 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2021                 if (!skb)
2022                         return -ENOBUFS;
2023
2024                 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
2025                                    RTM_DELTCLASS, extack) < 0) {
2026                         kfree_skb(skb);
2027                         return -EINVAL;
2028                 }
2029         } else {
2030                 skb = NULL;
2031         }
2032
2033         err = cops->delete(q, cl, extack);
2034         if (err) {
2035                 kfree_skb(skb);
2036                 return err;
2037         }
2038
2039         err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
2040                                    n->nlmsg_flags & NLM_F_ECHO);
2041         return err;
2042 }
2043
2044 #ifdef CONFIG_NET_CLS
2045
2046 struct tcf_bind_args {
2047         struct tcf_walker w;
2048         unsigned long base;
2049         unsigned long cl;
2050         u32 classid;
2051 };
2052
2053 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
2054 {
2055         struct tcf_bind_args *a = (void *)arg;
2056
2057         if (n && tp->ops->bind_class) {
2058                 struct Qdisc *q = tcf_block_q(tp->chain->block);
2059
2060                 sch_tree_lock(q);
2061                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
2062                 sch_tree_unlock(q);
2063         }
2064         return 0;
2065 }
2066
2067 struct tc_bind_class_args {
2068         struct qdisc_walker w;
2069         unsigned long new_cl;
2070         u32 portid;
2071         u32 clid;
2072 };
2073
2074 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
2075                                 struct qdisc_walker *w)
2076 {
2077         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
2078         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2079         struct tcf_block *block;
2080         struct tcf_chain *chain;
2081
2082         block = cops->tcf_block(q, cl, NULL);
2083         if (!block)
2084                 return 0;
2085         for (chain = tcf_get_next_chain(block, NULL);
2086              chain;
2087              chain = tcf_get_next_chain(block, chain)) {
2088                 struct tcf_proto *tp;
2089
2090                 for (tp = tcf_get_next_proto(chain, NULL);
2091                      tp; tp = tcf_get_next_proto(chain, tp)) {
2092                         struct tcf_bind_args arg = {};
2093
2094                         arg.w.fn = tcf_node_bind;
2095                         arg.classid = a->clid;
2096                         arg.base = cl;
2097                         arg.cl = a->new_cl;
2098                         tp->ops->walk(tp, &arg.w, true);
2099                 }
2100         }
2101
2102         return 0;
2103 }
2104
2105 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2106                            unsigned long new_cl)
2107 {
2108         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2109         struct tc_bind_class_args args = {};
2110
2111         if (!cops->tcf_block)
2112                 return;
2113         args.portid = portid;
2114         args.clid = clid;
2115         args.new_cl = new_cl;
2116         args.w.fn = tc_bind_class_walker;
2117         q->ops->cl_ops->walk(q, &args.w);
2118 }
2119
2120 #else
2121
2122 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2123                            unsigned long new_cl)
2124 {
2125 }
2126
2127 #endif
2128
2129 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2130                          struct netlink_ext_ack *extack)
2131 {
2132         struct net *net = sock_net(skb->sk);
2133         struct tcmsg *tcm = nlmsg_data(n);
2134         struct nlattr *tca[TCA_MAX + 1];
2135         struct net_device *dev;
2136         struct Qdisc *q = NULL;
2137         const struct Qdisc_class_ops *cops;
2138         unsigned long cl = 0;
2139         unsigned long new_cl;
2140         u32 portid;
2141         u32 clid;
2142         u32 qid;
2143         int err;
2144
2145         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2146                                      rtm_tca_policy, extack);
2147         if (err < 0)
2148                 return err;
2149
2150         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2151         if (!dev)
2152                 return -ENODEV;
2153
2154         /*
2155            parent == TC_H_UNSPEC - unspecified parent.
2156            parent == TC_H_ROOT   - class is root, which has no parent.
2157            parent == X:0         - parent is root class.
2158            parent == X:Y         - parent is a node in hierarchy.
2159            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2160
2161            handle == 0:0         - generate handle from kernel pool.
2162            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2163            handle == X:Y         - clear.
2164            handle == X:0         - root class.
2165          */
2166
2167         /* Step 1. Determine qdisc handle X:0 */
2168
2169         portid = tcm->tcm_parent;
2170         clid = tcm->tcm_handle;
2171         qid = TC_H_MAJ(clid);
2172
2173         if (portid != TC_H_ROOT) {
2174                 u32 qid1 = TC_H_MAJ(portid);
2175
2176                 if (qid && qid1) {
2177                         /* If both majors are known, they must be identical. */
2178                         if (qid != qid1)
2179                                 return -EINVAL;
2180                 } else if (qid1) {
2181                         qid = qid1;
2182                 } else if (qid == 0)
2183                         qid = rtnl_dereference(dev->qdisc)->handle;
2184
2185                 /* Now qid is genuine qdisc handle consistent
2186                  * both with parent and child.
2187                  *
2188                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2189                  */
2190                 if (portid)
2191                         portid = TC_H_MAKE(qid, portid);
2192         } else {
2193                 if (qid == 0)
2194                         qid = rtnl_dereference(dev->qdisc)->handle;
2195         }
2196
2197         /* OK. Locate qdisc */
2198         q = qdisc_lookup(dev, qid);
2199         if (!q)
2200                 return -ENOENT;
2201
2202         /* An check that it supports classes */
2203         cops = q->ops->cl_ops;
2204         if (cops == NULL)
2205                 return -EINVAL;
2206
2207         /* Now try to get class */
2208         if (clid == 0) {
2209                 if (portid == TC_H_ROOT)
2210                         clid = qid;
2211         } else
2212                 clid = TC_H_MAKE(qid, clid);
2213
2214         if (clid)
2215                 cl = cops->find(q, clid);
2216
2217         if (cl == 0) {
2218                 err = -ENOENT;
2219                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2220                     !(n->nlmsg_flags & NLM_F_CREATE))
2221                         goto out;
2222         } else {
2223                 switch (n->nlmsg_type) {
2224                 case RTM_NEWTCLASS:
2225                         err = -EEXIST;
2226                         if (n->nlmsg_flags & NLM_F_EXCL)
2227                                 goto out;
2228                         break;
2229                 case RTM_DELTCLASS:
2230                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2231                         /* Unbind the class with flilters with 0 */
2232                         tc_bind_tclass(q, portid, clid, 0);
2233                         goto out;
2234                 case RTM_GETTCLASS:
2235                         err = tclass_get_notify(net, skb, n, q, cl, extack);
2236                         goto out;
2237                 default:
2238                         err = -EINVAL;
2239                         goto out;
2240                 }
2241         }
2242
2243         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2244                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2245                 return -EOPNOTSUPP;
2246         }
2247
2248         new_cl = cl;
2249         err = -EOPNOTSUPP;
2250         if (cops->change)
2251                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2252         if (err == 0) {
2253                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2254                 /* We just create a new class, need to do reverse binding. */
2255                 if (cl != new_cl)
2256                         tc_bind_tclass(q, portid, clid, new_cl);
2257         }
2258 out:
2259         return err;
2260 }
2261
2262 struct qdisc_dump_args {
2263         struct qdisc_walker     w;
2264         struct sk_buff          *skb;
2265         struct netlink_callback *cb;
2266 };
2267
2268 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2269                             struct qdisc_walker *arg)
2270 {
2271         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2272
2273         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2274                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2275                               RTM_NEWTCLASS, NULL);
2276 }
2277
2278 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2279                                 struct tcmsg *tcm, struct netlink_callback *cb,
2280                                 int *t_p, int s_t)
2281 {
2282         struct qdisc_dump_args arg;
2283
2284         if (tc_qdisc_dump_ignore(q, false) ||
2285             *t_p < s_t || !q->ops->cl_ops ||
2286             (tcm->tcm_parent &&
2287              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2288                 (*t_p)++;
2289                 return 0;
2290         }
2291         if (*t_p > s_t)
2292                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2293         arg.w.fn = qdisc_class_dump;
2294         arg.skb = skb;
2295         arg.cb = cb;
2296         arg.w.stop  = 0;
2297         arg.w.skip = cb->args[1];
2298         arg.w.count = 0;
2299         q->ops->cl_ops->walk(q, &arg.w);
2300         cb->args[1] = arg.w.count;
2301         if (arg.w.stop)
2302                 return -1;
2303         (*t_p)++;
2304         return 0;
2305 }
2306
2307 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2308                                struct tcmsg *tcm, struct netlink_callback *cb,
2309                                int *t_p, int s_t, bool recur)
2310 {
2311         struct Qdisc *q;
2312         int b;
2313
2314         if (!root)
2315                 return 0;
2316
2317         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2318                 return -1;
2319
2320         if (!qdisc_dev(root) || !recur)
2321                 return 0;
2322
2323         if (tcm->tcm_parent) {
2324                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2325                 if (q && q != root &&
2326                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2327                         return -1;
2328                 return 0;
2329         }
2330         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2331                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2332                         return -1;
2333         }
2334
2335         return 0;
2336 }
2337
2338 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2339 {
2340         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2341         struct net *net = sock_net(skb->sk);
2342         struct netdev_queue *dev_queue;
2343         struct net_device *dev;
2344         int t, s_t;
2345
2346         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2347                 return 0;
2348         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2349         if (!dev)
2350                 return 0;
2351
2352         s_t = cb->args[0];
2353         t = 0;
2354
2355         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2356                                 skb, tcm, cb, &t, s_t, true) < 0)
2357                 goto done;
2358
2359         dev_queue = dev_ingress_queue(dev);
2360         if (dev_queue &&
2361             tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
2362                                 skb, tcm, cb, &t, s_t, false) < 0)
2363                 goto done;
2364
2365 done:
2366         cb->args[0] = t;
2367
2368         dev_put(dev);
2369         return skb->len;
2370 }
2371
2372 #ifdef CONFIG_PROC_FS
2373 static int psched_show(struct seq_file *seq, void *v)
2374 {
2375         seq_printf(seq, "%08x %08x %08x %08x\n",
2376                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2377                    1000000,
2378                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2379
2380         return 0;
2381 }
2382
2383 static int __net_init psched_net_init(struct net *net)
2384 {
2385         struct proc_dir_entry *e;
2386
2387         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2388         if (e == NULL)
2389                 return -ENOMEM;
2390
2391         return 0;
2392 }
2393
2394 static void __net_exit psched_net_exit(struct net *net)
2395 {
2396         remove_proc_entry("psched", net->proc_net);
2397 }
2398 #else
2399 static int __net_init psched_net_init(struct net *net)
2400 {
2401         return 0;
2402 }
2403
2404 static void __net_exit psched_net_exit(struct net *net)
2405 {
2406 }
2407 #endif
2408
2409 static struct pernet_operations psched_net_ops = {
2410         .init = psched_net_init,
2411         .exit = psched_net_exit,
2412 };
2413
2414 #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
2415 DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
2416 #endif
2417
2418 static int __init pktsched_init(void)
2419 {
2420         int err;
2421
2422         err = register_pernet_subsys(&psched_net_ops);
2423         if (err) {
2424                 pr_err("pktsched_init: "
2425                        "cannot initialize per netns operations\n");
2426                 return err;
2427         }
2428
2429         register_qdisc(&pfifo_fast_ops);
2430         register_qdisc(&pfifo_qdisc_ops);
2431         register_qdisc(&bfifo_qdisc_ops);
2432         register_qdisc(&pfifo_head_drop_qdisc_ops);
2433         register_qdisc(&mq_qdisc_ops);
2434         register_qdisc(&noqueue_qdisc_ops);
2435
2436         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2437         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2438         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2439                       0);
2440         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2441         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2442         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2443                       0);
2444
2445         tc_wrapper_init();
2446
2447         return 0;
2448 }
2449
2450 subsys_initcall(pktsched_init);
This page took 0.172536 seconds and 4 git commands to generate.