]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/sched/sch_api.c
mei: move mei_cl_irq_write_complete to client.c
[karo-tx-linux.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform some
71    sanity checks and the part of the work that is common to all
72    qdiscs, and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects the list of registered TC modules (qdisc_base): write-locked by register_qdisc()/unregister_qdisc(), read-locked by qdisc_lookup_ops(). It is a pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* Head of the singly linked list (chained through ->next) of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* We know handle. Find qdisc among all qdisc's attached to device
204    (root qdisc, all its children, children of children etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
295             nla_len(tab) != TC_RTAB_SIZE)
296                 return NULL;
297
298         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
299                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
300                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
301                         rtab->refcnt++;
302                         return rtab;
303                 }
304         }
305
306         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
307         if (rtab) {
308                 rtab->rate = *r;
309                 rtab->refcnt = 1;
310                 memcpy(rtab->data, nla_data(tab), 1024);
311                 rtab->next = qdisc_rtab_list;
312                 qdisc_rtab_list = rtab;
313         }
314         return rtab;
315 }
316 EXPORT_SYMBOL(qdisc_get_rtab);
317
318 void qdisc_put_rtab(struct qdisc_rate_table *tab)
319 {
320         struct qdisc_rate_table *rtab, **rtabp;
321
322         if (!tab || --tab->refcnt)
323                 return;
324
325         for (rtabp = &qdisc_rtab_list;
326              (rtab = *rtabp) != NULL;
327              rtabp = &rtab->next) {
328                 if (rtab == tab) {
329                         *rtabp = rtab->next;
330                         kfree(rtab);
331                         return;
332                 }
333         }
334 }
335 EXPORT_SYMBOL(qdisc_put_rtab);
336
337 static LIST_HEAD(qdisc_stab_list); /* all shared size tables, linked through ->list */
338 static DEFINE_SPINLOCK(qdisc_stab_lock); /* guards qdisc_stab_list and the tables' refcnt */
339
    /* Netlink policy used by qdisc_get_stab() to parse the nested TCA_STAB attribute. */
340 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
341         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
342         [TCA_STAB_DATA] = { .type = NLA_BINARY },
343 };
344
345 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
346 {
347         struct nlattr *tb[TCA_STAB_MAX + 1];
348         struct qdisc_size_table *stab;
349         struct tc_sizespec *s;
350         unsigned int tsize = 0;
351         u16 *tab = NULL;
352         int err;
353
354         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
355         if (err < 0)
356                 return ERR_PTR(err);
357         if (!tb[TCA_STAB_BASE])
358                 return ERR_PTR(-EINVAL);
359
360         s = nla_data(tb[TCA_STAB_BASE]);
361
362         if (s->tsize > 0) {
363                 if (!tb[TCA_STAB_DATA])
364                         return ERR_PTR(-EINVAL);
365                 tab = nla_data(tb[TCA_STAB_DATA]);
366                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
367         }
368
369         if (tsize != s->tsize || (!tab && tsize > 0))
370                 return ERR_PTR(-EINVAL);
371
372         spin_lock(&qdisc_stab_lock);
373
374         list_for_each_entry(stab, &qdisc_stab_list, list) {
375                 if (memcmp(&stab->szopts, s, sizeof(*s)))
376                         continue;
377                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
378                         continue;
379                 stab->refcnt++;
380                 spin_unlock(&qdisc_stab_lock);
381                 return stab;
382         }
383
384         spin_unlock(&qdisc_stab_lock);
385
386         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
387         if (!stab)
388                 return ERR_PTR(-ENOMEM);
389
390         stab->refcnt = 1;
391         stab->szopts = *s;
392         if (tsize > 0)
393                 memcpy(stab->data, tab, tsize * sizeof(u16));
394
395         spin_lock(&qdisc_stab_lock);
396         list_add_tail(&stab->list, &qdisc_stab_list);
397         spin_unlock(&qdisc_stab_lock);
398
399         return stab;
400 }
401
402 static void stab_kfree_rcu(struct rcu_head *head)
403 {
404         kfree(container_of(head, struct qdisc_size_table, rcu));
405 }
406
407 void qdisc_put_stab(struct qdisc_size_table *tab)
408 {
409         if (!tab)
410                 return;
411
412         spin_lock(&qdisc_stab_lock);
413
414         if (--tab->refcnt == 0) {
415                 list_del(&tab->list);
416                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
417         }
418
419         spin_unlock(&qdisc_stab_lock);
420 }
421 EXPORT_SYMBOL(qdisc_put_stab);
422
423 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
424 {
425         struct nlattr *nest;
426
427         nest = nla_nest_start(skb, TCA_STAB);
428         if (nest == NULL)
429                 goto nla_put_failure;
430         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
431                 goto nla_put_failure;
432         nla_nest_end(skb, nest);
433
434         return skb->len;
435
436 nla_put_failure:
437         return -1;
438 }
439
440 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
441 {
442         int pkt_len, slot;
443
444         pkt_len = skb->len + stab->szopts.overhead;
445         if (unlikely(!stab->szopts.tsize))
446                 goto out;
447
448         slot = pkt_len + stab->szopts.cell_align;
449         if (unlikely(slot < 0))
450                 slot = 0;
451
452         slot >>= stab->szopts.cell_log;
453         if (likely(slot < stab->szopts.tsize))
454                 pkt_len = stab->data[slot];
455         else
456                 pkt_len = stab->data[stab->szopts.tsize - 1] *
457                                 (slot / stab->szopts.tsize) +
458                                 stab->data[slot % stab->szopts.tsize];
459
460         pkt_len <<= stab->szopts.size_log;
461 out:
462         if (unlikely(pkt_len < 1))
463                 pkt_len = 1;
464         qdisc_skb_cb(skb)->pkt_len = pkt_len;
465 }
466 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
467
468 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
469 {
470         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
471                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
472                         txt, qdisc->ops->id, qdisc->handle >> 16);
473                 qdisc->flags |= TCQ_F_WARN_NONWC;
474         }
475 }
476 EXPORT_SYMBOL(qdisc_warn_nonwc);
477
478 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
479 {
480         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
481                                                  timer);
482
483         qdisc_unthrottled(wd->qdisc);
484         __netif_schedule(qdisc_root(wd->qdisc));
485
486         return HRTIMER_NORESTART;
487 }
488
489 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
490 {
491         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
492         wd->timer.function = qdisc_watchdog;
493         wd->qdisc = qdisc;
494 }
495 EXPORT_SYMBOL(qdisc_watchdog_init);
496
497 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
498 {
499         if (test_bit(__QDISC_STATE_DEACTIVATED,
500                      &qdisc_root_sleeping(wd->qdisc)->state))
501                 return;
502
503         qdisc_throttled(wd->qdisc);
504
505         hrtimer_start(&wd->timer,
506                       ns_to_ktime(expires),
507                       HRTIMER_MODE_ABS);
508 }
509 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
510
511 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
512 {
513         hrtimer_cancel(&wd->timer);
514         qdisc_unthrottled(wd->qdisc);
515 }
516 EXPORT_SYMBOL(qdisc_watchdog_cancel);
517
518 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
519 {
520         unsigned int size = n * sizeof(struct hlist_head), i;
521         struct hlist_head *h;
522
523         if (size <= PAGE_SIZE)
524                 h = kmalloc(size, GFP_KERNEL);
525         else
526                 h = (struct hlist_head *)
527                         __get_free_pages(GFP_KERNEL, get_order(size));
528
529         if (h != NULL) {
530                 for (i = 0; i < n; i++)
531                         INIT_HLIST_HEAD(&h[i]);
532         }
533         return h;
534 }
535
536 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
537 {
538         unsigned int size = n * sizeof(struct hlist_head);
539
540         if (size <= PAGE_SIZE)
541                 kfree(h);
542         else
543                 free_pages((unsigned long)h, get_order(size));
544 }
545
546 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
547 {
548         struct Qdisc_class_common *cl;
549         struct hlist_node *next;
550         struct hlist_head *nhash, *ohash;
551         unsigned int nsize, nmask, osize;
552         unsigned int i, h;
553
554         /* Rehash when load factor exceeds 0.75 */
555         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
556                 return;
557         nsize = clhash->hashsize * 2;
558         nmask = nsize - 1;
559         nhash = qdisc_class_hash_alloc(nsize);
560         if (nhash == NULL)
561                 return;
562
563         ohash = clhash->hash;
564         osize = clhash->hashsize;
565
566         sch_tree_lock(sch);
567         for (i = 0; i < osize; i++) {
568                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
569                         h = qdisc_class_hash(cl->classid, nmask);
570                         hlist_add_head(&cl->hnode, &nhash[h]);
571                 }
572         }
573         clhash->hash     = nhash;
574         clhash->hashsize = nsize;
575         clhash->hashmask = nmask;
576         sch_tree_unlock(sch);
577
578         qdisc_class_hash_free(ohash, osize);
579 }
580 EXPORT_SYMBOL(qdisc_class_hash_grow);
581
582 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
583 {
584         unsigned int size = 4;
585
586         clhash->hash = qdisc_class_hash_alloc(size);
587         if (clhash->hash == NULL)
588                 return -ENOMEM;
589         clhash->hashsize  = size;
590         clhash->hashmask  = size - 1;
591         clhash->hashelems = 0;
592         return 0;
593 }
594 EXPORT_SYMBOL(qdisc_class_hash_init);
595
596 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
597 {
598         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
599 }
600 EXPORT_SYMBOL(qdisc_class_hash_destroy);
601
602 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
603                              struct Qdisc_class_common *cl)
604 {
605         unsigned int h;
606
607         INIT_HLIST_NODE(&cl->hnode);
608         h = qdisc_class_hash(cl->classid, clhash->hashmask);
609         hlist_add_head(&cl->hnode, &clhash->hash[h]);
610         clhash->hashelems++;
611 }
612 EXPORT_SYMBOL(qdisc_class_hash_insert);
613
614 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
615                              struct Qdisc_class_common *cl)
616 {
617         hlist_del(&cl->hnode);
618         clhash->hashelems--;
619 }
620 EXPORT_SYMBOL(qdisc_class_hash_remove);
621
622 /* Allocate an unique handle from space managed by kernel
623  * Possible range is [8000-FFFF]:0000 (0x8000 values)
624  */
625 static u32 qdisc_alloc_handle(struct net_device *dev)
626 {
627         int i = 0x8000;
628         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
629
630         do {
631                 autohandle += TC_H_MAKE(0x10000U, 0);
632                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
633                         autohandle = TC_H_MAKE(0x80000000U, 0);
634                 if (!qdisc_lookup(dev, autohandle))
635                         return autohandle;
636                 cond_resched();
637         } while (--i > 0);
638
639         return 0;
640 }
641
642 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
643 {
644         const struct Qdisc_class_ops *cops;
645         unsigned long cl;
646         u32 parentid;
647
648         if (n == 0)
649                 return;
650         while ((parentid = sch->parent)) {
651                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
652                         return;
653
654                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
655                 if (sch == NULL) {
656                         WARN_ON(parentid != TC_H_ROOT);
657                         return;
658                 }
659                 cops = sch->ops->cl_ops;
660                 if (cops->qlen_notify) {
661                         cl = cops->get(sch, parentid);
662                         cops->qlen_notify(sch, cl);
663                         cops->put(sch, cl);
664                 }
665                 sch->q.qlen -= n;
666         }
667 }
668 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
669
670 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
671                                struct nlmsghdr *n, u32 clid,
672                                struct Qdisc *old, struct Qdisc *new)
673 {
674         if (new || old)
675                 qdisc_notify(net, skb, n, clid, old, new);
676
677         if (old)
678                 qdisc_destroy(old);
679 }
680
681 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
682  * to device "dev" (the latter when "parent" is NULL).
683  *
684  * When appropriate send a netlink notification using 'skb'
685  * and "n".
686  *
687  * On success, destroy old qdisc.
     *
     * Returns 0 on success; -ENOENT when an ingress graft is requested on a
     * device without an ingress queue or when class "classid" is not found;
     * -EOPNOTSUPP when "parent" has no class ops or no ->graft(); otherwise
     * whatever error ->graft() reports.
688  */
689
690 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
691                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
692                        struct Qdisc *new, struct Qdisc *old)
693 {
694         struct Qdisc *q = old;
695         struct net *net = dev_net(dev);
696         int err = 0;
697
698         if (parent == NULL) { /* graft directly onto the device */
699                 unsigned int i, num_q, ingress;
700
701                 ingress = 0;
702                 num_q = dev->num_tx_queues;
                    /* An ingress qdisc (old or new) lives on the single ingress queue. */
703                 if ((q && q->flags & TCQ_F_INGRESS) ||
704                     (new && new->flags & TCQ_F_INGRESS)) {
705                         num_q = 1;
706                         ingress = 1;
707                         if (!dev_ingress_queue(dev))
708                                 return -ENOENT;
709                 }
710
711                 if (dev->flags & IFF_UP)
712                         dev_deactivate(dev);
713
                    /* A qdisc with its own ->attach() is attached through it; the per-queue loop below is then skipped. */
714                 if (new && new->ops->attach) {
715                         new->ops->attach(new);
716                         num_q = 0;
717                 }
718
719                 for (i = 0; i < num_q; i++) {
720                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
721
722                         if (!ingress)
723                                 dev_queue = netdev_get_tx_queue(dev, i);
724
725                         old = dev_graft_qdisc(dev_queue, new);
                            /* "new" is shared by all queues: one extra reference per queue after the first. */
726                         if (new && i > 0)
727                                 atomic_inc(&new->refcnt);
728
                            /* Egress: destroy the qdisc just replaced on this queue; the ingress one is destroyed after the notification below. */
729                         if (!ingress)
730                                 qdisc_destroy(old);
731                 }
732
733                 if (!ingress) {
734                         notify_and_destroy(net, skb, n, classid,
735                                            dev->qdisc, new);
                            /* dev->qdisc takes its own reference on "new" unless "new" attached itself via ->attach(). */
736                         if (new && !new->ops->attach)
737                                 atomic_inc(&new->refcnt);
738                         dev->qdisc = new ? : &noop_qdisc;
739                 } else {
740                         notify_and_destroy(net, skb, n, classid, old, new);
741                 }
742
743                 if (dev->flags & IFF_UP)
744                         dev_activate(dev);
745         } else { /* graft onto class "classid" of "parent" */
746                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
747
748                 err = -EOPNOTSUPP;
749                 if (cops && cops->graft) {
750                         unsigned long cl = cops->get(parent, classid);
751                         if (cl) {
752                                 err = cops->graft(parent, cl, new, &old);
753                                 cops->put(parent, cl);
754                         } else
755                                 err = -ENOENT;
756                 }
                    /* Notify and destroy the replaced qdisc only when the graft succeeded. */
757                 if (!err)
758                         notify_and_destroy(net, skb, n, classid, old, new);
759         }
760         return err;
761 }
762
763 /* lockdep annotation is needed for ingress; egress gets it only for name */
764 static struct lock_class_key qdisc_tx_lock; /* class set by qdisc_create() for non-ingress qdisc locks */
765 static struct lock_class_key qdisc_rx_lock; /* class set by qdisc_create() for ingress qdisc locks */
766
767 /*
768    Allocate and initialize new qdisc.
769
770    Parameters are passed via opt.

       The discipline is selected by tca[TCA_KIND].  "handle" may be 0, in
       which case one is allocated, or TC_H_INGRESS for an ingress qdisc.
       tca[TCA_OPTIONS] is handed to ops->init(); tca[TCA_STAB] and
       tca[TCA_RATE], when present, attach a size table and a rate estimator.

       Returns the new qdisc, or NULL with the error stored in *errp:
       -EAGAIN after a module was loaded (caller must replay the request),
       -ENOENT for an unknown kind, -ENOMEM when no handle is free,
       -EOPNOTSUPP for TCA_RATE on a TCQ_F_MQROOT qdisc, or the error from
       qdisc_alloc(), ops->init(), qdisc_get_stab() or gen_new_estimator().
771  */
772
773 static struct Qdisc *
774 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
775              struct Qdisc *p, u32 parent, u32 handle,
776              struct nlattr **tca, int *errp)
777 {
778         int err;
779         struct nlattr *kind = tca[TCA_KIND];
780         struct Qdisc *sch;
781         struct Qdisc_ops *ops;
782         struct qdisc_size_table *stab;
783
784         ops = qdisc_lookup_ops(kind); /* takes a module reference on success */
785 #ifdef CONFIG_MODULES
786         if (ops == NULL && kind != NULL) {
787                 char name[IFNAMSIZ];
788                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
789                         /* We dropped the RTNL semaphore in order to
790                          * perform the module load.  So, even if we
791                          * succeeded in loading the module we have to
792                          * tell the caller to replay the request.  We
793                          * indicate this using -EAGAIN.
794                          * We replay the request because the device may
795                          * go away in the mean time.
796                          */
797                         rtnl_unlock();
798                         request_module("sch_%s", name);
799                         rtnl_lock();
800                         ops = qdisc_lookup_ops(kind);
801                         if (ops != NULL) {
802                                 /* We will try again qdisc_lookup_ops,
803                                  * so don't keep a reference.
804                                  */
805                                 module_put(ops->owner);
806                                 err = -EAGAIN;
807                                 goto err_out;
808                         }
809                 }
810         }
811 #endif
812
813         err = -ENOENT;
814         if (ops == NULL)
815                 goto err_out;
816
817         sch = qdisc_alloc(dev_queue, ops);
818         if (IS_ERR(sch)) {
819                 err = PTR_ERR(sch);
820                 goto err_out2;
821         }
822
823         sch->parent = parent;
824
825         if (handle == TC_H_INGRESS) {
826                 sch->flags |= TCQ_F_INGRESS;
827                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
828                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
829         } else {
830                 if (handle == 0) { /* no handle supplied: pick a free one */
831                         handle = qdisc_alloc_handle(dev);
832                         err = -ENOMEM;
833                         if (handle == 0)
834                                 goto err_out3;
835                 }
836                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
837                 if (!netif_is_multiqueue(dev))
838                         sch->flags |= TCQ_F_ONETXQUEUE;
839         }
840
841         sch->handle = handle;
842
            /* A failing ops->init() falls through to err_out3 below: ops->destroy() is not called on that path. */
843         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
844                 if (tca[TCA_STAB]) {
845                         stab = qdisc_get_stab(tca[TCA_STAB]);
846                         if (IS_ERR(stab)) {
847                                 err = PTR_ERR(stab);
848                                 goto err_out4;
849                         }
850                         rcu_assign_pointer(sch->stab, stab);
851                 }
852                 if (tca[TCA_RATE]) {
853                         spinlock_t *root_lock;
854
855                         err = -EOPNOTSUPP;
856                         if (sch->flags & TCQ_F_MQROOT)
857                                 goto err_out4;
858
                            /* Pick the lock handed to the estimator: the sleeping root's lock for a non-root, non-ingress qdisc whose parent is not TCQ_F_MQROOT, else the qdisc's own lock. */
859                         if ((sch->parent != TC_H_ROOT) &&
860                             !(sch->flags & TCQ_F_INGRESS) &&
861                             (!p || !(p->flags & TCQ_F_MQROOT)))
862                                 root_lock = qdisc_root_sleeping_lock(sch);
863                         else
864                                 root_lock = qdisc_lock(sch);
865
866                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
867                                                 root_lock, tca[TCA_RATE]);
868                         if (err)
869                                 goto err_out4;
870                 }
871
872                 qdisc_list_add(sch);
873
874                 return sch;
875         }
876 err_out3: /* release the device reference and free the qdisc memory */
877         dev_put(dev);
878         kfree((char *) sch - sch->padded);
879 err_out2: /* drop the module reference taken by qdisc_lookup_ops() */
880         module_put(ops->owner);
881 err_out: /* report the error to the caller */
882         *errp = err;
883         return NULL;
884
885 err_out4: /* failure after ops->init() succeeded (or was absent) */
886         /*
887          * Any broken qdiscs that would require a ops->reset() here?
888          * The qdisc was never in action so it shouldn't be necessary.
889          */
890         qdisc_put_stab(rtnl_dereference(sch->stab));
891         if (ops->destroy)
892                 ops->destroy(sch);
893         goto err_out3;
894 }
895
896 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
897 {
898         struct qdisc_size_table *ostab, *stab = NULL;
899         int err = 0;
900
901         if (tca[TCA_OPTIONS]) {
902                 if (sch->ops->change == NULL)
903                         return -EINVAL;
904                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
905                 if (err)
906                         return err;
907         }
908
909         if (tca[TCA_STAB]) {
910                 stab = qdisc_get_stab(tca[TCA_STAB]);
911                 if (IS_ERR(stab))
912                         return PTR_ERR(stab);
913         }
914
915         ostab = rtnl_dereference(sch->stab);
916         rcu_assign_pointer(sch->stab, stab);
917         qdisc_put_stab(ostab);
918
919         if (tca[TCA_RATE]) {
920                 /* NB: ignores errors from replace_estimator
921                    because change can't be undone. */
922                 if (sch->flags & TCQ_F_MQROOT)
923                         goto out;
924                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
925                                             qdisc_root_sleeping_lock(sch),
926                                             tca[TCA_RATE]);
927         }
928 out:
929         return 0;
930 }
931
932 struct check_loop_arg {
933         struct qdisc_walker     w;
934         struct Qdisc            *p;
935         int                     depth;
936 };
937
938 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
939
940 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
941 {
942         struct check_loop_arg   arg;
943
944         if (q->ops->cl_ops == NULL)
945                 return 0;
946
947         arg.w.stop = arg.w.skip = arg.w.count = 0;
948         arg.w.fn = check_loop_fn;
949         arg.depth = depth;
950         arg.p = p;
951         q->ops->cl_ops->walk(q, &arg.w);
952         return arg.w.stop ? -ELOOP : 0;
953 }
954
955 static int
956 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
957 {
958         struct Qdisc *leaf;
959         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
960         struct check_loop_arg *arg = (struct check_loop_arg *)w;
961
962         leaf = cops->leaf(q, cl);
963         if (leaf) {
964                 if (leaf == arg->p || arg->depth > 7)
965                         return -ELOOP;
966                 return check_loop(leaf, arg->p, arg->depth + 1);
967         }
968         return 0;
969 }
970
971 /*
972  * Delete/get qdisc.
973  */
974
975 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
976 {
977         struct net *net = sock_net(skb->sk);
978         struct tcmsg *tcm = nlmsg_data(n);
979         struct nlattr *tca[TCA_MAX + 1];
980         struct net_device *dev;
981         u32 clid;
982         struct Qdisc *q = NULL;
983         struct Qdisc *p = NULL;
984         int err;
985
986         if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
987                 return -EPERM;
988
989         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
990         if (err < 0)
991                 return err;
992
993         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
994         if (!dev)
995                 return -ENODEV;
996
997         clid = tcm->tcm_parent;
998         if (clid) {
999                 if (clid != TC_H_ROOT) {
1000                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1001                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1002                                 if (!p)
1003                                         return -ENOENT;
1004                                 q = qdisc_leaf(p, clid);
1005                         } else if (dev_ingress_queue(dev)) {
1006                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1007                         }
1008                 } else {
1009                         q = dev->qdisc;
1010                 }
1011                 if (!q)
1012                         return -ENOENT;
1013
1014                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1015                         return -EINVAL;
1016         } else {
1017                 q = qdisc_lookup(dev, tcm->tcm_handle);
1018                 if (!q)
1019                         return -ENOENT;
1020         }
1021
1022         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1023                 return -EINVAL;
1024
1025         if (n->nlmsg_type == RTM_DELQDISC) {
1026                 if (!clid)
1027                         return -EINVAL;
1028                 if (q->handle == 0)
1029                         return -ENOENT;
1030                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1031                 if (err != 0)
1032                         return err;
1033         } else {
1034                 qdisc_notify(net, skb, n, clid, NULL, q);
1035         }
1036         return 0;
1037 }
1038
1039 /*
1040  * Create/change qdisc.
1041  */
1042
1043 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1044 {
1045         struct net *net = sock_net(skb->sk);
1046         struct tcmsg *tcm;
1047         struct nlattr *tca[TCA_MAX + 1];
1048         struct net_device *dev;
1049         u32 clid;
1050         struct Qdisc *q, *p;
1051         int err;
1052
1053         if (!capable(CAP_NET_ADMIN))
1054                 return -EPERM;
1055
1056 replay:
1057         /* Reinit, just in case something touches this. */
1058         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1059         if (err < 0)
1060                 return err;
1061
1062         tcm = nlmsg_data(n);
1063         clid = tcm->tcm_parent;
1064         q = p = NULL;
1065
1066         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1067         if (!dev)
1068                 return -ENODEV;
1069
1070
1071         if (clid) {
1072                 if (clid != TC_H_ROOT) {
1073                         if (clid != TC_H_INGRESS) {
1074                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1075                                 if (!p)
1076                                         return -ENOENT;
1077                                 q = qdisc_leaf(p, clid);
1078                         } else if (dev_ingress_queue_create(dev)) {
1079                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1080                         }
1081                 } else {
1082                         q = dev->qdisc;
1083                 }
1084
1085                 /* It may be default qdisc, ignore it */
1086                 if (q && q->handle == 0)
1087                         q = NULL;
1088
1089                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1090                         if (tcm->tcm_handle) {
1091                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1092                                         return -EEXIST;
1093                                 if (TC_H_MIN(tcm->tcm_handle))
1094                                         return -EINVAL;
1095                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1096                                 if (!q)
1097                                         goto create_n_graft;
1098                                 if (n->nlmsg_flags & NLM_F_EXCL)
1099                                         return -EEXIST;
1100                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1101                                         return -EINVAL;
1102                                 if (q == p ||
1103                                     (p && check_loop(q, p, 0)))
1104                                         return -ELOOP;
1105                                 atomic_inc(&q->refcnt);
1106                                 goto graft;
1107                         } else {
1108                                 if (!q)
1109                                         goto create_n_graft;
1110
1111                                 /* This magic test requires explanation.
1112                                  *
1113                                  *   We know, that some child q is already
1114                                  *   attached to this parent and have choice:
1115                                  *   either to change it or to create/graft new one.
1116                                  *
1117                                  *   1. We are allowed to create/graft only
1118                                  *   if CREATE and REPLACE flags are set.
1119                                  *
1120                                  *   2. If EXCL is set, requestor wanted to say,
1121                                  *   that qdisc tcm_handle is not expected
1122                                  *   to exist, so that we choose create/graft too.
1123                                  *
1124                                  *   3. The last case is when no flags are set.
1125                                  *   Alas, it is sort of hole in API, we
1126                                  *   cannot decide what to do unambiguously.
1127                                  *   For now we select create/graft, if
1128                                  *   user gave KIND, which does not match existing.
1129                                  */
1130                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1131                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1132                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1133                                      (tca[TCA_KIND] &&
1134                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1135                                         goto create_n_graft;
1136                         }
1137                 }
1138         } else {
1139                 if (!tcm->tcm_handle)
1140                         return -EINVAL;
1141                 q = qdisc_lookup(dev, tcm->tcm_handle);
1142         }
1143
1144         /* Change qdisc parameters */
1145         if (q == NULL)
1146                 return -ENOENT;
1147         if (n->nlmsg_flags & NLM_F_EXCL)
1148                 return -EEXIST;
1149         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1150                 return -EINVAL;
1151         err = qdisc_change(q, tca);
1152         if (err == 0)
1153                 qdisc_notify(net, skb, n, clid, NULL, q);
1154         return err;
1155
1156 create_n_graft:
1157         if (!(n->nlmsg_flags & NLM_F_CREATE))
1158                 return -ENOENT;
1159         if (clid == TC_H_INGRESS) {
1160                 if (dev_ingress_queue(dev))
1161                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1162                                          tcm->tcm_parent, tcm->tcm_parent,
1163                                          tca, &err);
1164                 else
1165                         err = -ENOENT;
1166         } else {
1167                 struct netdev_queue *dev_queue;
1168
1169                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1170                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1171                 else if (p)
1172                         dev_queue = p->dev_queue;
1173                 else
1174                         dev_queue = netdev_get_tx_queue(dev, 0);
1175
1176                 q = qdisc_create(dev, dev_queue, p,
1177                                  tcm->tcm_parent, tcm->tcm_handle,
1178                                  tca, &err);
1179         }
1180         if (q == NULL) {
1181                 if (err == -EAGAIN)
1182                         goto replay;
1183                 return err;
1184         }
1185
1186 graft:
1187         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1188         if (err) {
1189                 if (q)
1190                         qdisc_destroy(q);
1191                 return err;
1192         }
1193
1194         return 0;
1195 }
1196
1197 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1198                          u32 portid, u32 seq, u16 flags, int event)
1199 {
1200         struct tcmsg *tcm;
1201         struct nlmsghdr  *nlh;
1202         unsigned char *b = skb_tail_pointer(skb);
1203         struct gnet_dump d;
1204         struct qdisc_size_table *stab;
1205
1206         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1207         if (!nlh)
1208                 goto out_nlmsg_trim;
1209         tcm = nlmsg_data(nlh);
1210         tcm->tcm_family = AF_UNSPEC;
1211         tcm->tcm__pad1 = 0;
1212         tcm->tcm__pad2 = 0;
1213         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1214         tcm->tcm_parent = clid;
1215         tcm->tcm_handle = q->handle;
1216         tcm->tcm_info = atomic_read(&q->refcnt);
1217         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1218                 goto nla_put_failure;
1219         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1220                 goto nla_put_failure;
1221         q->qstats.qlen = q->q.qlen;
1222
1223         stab = rtnl_dereference(q->stab);
1224         if (stab && qdisc_dump_stab(skb, stab) < 0)
1225                 goto nla_put_failure;
1226
1227         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1228                                          qdisc_root_sleeping_lock(q), &d) < 0)
1229                 goto nla_put_failure;
1230
1231         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1232                 goto nla_put_failure;
1233
1234         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1235             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1236             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1237                 goto nla_put_failure;
1238
1239         if (gnet_stats_finish_copy(&d) < 0)
1240                 goto nla_put_failure;
1241
1242         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1243         return skb->len;
1244
1245 out_nlmsg_trim:
1246 nla_put_failure:
1247         nlmsg_trim(skb, b);
1248         return -1;
1249 }
1250
1251 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1252 {
1253         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1254 }
1255
1256 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1257                         struct nlmsghdr *n, u32 clid,
1258                         struct Qdisc *old, struct Qdisc *new)
1259 {
1260         struct sk_buff *skb;
1261         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1262
1263         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1264         if (!skb)
1265                 return -ENOBUFS;
1266
1267         if (old && !tc_qdisc_dump_ignore(old)) {
1268                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1269                                   0, RTM_DELQDISC) < 0)
1270                         goto err_out;
1271         }
1272         if (new && !tc_qdisc_dump_ignore(new)) {
1273                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1274                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1275                         goto err_out;
1276         }
1277
1278         if (skb->len)
1279                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1280                                       n->nlmsg_flags & NLM_F_ECHO);
1281
1282 err_out:
1283         kfree_skb(skb);
1284         return -EINVAL;
1285 }
1286
1287 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1288                               struct netlink_callback *cb,
1289                               int *q_idx_p, int s_q_idx)
1290 {
1291         int ret = 0, q_idx = *q_idx_p;
1292         struct Qdisc *q;
1293
1294         if (!root)
1295                 return 0;
1296
1297         q = root;
1298         if (q_idx < s_q_idx) {
1299                 q_idx++;
1300         } else {
1301                 if (!tc_qdisc_dump_ignore(q) &&
1302                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1303                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1304                         goto done;
1305                 q_idx++;
1306         }
1307         list_for_each_entry(q, &root->list, list) {
1308                 if (q_idx < s_q_idx) {
1309                         q_idx++;
1310                         continue;
1311                 }
1312                 if (!tc_qdisc_dump_ignore(q) &&
1313                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1314                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1315                         goto done;
1316                 q_idx++;
1317         }
1318
1319 out:
1320         *q_idx_p = q_idx;
1321         return ret;
1322 done:
1323         ret = -1;
1324         goto out;
1325 }
1326
1327 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1328 {
1329         struct net *net = sock_net(skb->sk);
1330         int idx, q_idx;
1331         int s_idx, s_q_idx;
1332         struct net_device *dev;
1333
1334         s_idx = cb->args[0];
1335         s_q_idx = q_idx = cb->args[1];
1336
1337         rcu_read_lock();
1338         idx = 0;
1339         for_each_netdev_rcu(net, dev) {
1340                 struct netdev_queue *dev_queue;
1341
1342                 if (idx < s_idx)
1343                         goto cont;
1344                 if (idx > s_idx)
1345                         s_q_idx = 0;
1346                 q_idx = 0;
1347
1348                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1349                         goto done;
1350
1351                 dev_queue = dev_ingress_queue(dev);
1352                 if (dev_queue &&
1353                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1354                                        &q_idx, s_q_idx) < 0)
1355                         goto done;
1356
1357 cont:
1358                 idx++;
1359         }
1360
1361 done:
1362         rcu_read_unlock();
1363
1364         cb->args[0] = idx;
1365         cb->args[1] = q_idx;
1366
1367         return skb->len;
1368 }
1369
1370
1371
1372 /************************************************
1373  *      Traffic classes manipulation.           *
1374  ************************************************/
1375
1376
1377
1378 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1379 {
1380         struct net *net = sock_net(skb->sk);
1381         struct tcmsg *tcm = nlmsg_data(n);
1382         struct nlattr *tca[TCA_MAX + 1];
1383         struct net_device *dev;
1384         struct Qdisc *q = NULL;
1385         const struct Qdisc_class_ops *cops;
1386         unsigned long cl = 0;
1387         unsigned long new_cl;
1388         u32 portid;
1389         u32 clid;
1390         u32 qid;
1391         int err;
1392
1393         if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
1394                 return -EPERM;
1395
1396         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1397         if (err < 0)
1398                 return err;
1399
1400         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1401         if (!dev)
1402                 return -ENODEV;
1403
1404         /*
1405            parent == TC_H_UNSPEC - unspecified parent.
1406            parent == TC_H_ROOT   - class is root, which has no parent.
1407            parent == X:0         - parent is root class.
1408            parent == X:Y         - parent is a node in hierarchy.
1409            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1410
1411            handle == 0:0         - generate handle from kernel pool.
1412            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1413            handle == X:Y         - clear.
1414            handle == X:0         - root class.
1415          */
1416
1417         /* Step 1. Determine qdisc handle X:0 */
1418
1419         portid = tcm->tcm_parent;
1420         clid = tcm->tcm_handle;
1421         qid = TC_H_MAJ(clid);
1422
1423         if (portid != TC_H_ROOT) {
1424                 u32 qid1 = TC_H_MAJ(portid);
1425
1426                 if (qid && qid1) {
1427                         /* If both majors are known, they must be identical. */
1428                         if (qid != qid1)
1429                                 return -EINVAL;
1430                 } else if (qid1) {
1431                         qid = qid1;
1432                 } else if (qid == 0)
1433                         qid = dev->qdisc->handle;
1434
1435                 /* Now qid is genuine qdisc handle consistent
1436                  * both with parent and child.
1437                  *
1438                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1439                  */
1440                 if (portid)
1441                         portid = TC_H_MAKE(qid, portid);
1442         } else {
1443                 if (qid == 0)
1444                         qid = dev->qdisc->handle;
1445         }
1446
1447         /* OK. Locate qdisc */
1448         q = qdisc_lookup(dev, qid);
1449         if (!q)
1450                 return -ENOENT;
1451
1452         /* An check that it supports classes */
1453         cops = q->ops->cl_ops;
1454         if (cops == NULL)
1455                 return -EINVAL;
1456
1457         /* Now try to get class */
1458         if (clid == 0) {
1459                 if (portid == TC_H_ROOT)
1460                         clid = qid;
1461         } else
1462                 clid = TC_H_MAKE(qid, clid);
1463
1464         if (clid)
1465                 cl = cops->get(q, clid);
1466
1467         if (cl == 0) {
1468                 err = -ENOENT;
1469                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1470                     !(n->nlmsg_flags & NLM_F_CREATE))
1471                         goto out;
1472         } else {
1473                 switch (n->nlmsg_type) {
1474                 case RTM_NEWTCLASS:
1475                         err = -EEXIST;
1476                         if (n->nlmsg_flags & NLM_F_EXCL)
1477                                 goto out;
1478                         break;
1479                 case RTM_DELTCLASS:
1480                         err = -EOPNOTSUPP;
1481                         if (cops->delete)
1482                                 err = cops->delete(q, cl);
1483                         if (err == 0)
1484                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1485                         goto out;
1486                 case RTM_GETTCLASS:
1487                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1488                         goto out;
1489                 default:
1490                         err = -EINVAL;
1491                         goto out;
1492                 }
1493         }
1494
1495         new_cl = cl;
1496         err = -EOPNOTSUPP;
1497         if (cops->change)
1498                 err = cops->change(q, clid, portid, tca, &new_cl);
1499         if (err == 0)
1500                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1501
1502 out:
1503         if (cl)
1504                 cops->put(q, cl);
1505
1506         return err;
1507 }
1508
1509
1510 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1511                           unsigned long cl,
1512                           u32 portid, u32 seq, u16 flags, int event)
1513 {
1514         struct tcmsg *tcm;
1515         struct nlmsghdr  *nlh;
1516         unsigned char *b = skb_tail_pointer(skb);
1517         struct gnet_dump d;
1518         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1519
1520         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1521         if (!nlh)
1522                 goto out_nlmsg_trim;
1523         tcm = nlmsg_data(nlh);
1524         tcm->tcm_family = AF_UNSPEC;
1525         tcm->tcm__pad1 = 0;
1526         tcm->tcm__pad2 = 0;
1527         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1528         tcm->tcm_parent = q->handle;
1529         tcm->tcm_handle = q->handle;
1530         tcm->tcm_info = 0;
1531         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1532                 goto nla_put_failure;
1533         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1534                 goto nla_put_failure;
1535
1536         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1537                                          qdisc_root_sleeping_lock(q), &d) < 0)
1538                 goto nla_put_failure;
1539
1540         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1541                 goto nla_put_failure;
1542
1543         if (gnet_stats_finish_copy(&d) < 0)
1544                 goto nla_put_failure;
1545
1546         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1547         return skb->len;
1548
1549 out_nlmsg_trim:
1550 nla_put_failure:
1551         nlmsg_trim(skb, b);
1552         return -1;
1553 }
1554
1555 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1556                          struct nlmsghdr *n, struct Qdisc *q,
1557                          unsigned long cl, int event)
1558 {
1559         struct sk_buff *skb;
1560         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1561
1562         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1563         if (!skb)
1564                 return -ENOBUFS;
1565
1566         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1567                 kfree_skb(skb);
1568                 return -EINVAL;
1569         }
1570
1571         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1572                               n->nlmsg_flags & NLM_F_ECHO);
1573 }
1574
1575 struct qdisc_dump_args {
1576         struct qdisc_walker     w;
1577         struct sk_buff          *skb;
1578         struct netlink_callback *cb;
1579 };
1580
1581 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1582 {
1583         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1584
1585         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1586                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1587 }
1588
1589 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1590                                 struct tcmsg *tcm, struct netlink_callback *cb,
1591                                 int *t_p, int s_t)
1592 {
1593         struct qdisc_dump_args arg;
1594
1595         if (tc_qdisc_dump_ignore(q) ||
1596             *t_p < s_t || !q->ops->cl_ops ||
1597             (tcm->tcm_parent &&
1598              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1599                 (*t_p)++;
1600                 return 0;
1601         }
1602         if (*t_p > s_t)
1603                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1604         arg.w.fn = qdisc_class_dump;
1605         arg.skb = skb;
1606         arg.cb = cb;
1607         arg.w.stop  = 0;
1608         arg.w.skip = cb->args[1];
1609         arg.w.count = 0;
1610         q->ops->cl_ops->walk(q, &arg.w);
1611         cb->args[1] = arg.w.count;
1612         if (arg.w.stop)
1613                 return -1;
1614         (*t_p)++;
1615         return 0;
1616 }
1617
1618 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1619                                struct tcmsg *tcm, struct netlink_callback *cb,
1620                                int *t_p, int s_t)
1621 {
1622         struct Qdisc *q;
1623
1624         if (!root)
1625                 return 0;
1626
1627         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1628                 return -1;
1629
1630         list_for_each_entry(q, &root->list, list) {
1631                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1632                         return -1;
1633         }
1634
1635         return 0;
1636 }
1637
1638 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1639 {
1640         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1641         struct net *net = sock_net(skb->sk);
1642         struct netdev_queue *dev_queue;
1643         struct net_device *dev;
1644         int t, s_t;
1645
1646         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1647                 return 0;
1648         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1649         if (!dev)
1650                 return 0;
1651
1652         s_t = cb->args[0];
1653         t = 0;
1654
1655         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1656                 goto done;
1657
1658         dev_queue = dev_ingress_queue(dev);
1659         if (dev_queue &&
1660             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1661                                 &t, s_t) < 0)
1662                 goto done;
1663
1664 done:
1665         cb->args[0] = t;
1666
1667         dev_put(dev);
1668         return skb->len;
1669 }
1670
1671 /* Main classifier routine: scans classifier chain attached
1672  * to this qdisc, (optionally) tests for protocol and asks
1673  * specific classifiers.
1674  */
1675 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1676                        struct tcf_result *res)
1677 {
1678         __be16 protocol = skb->protocol;
1679         int err;
1680
1681         for (; tp; tp = tp->next) {
1682                 if (tp->protocol != protocol &&
1683                     tp->protocol != htons(ETH_P_ALL))
1684                         continue;
1685                 err = tp->classify(skb, tp, res);
1686
1687                 if (err >= 0) {
1688 #ifdef CONFIG_NET_CLS_ACT
1689                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1690                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1691 #endif
1692                         return err;
1693                 }
1694         }
1695         return -1;
1696 }
1697 EXPORT_SYMBOL(tc_classify_compat);
1698
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is just tc_classify_compat().  With
 * it, a TC_ACT_RECLASSIFY result restarts classification from the head
 * of the chain; the number of restarts is tracked in the verdict bits
 * of skb->tc_verd, and once it reaches MAX_REC_LOOP a rate-limited
 * notice is logged and TC_ACT_SHOT is returned.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	for (;;) {
		int err = tc_classify_compat(skb, tp, res);
		u32 verd;

		if (err != TC_ACT_RECLASSIFY)
			return err;

		verd = G_TC_VERD(skb->tc_verd);
		if (verd++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1728
1729 void tcf_destroy(struct tcf_proto *tp)
1730 {
1731         tp->ops->destroy(tp);
1732         module_put(tp->ops->owner);
1733         kfree(tp);
1734 }
1735
1736 void tcf_destroy_chain(struct tcf_proto **fl)
1737 {
1738         struct tcf_proto *tp;
1739
1740         while ((tp = *fl) != NULL) {
1741                 *fl = tp->next;
1742                 tcf_destroy(tp);
1743         }
1744 }
1745 EXPORT_SYMBOL(tcf_destroy_chain);
1746
1747 #ifdef CONFIG_PROC_FS
1748 static int psched_show(struct seq_file *seq, void *v)
1749 {
1750         struct timespec ts;
1751
1752         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1753         seq_printf(seq, "%08x %08x %08x %08x\n",
1754                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1755                    1000000,
1756                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1757
1758         return 0;
1759 }
1760
1761 static int psched_open(struct inode *inode, struct file *file)
1762 {
1763         return single_open(file, psched_show, NULL);
1764 }
1765
1766 static const struct file_operations psched_fops = {
1767         .owner = THIS_MODULE,
1768         .open = psched_open,
1769         .read  = seq_read,
1770         .llseek = seq_lseek,
1771         .release = single_release,
1772 };
1773
1774 static int __net_init psched_net_init(struct net *net)
1775 {
1776         struct proc_dir_entry *e;
1777
1778         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1779         if (e == NULL)
1780                 return -ENOMEM;
1781
1782         return 0;
1783 }
1784
1785 static void __net_exit psched_net_exit(struct net *net)
1786 {
1787         remove_proc_entry("psched", net->proc_net);
1788 }
1789 #else
1790 static int __net_init psched_net_init(struct net *net)
1791 {
1792         return 0;
1793 }
1794
1795 static void __net_exit psched_net_exit(struct net *net)
1796 {
1797 }
1798 #endif
1799
1800 static struct pernet_operations psched_net_ops = {
1801         .init = psched_net_init,
1802         .exit = psched_net_exit,
1803 };
1804
1805 static int __init pktsched_init(void)
1806 {
1807         int err;
1808
1809         err = register_pernet_subsys(&psched_net_ops);
1810         if (err) {
1811                 pr_err("pktsched_init: "
1812                        "cannot initialize per netns operations\n");
1813                 return err;
1814         }
1815
1816         register_qdisc(&pfifo_qdisc_ops);
1817         register_qdisc(&bfifo_qdisc_ops);
1818         register_qdisc(&pfifo_head_drop_qdisc_ops);
1819         register_qdisc(&mq_qdisc_ops);
1820
1821         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1822         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1823         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1824         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1825         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1826         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1827
1828         return 0;
1829 }
1830
1831 subsys_initcall(pktsched_init);