net/sched/sch_sfq.c

   1 /*
   2  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11
  12 #include <linux/module.h>
  13 #include <linux/types.h>
  14 #include <linux/kernel.h>
  15 #include <linux/jiffies.h>
  16 #include <linux/string.h>
  17 #include <linux/in.h>
  18 #include <linux/errno.h>
  19 #include <linux/init.h>
  20 #include <linux/skbuff.h>
  21 #include <linux/jhash.h>
  22 #include <linux/slab.h>
  23 #include <linux/vmalloc.h>
  24 #include <net/netlink.h>
  25 #include <net/pkt_sched.h>
  26 #include <net/pkt_cls.h>
  27 #include <net/red.h>
  28
  29
  30 /*      Stochastic Fairness Queuing algorithm.
  31         =======================================
  32
  33         Source:
  34         Paul E. McKenney "Stochastic Fairness Queuing",
  35         IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  36
  37         Paul E. McKenney "Stochastic Fairness Queuing",
  38         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  39
  40
  41         See also:
  42         M. Shreedhar and George Varghese "Efficient Fair
  43         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  44
  45
  46         This is not the thing that is usually called (W)FQ nowadays.
  47         It does not use any timestamp mechanism, but instead
  48         processes queues in round-robin order.
  49
  50         ADVANTAGE:
  51
  52         - It is very cheap. Both CPU and memory requirements are minimal.
  53
  54         DRAWBACKS:
  55
  56         - "Stochastic" -> It is not 100% fair.
  57         When hash collisions occur, several flows are considered as one.
  58
  59         - "Round-robin" -> It introduces larger delays than virtual clock
  60         based schemes, and should not be used for isolating interactive
  61         traffic from non-interactive. It means, that this scheduler
  62         should be used as leaf of CBQ or P3, which put interactive traffic
  63         to higher priority band.
  64
  65         We still need true WFQ for top level CSZ, but using WFQ
  66         for the best effort traffic is absolutely pointless:
  67         SFQ is superior for this purpose.
  68
  69         IMPLEMENTATION:
  70         This implementation limits :
  71         - maximal queue length per flow to 127 packets.
  72         - max mtu to 2^18-1;
  73         - max 65408 flows,
  74         - number of hash buckets to 65536.
  75
  76         It is easy to increase these values, but not in flight.  */
  77
  78 #define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
  79 #define SFQ_DEFAULT_FLOWS       128
  80 #define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
  81 #define SFQ_EMPTY_SLOT          0xffff
  82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
  83
  84 /* We use 16 bits to store allot, and want to handle packets up to 64K
  85  * Scale allot by 8 (1<<3) so that no overflow occurs.
  86  */
  87 #define SFQ_ALLOT_SHIFT         3
  88 #define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
  89
  90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
  91 typedef u16 sfq_index;
  92
  93 /*
  94  * We dont use pointers to save space.
  95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
  96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
  97  * are 'pointers' to dep[] array
  98  */
  99 struct sfq_head {
 100         sfq_index       next;
 101         sfq_index       prev;
 102 };
 103
 104 struct sfq_slot {
 105         struct sk_buff  *skblist_next;
 106         struct sk_buff  *skblist_prev;
 107         sfq_index       qlen; /* number of skbs in skblist */
 108         sfq_index       next; /* next slot in sfq RR chain */
 109         struct sfq_head dep; /* anchor in dep[] chains */
 110         unsigned short  hash; /* hash value (index in ht[]) */
 111         short           allot; /* credit for this slot */
 112
 113         unsigned int    backlog;
 114         struct red_vars vars;
 115 };
 116
 117 struct sfq_sched_data {
 118 /* frequently used fields */
 119         int             limit;          /* limit of total number of packets in this qdisc */
 120         unsigned int    divisor;        /* number of slots in hash table */
 121         u8              headdrop;
 122         u8              maxdepth;       /* limit of packets per flow */
 123
 124         u32             perturbation;
 125         u8              cur_depth;      /* depth of longest slot */
 126         u8              flags;
 127         unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 128         struct tcf_proto __rcu *filter_list;
 129         struct tcf_block *block;
 130         sfq_index       *ht;            /* Hash table ('divisor' slots) */
 131         struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */
 132
 133         struct red_parms *red_parms;
 134         struct tc_sfqred_stats stats;
 135         struct sfq_slot *tail;          /* current slot in round */
 136
 137         struct sfq_head dep[SFQ_MAX_DEPTH + 1];
 138                                         /* Linked lists of slots, indexed by depth
 139                                          * dep[0] : list of unused flows
 140                                          * dep[1] : list of flows with 1 packet
 141                                          * dep[X] : list of flows with X packets
 142                                          */
 143
 144         unsigned int    maxflows;       /* number of flows in flows array */
 145         int             perturb_period;
 146         unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
 147         struct timer_list perturb_timer;
 148 };
 149
 150 /*
 151  * sfq_head are either in a sfq_slot or in dep[] array
 152  */
 153 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 154 {
 155         if (val < SFQ_MAX_FLOWS)
 156                 return &q->slots[val].dep;
 157         return &q->dep[val - SFQ_MAX_FLOWS];
 158 }
 159
 160 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 161                              const struct sk_buff *skb)
 162 {
 163         return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
 164 }
 165
 166 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 167                                  int *qerr)
 168 {
 169         struct sfq_sched_data *q = qdisc_priv(sch);
 170         struct tcf_result res;
 171         struct tcf_proto *fl;
 172         int result;
 173
 174         if (TC_H_MAJ(skb->priority) == sch->handle &&
 175             TC_H_MIN(skb->priority) > 0 &&
 176             TC_H_MIN(skb->priority) <= q->divisor)
 177                 return TC_H_MIN(skb->priority);
 178
 179         fl = rcu_dereference_bh(q->filter_list);
 180         if (!fl)
 181                 return sfq_hash(q, skb) + 1;
 182
 183         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 184         result = tcf_classify(skb, fl, &res, false);
 185         if (result >= 0) {
 186 #ifdef CONFIG_NET_CLS_ACT
 187                 switch (result) {
 188                 case TC_ACT_STOLEN:
 189                 case TC_ACT_QUEUED:
 190                 case TC_ACT_TRAP:
 191                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 192                 case TC_ACT_SHOT:
 193                         return 0;
 194                 }
 195 #endif
 196                 if (TC_H_MIN(res.classid) <= q->divisor)
 197                         return TC_H_MIN(res.classid);
 198         }
 199         return 0;
 200 }
 201
 202 /*
 203  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 204  */
 205 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 206 {
 207         sfq_index p, n;
 208         struct sfq_slot *slot = &q->slots[x];
 209         int qlen = slot->qlen;
 210
 211         p = qlen + SFQ_MAX_FLOWS;
 212         n = q->dep[qlen].next;
 213
 214         slot->dep.next = n;
 215         slot->dep.prev = p;
 216
 217         q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 218         sfq_dep_head(q, n)->prev = x;
 219 }
 220
 221 #define sfq_unlink(q, x, n, p)                  \
 222         do {                                    \
 223                 n = q->slots[x].dep.next;       \
 224                 p = q->slots[x].dep.prev;       \
 225                 sfq_dep_head(q, p)->next = n;   \
 226                 sfq_dep_head(q, n)->prev = p;   \
 227         } while (0)
 228
 229
 230 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 231 {
 232         sfq_index p, n;
 233         int d;
 234
 235         sfq_unlink(q, x, n, p);
 236
 237         d = q->slots[x].qlen--;
 238         if (n == p && q->cur_depth == d)
 239                 q->cur_depth--;
 240         sfq_link(q, x);
 241 }
 242
 243 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 244 {
 245         sfq_index p, n;
 246         int d;
 247
 248         sfq_unlink(q, x, n, p);
 249
 250         d = ++q->slots[x].qlen;
 251         if (q->cur_depth < d)
 252                 q->cur_depth = d;
 253         sfq_link(q, x);
 254 }
 255
 256 /* helper functions : might be changed when/if skb use a standard list_head */
 257
 258 /* remove one skb from tail of slot queue */
 259 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 260 {
 261         struct sk_buff *skb = slot->skblist_prev;
 262
 263         slot->skblist_prev = skb->prev;
 264         skb->prev->next = (struct sk_buff *)slot;
 265         skb->next = skb->prev = NULL;
 266         return skb;
 267 }
 268
 269 /* remove one skb from head of slot queue */
 270 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 271 {
 272         struct sk_buff *skb = slot->skblist_next;
 273
 274         slot->skblist_next = skb->next;
 275         skb->next->prev = (struct sk_buff *)slot;
 276         skb->next = skb->prev = NULL;
 277         return skb;
 278 }
 279
 280 static inline void slot_queue_init(struct sfq_slot *slot)
 281 {
 282         memset(slot, 0, sizeof(*slot));
 283         slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 284 }
 285
 286 /* add skb to slot queue (tail add) */
 287 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 288 {
 289         skb->prev = slot->skblist_prev;
 290         skb->next = (struct sk_buff *)slot;
 291         slot->skblist_prev->next = skb;
 292         slot->skblist_prev = skb;
 293 }
 294
 295 static unsigned int sfq_drop(struct Qdisc *sch)
 296 {
 297         struct sfq_sched_data *q = qdisc_priv(sch);
 298         sfq_index x, d = q->cur_depth;
 299         struct sk_buff *skb;
 300         unsigned int len;
 301         struct sfq_slot *slot;
 302
 303         /* Queue is full! Find the longest slot and drop tail packet from it */
 304         if (d > 1) {
 305                 x = q->dep[d].next;
 306                 slot = &q->slots[x];
 307 drop:
 308                 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 309                 len = qdisc_pkt_len(skb);
 310                 slot->backlog -= len;
 311                 sfq_dec(q, x);
 312                 sch->q.qlen--;
 313                 qdisc_qstats_drop(sch);
 314                 qdisc_qstats_backlog_dec(sch, skb);
 315                 kfree_skb(skb);
 316                 return len;
 317         }
 318
 319         if (d == 1) {
 320                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 321                 x = q->tail->next;
 322                 slot = &q->slots[x];
 323                 q->tail->next = slot->next;
 324                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 325                 goto drop;
 326         }
 327
 328         return 0;
 329 }
 330
 331 /* Is ECN parameter configured */
 332 static int sfq_prob_mark(const struct sfq_sched_data *q)
 333 {
 334         return q->flags & TC_RED_ECN;
 335 }
 336
 337 /* Should packets over max threshold just be marked */
 338 static int sfq_hard_mark(const struct sfq_sched_data *q)
 339 {
 340         return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 341 }
 342
 343 static int sfq_headdrop(const struct sfq_sched_data *q)
 344 {
 345         return q->headdrop;
 346 }
 347
 348 static int
 349 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 350 {
 351         struct sfq_sched_data *q = qdisc_priv(sch);
 352         unsigned int hash, dropped;
 353         sfq_index x, qlen;
 354         struct sfq_slot *slot;
 355         int uninitialized_var(ret);
 356         struct sk_buff *head;
 357         int delta;
 358
 359         hash = sfq_classify(skb, sch, &ret);
 360         if (hash == 0) {
 361                 if (ret & __NET_XMIT_BYPASS)
 362                         qdisc_qstats_drop(sch);
 363                 kfree_skb(skb);
 364                 return ret;
 365         }
 366         hash--;
 367
 368         x = q->ht[hash];
 369         slot = &q->slots[x];
 370         if (x == SFQ_EMPTY_SLOT) {
 371                 x = q->dep[0].next; /* get a free slot */
 372                 if (x >= SFQ_MAX_FLOWS)
 373                         return qdisc_drop(skb, sch, to_free);
 374                 q->ht[hash] = x;
 375                 slot = &q->slots[x];
 376                 slot->hash = hash;
 377                 slot->backlog = 0; /* should already be 0 anyway... */
 378                 red_set_vars(&slot->vars);
 379                 goto enqueue;
 380         }
 381         if (q->red_parms) {
 382                 slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
 383                                                         &slot->vars,
 384                                                         slot->backlog);
 385                 switch (red_action(q->red_parms,
 386                                    &slot->vars,
 387                                    slot->vars.qavg)) {
 388                 case RED_DONT_MARK:
 389                         break;
 390
 391                 case RED_PROB_MARK:
 392                         qdisc_qstats_overlimit(sch);
 393                         if (sfq_prob_mark(q)) {
 394                                 /* We know we have at least one packet in queue */
 395                                 if (sfq_headdrop(q) &&
 396                                     INET_ECN_set_ce(slot->skblist_next)) {
 397                                         q->stats.prob_mark_head++;
 398                                         break;
 399                                 }
 400                                 if (INET_ECN_set_ce(skb)) {
 401                                         q->stats.prob_mark++;
 402                                         break;
 403                                 }
 404                         }
 405                         q->stats.prob_drop++;
 406                         goto congestion_drop;
 407
 408                 case RED_HARD_MARK:
 409                         qdisc_qstats_overlimit(sch);
 410                         if (sfq_hard_mark(q)) {
 411                                 /* We know we have at least one packet in queue */
 412                                 if (sfq_headdrop(q) &&
 413                                     INET_ECN_set_ce(slot->skblist_next)) {
 414                                         q->stats.forced_mark_head++;
 415                                         break;
 416                                 }
 417                                 if (INET_ECN_set_ce(skb)) {
 418                                         q->stats.forced_mark++;
 419                                         break;
 420                                 }
 421                         }
 422                         q->stats.forced_drop++;
 423                         goto congestion_drop;
 424                 }
 425         }
 426
 427         if (slot->qlen >= q->maxdepth) {
 428 congestion_drop:
 429                 if (!sfq_headdrop(q))
 430                         return qdisc_drop(skb, sch, to_free);
 431
 432                 /* We know we have at least one packet in queue */
 433                 head = slot_dequeue_head(slot);
 434                 delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 435                 sch->qstats.backlog -= delta;
 436                 slot->backlog -= delta;
 437                 qdisc_drop(head, sch, to_free);
 438
 439                 slot_queue_add(slot, skb);
 440                 qdisc_tree_reduce_backlog(sch, 0, delta);
 441                 return NET_XMIT_CN;
 442         }
 443
 444 enqueue:
 445         qdisc_qstats_backlog_inc(sch, skb);
 446         slot->backlog += qdisc_pkt_len(skb);
 447         slot_queue_add(slot, skb);
 448         sfq_inc(q, x);
 449         if (slot->qlen == 1) {          /* The flow is new */
 450                 if (q->tail == NULL) {  /* It is the first flow */
 451                         slot->next = x;
 452                 } else {
 453                         slot->next = q->tail->next;
 454                         q->tail->next = x;
 455                 }
 456                 /* We put this flow at the end of our flow list.
 457                  * This might sound unfair for a new flow to wait after old ones,
 458                  * but we could endup servicing new flows only, and freeze old ones.
 459                  */
 460                 q->tail = slot;
 461                 /* We could use a bigger initial quantum for new flows */
 462                 slot->allot = q->scaled_quantum;
 463         }
 464         if (++sch->q.qlen <= q->limit)
 465                 return NET_XMIT_SUCCESS;
 466
 467         qlen = slot->qlen;
 468         dropped = sfq_drop(sch);
 469         /* Return Congestion Notification only if we dropped a packet
 470          * from this flow.
 471          */
 472         if (qlen != slot->qlen) {
 473                 qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
 474                 return NET_XMIT_CN;
 475         }
 476
 477         /* As we dropped a packet, better let upper stack know this */
 478         qdisc_tree_reduce_backlog(sch, 1, dropped);
 479         return NET_XMIT_SUCCESS;
 480 }
 481
 482 static struct sk_buff *
 483 sfq_dequeue(struct Qdisc *sch)
 484 {
 485         struct sfq_sched_data *q = qdisc_priv(sch);
 486         struct sk_buff *skb;
 487         sfq_index a, next_a;
 488         struct sfq_slot *slot;
 489
 490         /* No active slots */
 491         if (q->tail == NULL)
 492                 return NULL;
 493
 494 next_slot:
 495         a = q->tail->next;
 496         slot = &q->slots[a];
 497         if (slot->allot <= 0) {
 498                 q->tail = slot;
 499                 slot->allot += q->scaled_quantum;
 500                 goto next_slot;
 501         }
 502         skb = slot_dequeue_head(slot);
 503         sfq_dec(q, a);
 504         qdisc_bstats_update(sch, skb);
 505         sch->q.qlen--;
 506         qdisc_qstats_backlog_dec(sch, skb);
 507         slot->backlog -= qdisc_pkt_len(skb);
 508         /* Is the slot empty? */
 509         if (slot->qlen == 0) {
 510                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 511                 next_a = slot->next;
 512                 if (a == next_a) {
 513                         q->tail = NULL; /* no more active slots */
 514                         return skb;
 515                 }
 516                 q->tail->next = next_a;
 517         } else {
 518                 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 519         }
 520         return skb;
 521 }
 522
 523 static void
 524 sfq_reset(struct Qdisc *sch)
 525 {
 526         struct sk_buff *skb;
 527
 528         while ((skb = sfq_dequeue(sch)) != NULL)
 529                 rtnl_kfree_skbs(skb, skb);
 530 }
 531
 532 /*
 533  * When q->perturbation is changed, we rehash all queued skbs
 534  * to avoid OOO (Out Of Order) effects.
 535  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 536  * counters.
 537  */
 538 static void sfq_rehash(struct Qdisc *sch)
 539 {
 540         struct sfq_sched_data *q = qdisc_priv(sch);
 541         struct sk_buff *skb;
 542         int i;
 543         struct sfq_slot *slot;
 544         struct sk_buff_head list;
 545         int dropped = 0;
 546         unsigned int drop_len = 0;
 547
 548         __skb_queue_head_init(&list);
 549
 550         for (i = 0; i < q->maxflows; i++) {
 551                 slot = &q->slots[i];
 552                 if (!slot->qlen)
 553                         continue;
 554                 while (slot->qlen) {
 555                         skb = slot_dequeue_head(slot);
 556                         sfq_dec(q, i);
 557                         __skb_queue_tail(&list, skb);
 558                 }
 559                 slot->backlog = 0;
 560                 red_set_vars(&slot->vars);
 561                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 562         }
 563         q->tail = NULL;
 564
 565         while ((skb = __skb_dequeue(&list)) != NULL) {
 566                 unsigned int hash = sfq_hash(q, skb);
 567                 sfq_index x = q->ht[hash];
 568
 569                 slot = &q->slots[x];
 570                 if (x == SFQ_EMPTY_SLOT) {
 571                         x = q->dep[0].next; /* get a free slot */
 572                         if (x >= SFQ_MAX_FLOWS) {
 573 drop:
 574                                 qdisc_qstats_backlog_dec(sch, skb);
 575                                 drop_len += qdisc_pkt_len(skb);
 576                                 kfree_skb(skb);
 577                                 dropped++;
 578                                 continue;
 579                         }
 580                         q->ht[hash] = x;
 581                         slot = &q->slots[x];
 582                         slot->hash = hash;
 583                 }
 584                 if (slot->qlen >= q->maxdepth)
 585                         goto drop;
 586                 slot_queue_add(slot, skb);
 587                 if (q->red_parms)
 588                         slot->vars.qavg = red_calc_qavg(q->red_parms,
 589                                                         &slot->vars,
 590                                                         slot->backlog);
 591                 slot->backlog += qdisc_pkt_len(skb);
 592                 sfq_inc(q, x);
 593                 if (slot->qlen == 1) {          /* The flow is new */
 594                         if (q->tail == NULL) {  /* It is the first flow */
 595                                 slot->next = x;
 596                         } else {
 597                                 slot->next = q->tail->next;
 598                                 q->tail->next = x;
 599                         }
 600                         q->tail = slot;
 601                         slot->allot = q->scaled_quantum;
 602                 }
 603         }
 604         sch->q.qlen -= dropped;
 605         qdisc_tree_reduce_backlog(sch, dropped, drop_len);
 606 }
 607
 608 static void sfq_perturbation(unsigned long arg)
 609 {
 610         struct Qdisc *sch = (struct Qdisc *)arg;
 611         struct sfq_sched_data *q = qdisc_priv(sch);
 612         spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 613
 614         spin_lock(root_lock);
 615         q->perturbation = prandom_u32();
 616         if (!q->filter_list && q->tail)
 617                 sfq_rehash(sch);
 618         spin_unlock(root_lock);
 619
 620         if (q->perturb_period)
 621                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 622 }
 623
 624 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 625 {
 626         struct sfq_sched_data *q = qdisc_priv(sch);
 627         struct tc_sfq_qopt *ctl = nla_data(opt);
 628         struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 629         unsigned int qlen, dropped = 0;
 630         struct red_parms *p = NULL;
 631
 632         if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 633                 return -EINVAL;
 634         if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 635                 ctl_v1 = nla_data(opt);
 636         if (ctl->divisor &&
 637             (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 638                 return -EINVAL;
 639         if (ctl_v1 && ctl_v1->qth_min) {
 640                 p = kmalloc(sizeof(*p), GFP_KERNEL);
 641                 if (!p)
 642                         return -ENOMEM;
 643         }
 644         sch_tree_lock(sch);
 645         if (ctl->quantum) {
 646                 q->quantum = ctl->quantum;
 647                 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 648         }
 649         q->perturb_period = ctl->perturb_period * HZ;
 650         if (ctl->flows)
 651                 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 652         if (ctl->divisor) {
 653                 q->divisor = ctl->divisor;
 654                 q->maxflows = min_t(u32, q->maxflows, q->divisor);
 655         }
 656         if (ctl_v1) {
 657                 if (ctl_v1->depth)
 658                         q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 659                 if (p) {
 660                         swap(q->red_parms, p);
 661                         red_set_parms(q->red_parms,
 662                                       ctl_v1->qth_min, ctl_v1->qth_max,
 663                                       ctl_v1->Wlog,
 664                                       ctl_v1->Plog, ctl_v1->Scell_log,
 665                                       NULL,
 666                                       ctl_v1->max_P);
 667                 }
 668                 q->flags = ctl_v1->flags;
 669                 q->headdrop = ctl_v1->headdrop;
 670         }
 671         if (ctl->limit) {
 672                 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 673                 q->maxflows = min_t(u32, q->maxflows, q->limit);
 674         }
 675
 676         qlen = sch->q.qlen;
 677         while (sch->q.qlen > q->limit)
 678                 dropped += sfq_drop(sch);
 679         qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 680
 681         del_timer(&q->perturb_timer);
 682         if (q->perturb_period) {
 683                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 684                 q->perturbation = prandom_u32();
 685         }
 686         sch_tree_unlock(sch);
 687         kfree(p);
 688         return 0;
 689 }
 690
 691 static void *sfq_alloc(size_t sz)
 692 {
 693         return  kvmalloc(sz, GFP_KERNEL);
 694 }
 695
 696 static void sfq_free(void *addr)
 697 {
 698         kvfree(addr);
 699 }
 700
 701 static void sfq_destroy(struct Qdisc *sch)
 702 {
 703         struct sfq_sched_data *q = qdisc_priv(sch);
 704
 705         tcf_block_put(q->block);
 706         q->perturb_period = 0;
 707         del_timer_sync(&q->perturb_timer);
 708         sfq_free(q->ht);
 709         sfq_free(q->slots);
 710         kfree(q->red_parms);
 711 }
 712
 713 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 714 {
 715         struct sfq_sched_data *q = qdisc_priv(sch);
 716         int i;
 717         int err;
 718
 719         err = tcf_block_get(&q->block, &q->filter_list);
 720         if (err)
 721                 return err;
 722
 723         setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
 724                                (unsigned long)sch);
 725
 726         for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 727                 q->dep[i].next = i + SFQ_MAX_FLOWS;
 728                 q->dep[i].prev = i + SFQ_MAX_FLOWS;
 729         }
 730
 731         q->limit = SFQ_MAX_DEPTH;
 732         q->maxdepth = SFQ_MAX_DEPTH;
 733         q->cur_depth = 0;
 734         q->tail = NULL;
 735         q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 736         q->maxflows = SFQ_DEFAULT_FLOWS;
 737         q->quantum = psched_mtu(qdisc_dev(sch));
 738         q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 739         q->perturb_period = 0;
 740         q->perturbation = prandom_u32();
 741
 742         if (opt) {
 743                 int err = sfq_change(sch, opt);
 744                 if (err)
 745                         return err;
 746         }
 747
 748         q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 749         q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 750         if (!q->ht || !q->slots) {
 751                 /* Note: sfq_destroy() will be called by our caller */
 752                 return -ENOMEM;
 753         }
 754
 755         for (i = 0; i < q->divisor; i++)
 756                 q->ht[i] = SFQ_EMPTY_SLOT;
 757
 758         for (i = 0; i < q->maxflows; i++) {
 759                 slot_queue_init(&q->slots[i]);
 760                 sfq_link(q, i);
 761         }
 762         if (q->limit >= 1)
 763                 sch->flags |= TCQ_F_CAN_BYPASS;
 764         else
 765                 sch->flags &= ~TCQ_F_CAN_BYPASS;
 766         return 0;
 767 }
 768
 769 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 770 {
 771         struct sfq_sched_data *q = qdisc_priv(sch);
 772         unsigned char *b = skb_tail_pointer(skb);
 773         struct tc_sfq_qopt_v1 opt;
 774         struct red_parms *p = q->red_parms;
 775
 776         memset(&opt, 0, sizeof(opt));
 777         opt.v0.quantum  = q->quantum;
 778         opt.v0.perturb_period = q->perturb_period / HZ;
 779         opt.v0.limit    = q->limit;
 780         opt.v0.divisor  = q->divisor;
 781         opt.v0.flows    = q->maxflows;
 782         opt.depth       = q->maxdepth;
 783         opt.headdrop    = q->headdrop;
 784
 785         if (p) {
 786                 opt.qth_min     = p->qth_min >> p->Wlog;
 787                 opt.qth_max     = p->qth_max >> p->Wlog;
 788                 opt.Wlog        = p->Wlog;
 789                 opt.Plog        = p->Plog;
 790                 opt.Scell_log   = p->Scell_log;
 791                 opt.max_P       = p->max_P;
 792         }
 793         memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 794         opt.flags       = q->flags;
 795
 796         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 797                 goto nla_put_failure;
 798
 799         return skb->len;
 800
 801 nla_put_failure:
 802         nlmsg_trim(skb, b);
 803         return -1;
 804 }
 805
 806 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 807 {
 808         return NULL;
 809 }
 810
 811 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 812 {
 813         return 0;
 814 }
 815
 816 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 817                               u32 classid)
 818 {
 819         /* we cannot bypass queue discipline anymore */
 820         sch->flags &= ~TCQ_F_CAN_BYPASS;
 821         return 0;
 822 }
 823
 824 static void sfq_put(struct Qdisc *q, unsigned long cl)
 825 {
 826 }
 827
 828 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
 829 {
 830         struct sfq_sched_data *q = qdisc_priv(sch);
 831
 832         if (cl)
 833                 return NULL;
 834         return q->block;
 835 }
 836
 837 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 838                           struct sk_buff *skb, struct tcmsg *tcm)
 839 {
 840         tcm->tcm_handle |= TC_H_MIN(cl);
 841         return 0;
 842 }
 843
 844 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 845                                 struct gnet_dump *d)
 846 {
 847         struct sfq_sched_data *q = qdisc_priv(sch);
 848         sfq_index idx = q->ht[cl - 1];
 849         struct gnet_stats_queue qs = { 0 };
 850         struct tc_sfq_xstats xstats = { 0 };
 851
 852         if (idx != SFQ_EMPTY_SLOT) {
 853                 const struct sfq_slot *slot = &q->slots[idx];
 854
 855                 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 856                 qs.qlen = slot->qlen;
 857                 qs.backlog = slot->backlog;
 858         }
 859         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
 860                 return -1;
 861         return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 862 }
 863
 864 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 865 {
 866         struct sfq_sched_data *q = qdisc_priv(sch);
 867         unsigned int i;
 868
 869         if (arg->stop)
 870                 return;
 871
 872         for (i = 0; i < q->divisor; i++) {
 873                 if (q->ht[i] == SFQ_EMPTY_SLOT ||
 874                     arg->count < arg->skip) {
 875                         arg->count++;
 876                         continue;
 877                 }
 878                 if (arg->fn(sch, i + 1, arg) < 0) {
 879                         arg->stop = 1;
 880                         break;
 881                 }
 882                 arg->count++;
 883         }
 884 }
 885
 886 static const struct Qdisc_class_ops sfq_class_ops = {
 887         .leaf           =       sfq_leaf,
 888         .get            =       sfq_get,
 889         .put            =       sfq_put,
 890         .tcf_block      =       sfq_tcf_block,
 891         .bind_tcf       =       sfq_bind,
 892         .unbind_tcf     =       sfq_put,
 893         .dump           =       sfq_dump_class,
 894         .dump_stats     =       sfq_dump_class_stats,
 895         .walk           =       sfq_walk,
 896 };
 897
 898 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 899         .cl_ops         =       &sfq_class_ops,
 900         .id             =       "sfq",
 901         .priv_size      =       sizeof(struct sfq_sched_data),
 902         .enqueue        =       sfq_enqueue,
 903         .dequeue        =       sfq_dequeue,
 904         .peek           =       qdisc_peek_dequeued,
 905         .init           =       sfq_init,
 906         .reset          =       sfq_reset,
 907         .destroy        =       sfq_destroy,
 908         .change         =       NULL,
 909         .dump           =       sfq_dump,
 910         .owner          =       THIS_MODULE,
 911 };
 912
 913 static int __init sfq_module_init(void)
 914 {
 915         return register_qdisc(&sfq_qdisc_ops);
 916 }
 917 static void __exit sfq_module_exit(void)
 918 {
 919         unregister_qdisc(&sfq_qdisc_ops);
 920 }
 921 module_init(sfq_module_init)
 922 module_exit(sfq_module_exit)
 923 MODULE_LICENSE("GPL");