1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/dst_metadata.h>
103 #include <net/pkt_sched.h>
104 #include <net/checksum.h>
105 #include <net/xfrm.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/module.h>
109 #include <linux/netpoll.h>
110 #include <linux/rcupdate.h>
111 #include <linux/delay.h>
112 #include <net/iw_handler.h>
113 #include <asm/current.h>
114 #include <linux/audit.h>
115 #include <linux/dmaengine.h>
116 #include <linux/err.h>
117 #include <linux/ctype.h>
118 #include <linux/if_arp.h>
119 #include <linux/if_vlan.h>
120 #include <linux/ip.h>
121 #include <net/ip.h>
122 #include <net/mpls.h>
123 #include <linux/ipv6.h>
124 #include <linux/in.h>
125 #include <linux/jhash.h>
126 #include <linux/random.h>
127 #include <trace/events/napi.h>
128 #include <trace/events/net.h>
129 #include <trace/events/skb.h>
130 #include <linux/pci.h>
131 #include <linux/inetdevice.h>
132 #include <linux/cpu_rmap.h>
133 #include <linux/static_key.h>
134 #include <linux/hashtable.h>
135 #include <linux/vmalloc.h>
136 #include <linux/if_macvlan.h>
137 #include <linux/errqueue.h>
138 #include <linux/hrtimer.h>
139 #include <linux/netfilter_ingress.h>
140
141 #include "net-sysfs.h"
142
143 /* Instead of increasing this, you should create a hash table. */
144 #define MAX_GRO_SKBS 8
145
146 /* This should be increased if a protocol with a bigger head is added. */
147 #define GRO_MAX_HEAD (MAX_HEADER + 128)
148
149 static DEFINE_SPINLOCK(ptype_lock);
150 static DEFINE_SPINLOCK(offload_lock);
151 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
152 struct list_head ptype_all __read_mostly;       /* Taps */
153 static struct list_head offload_base __read_mostly;
154
155 static int netif_rx_internal(struct sk_buff *skb);
156 static int call_netdevice_notifiers_info(unsigned long val,
157                                          struct net_device *dev,
158                                          struct netdev_notifier_info *info);
159
160 /*
161  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
162  * semaphore.
163  *
164  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
165  *
166  * Writers must hold the rtnl semaphore while they loop through the
167  * dev_base_head list, and hold dev_base_lock for writing when they do the
168  * actual updates.  This allows pure readers to access the list even
169  * while a writer is preparing to update it.
170  *
171  * To put it another way, dev_base_lock is held for writing only to
172  * protect against pure readers; the rtnl semaphore provides the
173  * protection against other writers.
174  *
175  * See, for example usages, register_netdevice() and
176  * unregister_netdevice(), which must be called with the rtnl
177  * semaphore held.
178  */
179 DEFINE_RWLOCK(dev_base_lock);
180 EXPORT_SYMBOL(dev_base_lock);
181
182 /* protects napi_hash addition/deletion and napi_gen_id */
183 static DEFINE_SPINLOCK(napi_hash_lock);
184
185 static unsigned int napi_gen_id;
186 static DEFINE_HASHTABLE(napi_hash, 8);
187
188 static seqcount_t devnet_rename_seq;
189
190 static inline void dev_base_seq_inc(struct net *net)
191 {
192         while (++net->dev_base_seq == 0);
193 }
194
195 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
196 {
197         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
198
199         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
200 }
201
202 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
203 {
204         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
205 }
206
207 static inline void rps_lock(struct softnet_data *sd)
208 {
209 #ifdef CONFIG_RPS
210         spin_lock(&sd->input_pkt_queue.lock);
211 #endif
212 }
213
214 static inline void rps_unlock(struct softnet_data *sd)
215 {
216 #ifdef CONFIG_RPS
217         spin_unlock(&sd->input_pkt_queue.lock);
218 #endif
219 }
220
221 /* Device list insertion */
222 static void list_netdevice(struct net_device *dev)
223 {
224         struct net *net = dev_net(dev);
225
226         ASSERT_RTNL();
227
228         write_lock_bh(&dev_base_lock);
229         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
230         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
231         hlist_add_head_rcu(&dev->index_hlist,
232                            dev_index_hash(net, dev->ifindex));
233         write_unlock_bh(&dev_base_lock);
234
235         dev_base_seq_inc(net);
236 }
237
238 /* Device list removal
239  * caller must respect an RCU grace period before freeing/reusing dev
240  */
241 static void unlist_netdevice(struct net_device *dev)
242 {
243         ASSERT_RTNL();
244
245         /* Unlink dev from the device chain */
246         write_lock_bh(&dev_base_lock);
247         list_del_rcu(&dev->dev_list);
248         hlist_del_rcu(&dev->name_hlist);
249         hlist_del_rcu(&dev->index_hlist);
250         write_unlock_bh(&dev_base_lock);
251
252         dev_base_seq_inc(dev_net(dev));
253 }
254
255 /*
256  *      Our notifier list
257  */
258
259 static RAW_NOTIFIER_HEAD(netdev_chain);
260
261 /*
262  *      Device drivers call our routines to queue packets here. We empty the
263  *      queue in the local softnet handler.
264  */
265
266 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
267 EXPORT_PER_CPU_SYMBOL(softnet_data);
268
269 #ifdef CONFIG_LOCKDEP
270 /*
271  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
272  * according to dev->type
273  */
274 static const unsigned short netdev_lock_type[] =
275         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
276          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
277          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
278          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
279          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
280          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
281          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
282          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
283          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
284          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
285          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
286          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
287          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
288          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
289          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
290
291 static const char *const netdev_lock_name[] =
292         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
293          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
294          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
295          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
296          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
297          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
298          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
299          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
300          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
301          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
302          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
303          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
304          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
305          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
306          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
307
308 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
309 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
310
311 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
312 {
313         int i;
314
315         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
316                 if (netdev_lock_type[i] == dev_type)
317                         return i;
318         /* the last key is used by default */
319         return ARRAY_SIZE(netdev_lock_type) - 1;
320 }
321
322 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
323                                                  unsigned short dev_type)
324 {
325         int i;
326
327         i = netdev_lock_pos(dev_type);
328         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
329                                    netdev_lock_name[i]);
330 }
331
332 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
333 {
334         int i;
335
336         i = netdev_lock_pos(dev->type);
337         lockdep_set_class_and_name(&dev->addr_list_lock,
338                                    &netdev_addr_lock_key[i],
339                                    netdev_lock_name[i]);
340 }
341 #else
342 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
343                                                  unsigned short dev_type)
344 {
345 }
346 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
347 {
348 }
349 #endif
350
351 /*******************************************************************************
352
353                 Protocol management and registration routines
354
355 *******************************************************************************/
356
357 /*
358  *      Add a protocol ID to the list. Now that the input handler is
359  *      smarter we can dispense with all the messy stuff that used to be
360  *      here.
361  *
362  *      BEWARE!!! Protocol handlers that mangle input packets
363  *      MUST BE last in the hash buckets, and checking of protocol handlers
364  *      MUST start from the promiscuous ptype_all chain in net_bh.
365  *      This is true now; do not change it.
366  *      Explanation follows: if a protocol handler that mangles packets
367  *      were first on the list, it could not sense that the packet
368  *      is cloned and should be copied-on-write, so it would
369  *      change it and subsequent readers would get a broken packet.
370  *                                                      --ANK (980803)
371  */
372
373 static inline struct list_head *ptype_head(const struct packet_type *pt)
374 {
375         if (pt->type == htons(ETH_P_ALL))
376                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
377         else
378                 return pt->dev ? &pt->dev->ptype_specific :
379                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
380 }
381
382 /**
383  *      dev_add_pack - add packet handler
384  *      @pt: packet type declaration
385  *
386  *      Add a protocol handler to the networking stack. The passed &packet_type
387  *      is linked into kernel lists and may not be freed until it has been
388  *      removed from the kernel lists.
389  *
390  *      This call does not sleep and therefore it cannot
391  *      guarantee that all CPUs in the middle of receiving packets
392  *      will see the new packet type (until the next received packet).
393  */
394
395 void dev_add_pack(struct packet_type *pt)
396 {
397         struct list_head *head = ptype_head(pt);
398
399         spin_lock(&ptype_lock);
400         list_add_rcu(&pt->list, head);
401         spin_unlock(&ptype_lock);
402 }
403 EXPORT_SYMBOL(dev_add_pack);
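
/*
 * Example (editor's sketch, not part of the original file): a minimal tap
 * that sees every frame on every device.  The names my_tap_rcv and my_tap
 * are hypothetical.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		pr_debug("%s: %u byte frame\n", dev->name, skb->len);
 *		kfree_skb(skb);		// taps own their reference to the skb
 *		return 0;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type	= htons(ETH_P_ALL),	// or a specific ethertype
 *		.func	= my_tap_rcv,
 *	};
 *
 * dev_add_pack(&my_tap) would then be called at module init and
 * dev_remove_pack(&my_tap) at module exit.
 */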
404
405 /**
406  *      __dev_remove_pack        - remove packet handler
407  *      @pt: packet type declaration
408  *
409  *      Remove a protocol handler that was previously added to the kernel
410  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
411  *      from the kernel lists and can be freed or reused once this function
412  *      returns.
413  *
414  *      The packet type might still be in use by receivers
415  *      and must not be freed until after all the CPUs have gone
416  *      through a quiescent state.
417  */
418 void __dev_remove_pack(struct packet_type *pt)
419 {
420         struct list_head *head = ptype_head(pt);
421         struct packet_type *pt1;
422
423         spin_lock(&ptype_lock);
424
425         list_for_each_entry(pt1, head, list) {
426                 if (pt == pt1) {
427                         list_del_rcu(&pt->list);
428                         goto out;
429                 }
430         }
431
432         pr_warn("dev_remove_pack: %p not found\n", pt);
433 out:
434         spin_unlock(&ptype_lock);
435 }
436 EXPORT_SYMBOL(__dev_remove_pack);
437
438 /**
439  *      dev_remove_pack  - remove packet handler
440  *      @pt: packet type declaration
441  *
442  *      Remove a protocol handler that was previously added to the kernel
443  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
444  *      from the kernel lists and can be freed or reused once this function
445  *      returns.
446  *
447  *      This call sleeps to guarantee that no CPU is looking at the packet
448  *      type after return.
449  */
450 void dev_remove_pack(struct packet_type *pt)
451 {
452         __dev_remove_pack(pt);
453
454         synchronize_net();
455 }
456 EXPORT_SYMBOL(dev_remove_pack);
457
458
459 /**
460  *      dev_add_offload - register offload handlers
461  *      @po: protocol offload declaration
462  *
463  *      Add protocol offload handlers to the networking stack. The passed
464  *      &proto_offload is linked into kernel lists and may not be freed until
465  *      it has been removed from the kernel lists.
466  *
467  *      This call does not sleep and therefore it cannot
468  *      guarantee that all CPUs in the middle of receiving packets
469  *      will see the new offload handlers (until the next received packet).
470  */
471 void dev_add_offload(struct packet_offload *po)
472 {
473         struct packet_offload *elem;
474
475         spin_lock(&offload_lock);
476         list_for_each_entry(elem, &offload_base, list) {
477                 if (po->priority < elem->priority)
478                         break;
479         }
480         list_add_rcu(&po->list, elem->list.prev);
481         spin_unlock(&offload_lock);
482 }
483 EXPORT_SYMBOL(dev_add_offload);
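
/*
 * Example (editor's sketch, not from this file): the shape of a GSO/GRO
 * offload registration, modelled loosely on the IPv4 offloads.  The
 * my_gso_segment/my_gro_receive/my_gro_complete callbacks are hypothetical.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gso_segment	= my_gso_segment,
 *			.gro_receive	= my_gro_receive,
 *			.gro_complete	= my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_offload);	// typically from an __init function
 */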
484
485 /**
486  *      __dev_remove_offload     - remove offload handler
487  *      @po: packet offload declaration
488  *
489  *      Remove a protocol offload handler that was previously added to the
490  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
491  *      is removed from the kernel lists and can be freed or reused once this
492  *      function returns.
493  *
494  *      The packet type might still be in use by receivers
495  *      and must not be freed until after all the CPUs have gone
496  *      through a quiescent state.
497  */
498 static void __dev_remove_offload(struct packet_offload *po)
499 {
500         struct list_head *head = &offload_base;
501         struct packet_offload *po1;
502
503         spin_lock(&offload_lock);
504
505         list_for_each_entry(po1, head, list) {
506                 if (po == po1) {
507                         list_del_rcu(&po->list);
508                         goto out;
509                 }
510         }
511
512         pr_warn("dev_remove_offload: %p not found\n", po);
513 out:
514         spin_unlock(&offload_lock);
515 }
516
517 /**
518  *      dev_remove_offload       - remove packet offload handler
519  *      @po: packet offload declaration
520  *
521  *      Remove a packet offload handler that was previously added to the kernel
522  *      offload handlers by dev_add_offload(). The passed &offload_type is
523  *      removed from the kernel lists and can be freed or reused once this
524  *      function returns.
525  *
526  *      This call sleeps to guarantee that no CPU is looking at the packet
527  *      type after return.
528  */
529 void dev_remove_offload(struct packet_offload *po)
530 {
531         __dev_remove_offload(po);
532
533         synchronize_net();
534 }
535 EXPORT_SYMBOL(dev_remove_offload);
536
537 /******************************************************************************
538
539                       Device Boot-time Settings Routines
540
541 *******************************************************************************/
542
543 /* Boot time configuration table */
544 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
545
546 /**
547  *      netdev_boot_setup_add   - add new setup entry
548  *      @name: name of the device
549  *      @map: configured settings for the device
550  *
551  *      Adds new setup entry to the dev_boot_setup list.  The function
552  *      returns 0 on error and 1 on success.  This is a generic routine
553  *      for all netdevices.
554  */
555 static int netdev_boot_setup_add(char *name, struct ifmap *map)
556 {
557         struct netdev_boot_setup *s;
558         int i;
559
560         s = dev_boot_setup;
561         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
562                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
563                         memset(s[i].name, 0, sizeof(s[i].name));
564                         strlcpy(s[i].name, name, IFNAMSIZ);
565                         memcpy(&s[i].map, map, sizeof(s[i].map));
566                         break;
567                 }
568         }
569
570         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
571 }
572
573 /**
574  *      netdev_boot_setup_check - check boot time settings
575  *      @dev: the netdevice
576  *
577  *      Check boot time settings for the device.
578  *      The found settings are set for the device to be used
579  *      later in the device probing.
580  *      Returns 0 if no settings are found, 1 if they are.
581  */
582 int netdev_boot_setup_check(struct net_device *dev)
583 {
584         struct netdev_boot_setup *s = dev_boot_setup;
585         int i;
586
587         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
588                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
589                     !strcmp(dev->name, s[i].name)) {
590                         dev->irq        = s[i].map.irq;
591                         dev->base_addr  = s[i].map.base_addr;
592                         dev->mem_start  = s[i].map.mem_start;
593                         dev->mem_end    = s[i].map.mem_end;
594                         return 1;
595                 }
596         }
597         return 0;
598 }
599 EXPORT_SYMBOL(netdev_boot_setup_check);
600
601
602 /**
603  *      netdev_boot_base        - get address from boot time settings
604  *      @prefix: prefix for network device
605  *      @unit: id for network device
606  *
607  *      Check boot time settings for the base address of the device.
608  *      The found settings are set for the device to be used
609  *      later in the device probing.
610  *      Returns 0 if no settings found.
611  */
612 unsigned long netdev_boot_base(const char *prefix, int unit)
613 {
614         const struct netdev_boot_setup *s = dev_boot_setup;
615         char name[IFNAMSIZ];
616         int i;
617
618         sprintf(name, "%s%d", prefix, unit);
619
620         /*
621          * If device already registered then return base of 1
622          * to indicate not to probe for this interface
623          */
624         if (__dev_get_by_name(&init_net, name))
625                 return 1;
626
627         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
628                 if (!strcmp(name, s[i].name))
629                         return s[i].map.base_addr;
630         return 0;
631 }
632
633 /*
634  * Saves settings configured at boot time for any netdevice.
635  */
636 int __init netdev_boot_setup(char *str)
637 {
638         int ints[5];
639         struct ifmap map;
640
641         str = get_options(str, ARRAY_SIZE(ints), ints);
642         if (!str || !*str)
643                 return 0;
644
645         /* Save settings */
646         memset(&map, 0, sizeof(map));
647         if (ints[0] > 0)
648                 map.irq = ints[1];
649         if (ints[0] > 1)
650                 map.base_addr = ints[2];
651         if (ints[0] > 2)
652                 map.mem_start = ints[3];
653         if (ints[0] > 3)
654                 map.mem_end = ints[4];
655
656         /* Add new entry to the list */
657         return netdev_boot_setup_add(str, &map);
658 }
659
660 __setup("netdev=", netdev_boot_setup);
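
/*
 * Example (editor's note, illustrative values): a kernel command line of
 *
 *	netdev=5,0x300,eth0
 *
 * is parsed above as irq=5 and base_addr=0x300 for the device that will be
 * named eth0; up to four integers (irq, base_addr, mem_start, mem_end) may
 * precede the name, matching the ints[] handling in netdev_boot_setup().
 */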
661
662 /*******************************************************************************
663
664                             Device Interface Subroutines
665
666 *******************************************************************************/
667
668 /**
669  *      dev_get_iflink  - get 'iflink' value of an interface
670  *      @dev: targeted interface
671  *
672  *      Indicates the ifindex the interface is linked to.
673  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
674  */
675
676 int dev_get_iflink(const struct net_device *dev)
677 {
678         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
679                 return dev->netdev_ops->ndo_get_iflink(dev);
680
681         return dev->ifindex;
682 }
683 EXPORT_SYMBOL(dev_get_iflink);
684
685 /**
686  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
687  *      @dev: targeted interface
688  *      @skb: The packet.
689  *
690  *      For better visibility of tunnel traffic, OVS needs to retrieve
691  *      egress tunnel information for a packet. The following API allows
692  *      the user to get this info.
693  */
694 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
695 {
696         struct ip_tunnel_info *info;
697
698         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
699                 return -EINVAL;
700
701         info = skb_tunnel_info_unclone(skb);
702         if (!info)
703                 return -ENOMEM;
704         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
705                 return -EINVAL;
706
707         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
708 }
709 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
710
711 /**
712  *      __dev_get_by_name       - find a device by its name
713  *      @net: the applicable net namespace
714  *      @name: name to find
715  *
716  *      Find an interface by name. Must be called under RTNL semaphore
717  *      or @dev_base_lock. If the name is found a pointer to the device
718  *      is returned. If the name is not found then %NULL is returned. The
719  *      reference counters are not incremented so the caller must be
720  *      careful with locks.
721  */
722
723 struct net_device *__dev_get_by_name(struct net *net, const char *name)
724 {
725         struct net_device *dev;
726         struct hlist_head *head = dev_name_hash(net, name);
727
728         hlist_for_each_entry(dev, head, name_hlist)
729                 if (!strncmp(dev->name, name, IFNAMSIZ))
730                         return dev;
731
732         return NULL;
733 }
734 EXPORT_SYMBOL(__dev_get_by_name);
735
736 /**
737  *      dev_get_by_name_rcu     - find a device by its name
738  *      @net: the applicable net namespace
739  *      @name: name to find
740  *
741  *      Find an interface by name.
742  *      If the name is found a pointer to the device is returned.
743  *      If the name is not found then %NULL is returned.
744  *      The reference counters are not incremented so the caller must be
745  *      careful with locks. The caller must hold RCU lock.
746  */
747
748 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
749 {
750         struct net_device *dev;
751         struct hlist_head *head = dev_name_hash(net, name);
752
753         hlist_for_each_entry_rcu(dev, head, name_hlist)
754                 if (!strncmp(dev->name, name, IFNAMSIZ))
755                         return dev;
756
757         return NULL;
758 }
759 EXPORT_SYMBOL(dev_get_by_name_rcu);
760
761 /**
762  *      dev_get_by_name         - find a device by its name
763  *      @net: the applicable net namespace
764  *      @name: name to find
765  *
766  *      Find an interface by name. This can be called from any
767  *      context and does its own locking. The returned handle has
768  *      the usage count incremented and the caller must use dev_put() to
769  *      release it when it is no longer needed. %NULL is returned if no
770  *      matching device is found.
771  */
772
773 struct net_device *dev_get_by_name(struct net *net, const char *name)
774 {
775         struct net_device *dev;
776
777         rcu_read_lock();
778         dev = dev_get_by_name_rcu(net, name);
779         if (dev)
780                 dev_hold(dev);
781         rcu_read_unlock();
782         return dev;
783 }
784 EXPORT_SYMBOL(dev_get_by_name);
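
/*
 * Example usage (editor's sketch; the device name is illustrative):
 *
 *	struct net_device *dev;
 *
 *	// refcounted lookup, safe to use outside any lock
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		netdev_info(dev, "mtu is %u\n", dev->mtu);
 *		dev_put(dev);		// drop the reference when done
 *	}
 *
 *	// lock-free variant; dev is only valid inside the RCU section
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");
 *	if (dev)
 *		netdev_info(dev, "found under RCU\n");
 *	rcu_read_unlock();
 */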
785
786 /**
787  *      __dev_get_by_index - find a device by its ifindex
788  *      @net: the applicable net namespace
789  *      @ifindex: index of device
790  *
791  *      Search for an interface by index. Returns a pointer to the device,
792  *      or %NULL if it is not found. The device has not
793  *      had its reference counter increased so the caller must be careful
794  *      about locking. The caller must hold either the RTNL semaphore
795  *      or @dev_base_lock.
796  */
797
798 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
799 {
800         struct net_device *dev;
801         struct hlist_head *head = dev_index_hash(net, ifindex);
802
803         hlist_for_each_entry(dev, head, index_hlist)
804                 if (dev->ifindex == ifindex)
805                         return dev;
806
807         return NULL;
808 }
809 EXPORT_SYMBOL(__dev_get_by_index);
810
811 /**
812  *      dev_get_by_index_rcu - find a device by its ifindex
813  *      @net: the applicable net namespace
814  *      @ifindex: index of device
815  *
816  *      Search for an interface by index. Returns a pointer to the device,
817  *      or %NULL if it is not found. The device has not
818  *      had its reference counter increased so the caller must be careful
819  *      about locking. The caller must hold RCU lock.
820  */
821
822 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
823 {
824         struct net_device *dev;
825         struct hlist_head *head = dev_index_hash(net, ifindex);
826
827         hlist_for_each_entry_rcu(dev, head, index_hlist)
828                 if (dev->ifindex == ifindex)
829                         return dev;
830
831         return NULL;
832 }
833 EXPORT_SYMBOL(dev_get_by_index_rcu);
834
835
836 /**
837  *      dev_get_by_index - find a device by its ifindex
838  *      @net: the applicable net namespace
839  *      @ifindex: index of device
840  *
841  *      Search for an interface by index. Returns a pointer to the device,
842  *      or NULL if it is not found. The device returned has
843  *      had a reference added and the pointer is safe until the user calls
844  *      dev_put to indicate they have finished with it.
845  */
846
847 struct net_device *dev_get_by_index(struct net *net, int ifindex)
848 {
849         struct net_device *dev;
850
851         rcu_read_lock();
852         dev = dev_get_by_index_rcu(net, ifindex);
853         if (dev)
854                 dev_hold(dev);
855         rcu_read_unlock();
856         return dev;
857 }
858 EXPORT_SYMBOL(dev_get_by_index);
859
860 /**
861  *      netdev_get_name - get a netdevice name, knowing its ifindex.
862  *      @net: network namespace
863  *      @name: a pointer to the buffer where the name will be stored.
864  *      @ifindex: the ifindex of the interface to get the name from.
865  *
866  *      The use of raw_seqcount_begin() and cond_resched() before
867  *      retrying is required as we want to give the writers a chance
868  *      to complete when CONFIG_PREEMPT is not set.
869  */
870 int netdev_get_name(struct net *net, char *name, int ifindex)
871 {
872         struct net_device *dev;
873         unsigned int seq;
874
875 retry:
876         seq = raw_seqcount_begin(&devnet_rename_seq);
877         rcu_read_lock();
878         dev = dev_get_by_index_rcu(net, ifindex);
879         if (!dev) {
880                 rcu_read_unlock();
881                 return -ENODEV;
882         }
883
884         strcpy(name, dev->name);
885         rcu_read_unlock();
886         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
887                 cond_resched();
888                 goto retry;
889         }
890
891         return 0;
892 }
893
894 /**
895  *      dev_getbyhwaddr_rcu - find a device by its hardware address
896  *      @net: the applicable net namespace
897  *      @type: media type of device
898  *      @ha: hardware address
899  *
900  *      Search for an interface by MAC address. Returns a pointer to the
901  *      device, or NULL if it is not found.
902  *      The caller must hold RCU or RTNL.
903  *      The returned device has not had its ref count increased
904  *      and the caller must therefore be careful about locking.
905  *
906  */
907
908 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
909                                        const char *ha)
910 {
911         struct net_device *dev;
912
913         for_each_netdev_rcu(net, dev)
914                 if (dev->type == type &&
915                     !memcmp(dev->dev_addr, ha, dev->addr_len))
916                         return dev;
917
918         return NULL;
919 }
920 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
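
/*
 * Example (editor's sketch): looking up an Ethernet device by MAC address.
 * "mac" stands for a caller-supplied buffer of dev->addr_len (6) bytes.
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(&init_net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		netdev_info(dev, "matches the hardware address\n");
 *	rcu_read_unlock();
 */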
921
922 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
923 {
924         struct net_device *dev;
925
926         ASSERT_RTNL();
927         for_each_netdev(net, dev)
928                 if (dev->type == type)
929                         return dev;
930
931         return NULL;
932 }
933 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
934
935 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
936 {
937         struct net_device *dev, *ret = NULL;
938
939         rcu_read_lock();
940         for_each_netdev_rcu(net, dev)
941                 if (dev->type == type) {
942                         dev_hold(dev);
943                         ret = dev;
944                         break;
945                 }
946         rcu_read_unlock();
947         return ret;
948 }
949 EXPORT_SYMBOL(dev_getfirstbyhwtype);
950
951 /**
952  *      __dev_get_by_flags - find any device with given flags
953  *      @net: the applicable net namespace
954  *      @if_flags: IFF_* values
955  *      @mask: bitmask of bits in if_flags to check
956  *
957  *      Search for any interface with the given flags. Returns a pointer to
958  *      the first matching device, or NULL if none is found. Must be called inside
959  *      rtnl_lock(), and result refcount is unchanged.
960  */
961
962 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
963                                       unsigned short mask)
964 {
965         struct net_device *dev, *ret;
966
967         ASSERT_RTNL();
968
969         ret = NULL;
970         for_each_netdev(net, dev) {
971                 if (((dev->flags ^ if_flags) & mask) == 0) {
972                         ret = dev;
973                         break;
974                 }
975         }
976         return ret;
977 }
978 EXPORT_SYMBOL(__dev_get_by_flags);
979
980 /**
981  *      dev_valid_name - check if name is okay for network device
982  *      @name: name string
983  *
984  *      Network device names need to be valid file names
985  *      to allow sysfs to work.  We also disallow any kind of
986  *      whitespace.
987  */
988 bool dev_valid_name(const char *name)
989 {
990         if (*name == '\0')
991                 return false;
992         if (strlen(name) >= IFNAMSIZ)
993                 return false;
994         if (!strcmp(name, ".") || !strcmp(name, ".."))
995                 return false;
996
997         while (*name) {
998                 if (*name == '/' || *name == ':' || isspace(*name))
999                         return false;
1000                 name++;
1001         }
1002         return true;
1003 }
1004 EXPORT_SYMBOL(dev_valid_name);
1005
1006 /**
1007  *      __dev_alloc_name - allocate a name for a device
1008  *      @net: network namespace to allocate the device name in
1009  *      @name: name format string
1010  *      @buf:  scratch buffer and result name string
1011  *
1012  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1013  *      id. It scans list of devices to build up a free map, then chooses
1014  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1015  *      while allocating the name and adding the device in order to avoid
1016  *      duplicates.
1017  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1018  *      Returns the number of the unit assigned or a negative errno code.
1019  */
1020
1021 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1022 {
1023         int i = 0;
1024         const char *p;
1025         const int max_netdevices = 8*PAGE_SIZE;
1026         unsigned long *inuse;
1027         struct net_device *d;
1028
1029         p = strnchr(name, IFNAMSIZ-1, '%');
1030         if (p) {
1031                 /*
1032                  * Verify the string as this thing may have come from
1033                  * the user.  There must be either one "%d" and no other "%"
1034                  * characters.
1035                  */
1036                 if (p[1] != 'd' || strchr(p + 2, '%'))
1037                         return -EINVAL;
1038
1039                 /* Use one page as a bit array of possible slots */
1040                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1041                 if (!inuse)
1042                         return -ENOMEM;
1043
1044                 for_each_netdev(net, d) {
1045                         if (!sscanf(d->name, name, &i))
1046                                 continue;
1047                         if (i < 0 || i >= max_netdevices)
1048                                 continue;
1049
1050                         /*  avoid cases where sscanf is not exact inverse of printf */
1051                         snprintf(buf, IFNAMSIZ, name, i);
1052                         if (!strncmp(buf, d->name, IFNAMSIZ))
1053                                 set_bit(i, inuse);
1054                 }
1055
1056                 i = find_first_zero_bit(inuse, max_netdevices);
1057                 free_page((unsigned long) inuse);
1058         }
1059
1060         if (buf != name)
1061                 snprintf(buf, IFNAMSIZ, name, i);
1062         if (!__dev_get_by_name(net, buf))
1063                 return i;
1064
1065         /* It is possible to run out of possible slots
1066          * when the name is long and there isn't enough space left
1067          * for the digits, or if all bits are used.
1068          */
1069         return -ENFILE;
1070 }
1071
1072 /**
1073  *      dev_alloc_name - allocate a name for a device
1074  *      @dev: device
1075  *      @name: name format string
1076  *
1077  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1078  *      id. It scans list of devices to build up a free map, then chooses
1079  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1080  *      while allocating the name and adding the device in order to avoid
1081  *      duplicates.
1082  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1083  *      Returns the number of the unit assigned or a negative errno code.
1084  */
1085
1086 int dev_alloc_name(struct net_device *dev, const char *name)
1087 {
1088         char buf[IFNAMSIZ];
1089         struct net *net;
1090         int ret;
1091
1092         BUG_ON(!dev_net(dev));
1093         net = dev_net(dev);
1094         ret = __dev_alloc_name(net, name, buf);
1095         if (ret >= 0)
1096                 strlcpy(dev->name, buf, IFNAMSIZ);
1097         return ret;
1098 }
1099 EXPORT_SYMBOL(dev_alloc_name);
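
/*
 * Example (editor's note): a caller that wants an automatically numbered
 * name typically does
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto fail;
 *
 * after which dev->name holds e.g. "dummy0" and err is the unit number
 * that was assigned.  ("dummy%d" is only an illustration of the format
 * string convention.)
 */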
1100
1101 static int dev_alloc_name_ns(struct net *net,
1102                              struct net_device *dev,
1103                              const char *name)
1104 {
1105         char buf[IFNAMSIZ];
1106         int ret;
1107
1108         ret = __dev_alloc_name(net, name, buf);
1109         if (ret >= 0)
1110                 strlcpy(dev->name, buf, IFNAMSIZ);
1111         return ret;
1112 }
1113
1114 static int dev_get_valid_name(struct net *net,
1115                               struct net_device *dev,
1116                               const char *name)
1117 {
1118         BUG_ON(!net);
1119
1120         if (!dev_valid_name(name))
1121                 return -EINVAL;
1122
1123         if (strchr(name, '%'))
1124                 return dev_alloc_name_ns(net, dev, name);
1125         else if (__dev_get_by_name(net, name))
1126                 return -EEXIST;
1127         else if (dev->name != name)
1128                 strlcpy(dev->name, name, IFNAMSIZ);
1129
1130         return 0;
1131 }
1132
1133 /**
1134  *      dev_change_name - change name of a device
1135  *      @dev: device
1136  *      @newname: name (or format string) must be at least IFNAMSIZ
1137  *
1138  *      Change the name of a device; a format string such as "eth%d"
1139  *      can be passed for wildcarding.
1140  */
1141 int dev_change_name(struct net_device *dev, const char *newname)
1142 {
1143         unsigned char old_assign_type;
1144         char oldname[IFNAMSIZ];
1145         int err = 0;
1146         int ret;
1147         struct net *net;
1148
1149         ASSERT_RTNL();
1150         BUG_ON(!dev_net(dev));
1151
1152         net = dev_net(dev);
1153         if (dev->flags & IFF_UP)
1154                 return -EBUSY;
1155
1156         write_seqcount_begin(&devnet_rename_seq);
1157
1158         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1159                 write_seqcount_end(&devnet_rename_seq);
1160                 return 0;
1161         }
1162
1163         memcpy(oldname, dev->name, IFNAMSIZ);
1164
1165         err = dev_get_valid_name(net, dev, newname);
1166         if (err < 0) {
1167                 write_seqcount_end(&devnet_rename_seq);
1168                 return err;
1169         }
1170
1171         if (oldname[0] && !strchr(oldname, '%'))
1172                 netdev_info(dev, "renamed from %s\n", oldname);
1173
1174         old_assign_type = dev->name_assign_type;
1175         dev->name_assign_type = NET_NAME_RENAMED;
1176
1177 rollback:
1178         ret = device_rename(&dev->dev, dev->name);
1179         if (ret) {
1180                 memcpy(dev->name, oldname, IFNAMSIZ);
1181                 dev->name_assign_type = old_assign_type;
1182                 write_seqcount_end(&devnet_rename_seq);
1183                 return ret;
1184         }
1185
1186         write_seqcount_end(&devnet_rename_seq);
1187
1188         netdev_adjacent_rename_links(dev, oldname);
1189
1190         write_lock_bh(&dev_base_lock);
1191         hlist_del_rcu(&dev->name_hlist);
1192         write_unlock_bh(&dev_base_lock);
1193
1194         synchronize_rcu();
1195
1196         write_lock_bh(&dev_base_lock);
1197         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1198         write_unlock_bh(&dev_base_lock);
1199
1200         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1201         ret = notifier_to_errno(ret);
1202
1203         if (ret) {
1204                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1205                 if (err >= 0) {
1206                         err = ret;
1207                         write_seqcount_begin(&devnet_rename_seq);
1208                         memcpy(dev->name, oldname, IFNAMSIZ);
1209                         memcpy(oldname, newname, IFNAMSIZ);
1210                         dev->name_assign_type = old_assign_type;
1211                         old_assign_type = NET_NAME_RENAMED;
1212                         goto rollback;
1213                 } else {
1214                         pr_err("%s: name change rollback failed: %d\n",
1215                                dev->name, ret);
1216                 }
1217         }
1218
1219         return err;
1220 }
1221
1222 /**
1223  *      dev_set_alias - change ifalias of a device
1224  *      @dev: device
1225  *      @alias: name up to IFALIASZ
1226  *      @len: limit of bytes to copy from info
1227  *
1228  *      Set ifalias for a device.
1229  */
1230 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1231 {
1232         char *new_ifalias;
1233
1234         ASSERT_RTNL();
1235
1236         if (len >= IFALIASZ)
1237                 return -EINVAL;
1238
1239         if (!len) {
1240                 kfree(dev->ifalias);
1241                 dev->ifalias = NULL;
1242                 return 0;
1243         }
1244
1245         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1246         if (!new_ifalias)
1247                 return -ENOMEM;
1248         dev->ifalias = new_ifalias;
1249
1250         strlcpy(dev->ifalias, alias, len+1);
1251         return len;
1252 }
1253
1254
1255 /**
1256  *      netdev_features_change - device changes features
1257  *      @dev: device to cause notification
1258  *
1259  *      Called to indicate a device has changed features.
1260  */
1261 void netdev_features_change(struct net_device *dev)
1262 {
1263         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1264 }
1265 EXPORT_SYMBOL(netdev_features_change);
1266
1267 /**
1268  *      netdev_state_change - device changes state
1269  *      @dev: device to cause notification
1270  *
1271  *      Called to indicate a device has changed state. This function calls
1272  *      the notifier chains for netdev_chain and sends a NEWLINK message
1273  *      to the routing socket.
1274  */
1275 void netdev_state_change(struct net_device *dev)
1276 {
1277         if (dev->flags & IFF_UP) {
1278                 struct netdev_notifier_change_info change_info;
1279
1280                 change_info.flags_changed = 0;
1281                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1282                                               &change_info.info);
1283                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1284         }
1285 }
1286 EXPORT_SYMBOL(netdev_state_change);
1287
1288 /**
1289  *      netdev_notify_peers - notify network peers about existence of @dev
1290  *      @dev: network device
1291  *
1292  * Generate traffic such that interested network peers are aware of
1293  * @dev, such as by generating a gratuitous ARP. This may be used when
1294  * a device wants to inform the rest of the network about some sort of
1295  * reconfiguration such as a failover event or virtual machine
1296  * migration.
1297  */
1298 void netdev_notify_peers(struct net_device *dev)
1299 {
1300         rtnl_lock();
1301         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1302         rtnl_unlock();
1303 }
1304 EXPORT_SYMBOL(netdev_notify_peers);
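
/*
 * Example (editor's note): a virtualization or failover driver would call
 *
 *	netdev_notify_peers(dev);
 *
 * once e.g. a live migration has completed, so that switches relearn the
 * MAC address.  Note that the function takes the RTNL itself, so it must
 * not be called with the RTNL already held.
 */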
1305
1306 static int __dev_open(struct net_device *dev)
1307 {
1308         const struct net_device_ops *ops = dev->netdev_ops;
1309         int ret;
1310
1311         ASSERT_RTNL();
1312
1313         if (!netif_device_present(dev))
1314                 return -ENODEV;
1315
1316         /* Block netpoll from trying to do any rx path servicing.
1317          * If we don't do this there is a chance ndo_poll_controller
1318          * or ndo_poll may be running while we open the device
1319          */
1320         netpoll_poll_disable(dev);
1321
1322         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1323         ret = notifier_to_errno(ret);
1324         if (ret)
1325                 return ret;
1326
1327         set_bit(__LINK_STATE_START, &dev->state);
1328
1329         if (ops->ndo_validate_addr)
1330                 ret = ops->ndo_validate_addr(dev);
1331
1332         if (!ret && ops->ndo_open)
1333                 ret = ops->ndo_open(dev);
1334
1335         netpoll_poll_enable(dev);
1336
1337         if (ret)
1338                 clear_bit(__LINK_STATE_START, &dev->state);
1339         else {
1340                 dev->flags |= IFF_UP;
1341                 dev_set_rx_mode(dev);
1342                 dev_activate(dev);
1343                 add_device_randomness(dev->dev_addr, dev->addr_len);
1344         }
1345
1346         return ret;
1347 }
1348
1349 /**
1350  *      dev_open        - prepare an interface for use.
1351  *      @dev:   device to open
1352  *
1353  *      Takes a device from down to up state. The device's private open
1354  *      function is invoked and then the multicast lists are loaded. Finally
1355  *      the device is moved into the up state and a %NETDEV_UP message is
1356  *      sent to the netdev notifier chain.
1357  *
1358  *      Calling this function on an active interface is a nop. On a failure
1359  *      a negative errno code is returned.
1360  */
1361 int dev_open(struct net_device *dev)
1362 {
1363         int ret;
1364
1365         if (dev->flags & IFF_UP)
1366                 return 0;
1367
1368         ret = __dev_open(dev);
1369         if (ret < 0)
1370                 return ret;
1371
1372         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1373         call_netdevice_notifiers(NETDEV_UP, dev);
1374
1375         return ret;
1376 }
1377 EXPORT_SYMBOL(dev_open);
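
/*
 * Example (editor's sketch): bringing an interface up from kernel code.
 * dev_open() must be called with the RTNL held, as __dev_open() asserts.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *	if (err)
 *		netdev_err(dev, "failed to bring device up: %d\n", err);
 */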
1378
1379 static int __dev_close_many(struct list_head *head)
1380 {
1381         struct net_device *dev;
1382
1383         ASSERT_RTNL();
1384         might_sleep();
1385
1386         list_for_each_entry(dev, head, close_list) {
1387                 /* Temporarily disable netpoll until the interface is down */
1388                 netpoll_poll_disable(dev);
1389
1390                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1391
1392                 clear_bit(__LINK_STATE_START, &dev->state);
1393
1394                  * Synchronize to scheduled poll. We cannot touch poll list; it
1395                  * can even be on a different cpu. So just clear netif_running().
1396                  *
1397                  * dev->stop() will invoke napi_disable() on all of its
1398                  * napi_struct instances on this device.
1399                  */
1400                 smp_mb__after_atomic(); /* Commit netif_running(). */
1401         }
1402
1403         dev_deactivate_many(head);
1404
1405         list_for_each_entry(dev, head, close_list) {
1406                 const struct net_device_ops *ops = dev->netdev_ops;
1407
1408                 /*
1409                  *      Call the device-specific close. This cannot fail,
1410                  *      and is done only if the device is UP.
1411                  *
1412                  *      We allow it to be called even after a DETACH hot-plug
1413                  *      event.
1414                  */
1415                 if (ops->ndo_stop)
1416                         ops->ndo_stop(dev);
1417
1418                 dev->flags &= ~IFF_UP;
1419                 netpoll_poll_enable(dev);
1420         }
1421
1422         return 0;
1423 }
1424
1425 static int __dev_close(struct net_device *dev)
1426 {
1427         int retval;
1428         LIST_HEAD(single);
1429
1430         list_add(&dev->close_list, &single);
1431         retval = __dev_close_many(&single);
1432         list_del(&single);
1433
1434         return retval;
1435 }
1436
1437 int dev_close_many(struct list_head *head, bool unlink)
1438 {
1439         struct net_device *dev, *tmp;
1440
1441         /* Remove the devices that don't need to be closed */
1442         list_for_each_entry_safe(dev, tmp, head, close_list)
1443                 if (!(dev->flags & IFF_UP))
1444                         list_del_init(&dev->close_list);
1445
1446         __dev_close_many(head);
1447
1448         list_for_each_entry_safe(dev, tmp, head, close_list) {
1449                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1450                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1451                 if (unlink)
1452                         list_del_init(&dev->close_list);
1453         }
1454
1455         return 0;
1456 }
1457 EXPORT_SYMBOL(dev_close_many);
1458
1459 /**
1460  *      dev_close - shutdown an interface.
1461  *      @dev: device to shutdown
1462  *
1463  *      This function moves an active device into down state. A
1464  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1465  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1466  *      chain.
1467  */
1468 int dev_close(struct net_device *dev)
1469 {
1470         if (dev->flags & IFF_UP) {
1471                 LIST_HEAD(single);
1472
1473                 list_add(&dev->close_list, &single);
1474                 dev_close_many(&single, true);
1475                 list_del(&single);
1476         }
1477         return 0;
1478 }
1479 EXPORT_SYMBOL(dev_close);
1480
1481
1482 /**
1483  *      dev_disable_lro - disable Large Receive Offload on a device
1484  *      @dev: device
1485  *
1486  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1487  *      called under RTNL.  This is needed if received packets may be
1488  *      forwarded to another interface.
1489  */
1490 void dev_disable_lro(struct net_device *dev)
1491 {
1492         struct net_device *lower_dev;
1493         struct list_head *iter;
1494
1495         dev->wanted_features &= ~NETIF_F_LRO;
1496         netdev_update_features(dev);
1497
1498         if (unlikely(dev->features & NETIF_F_LRO))
1499                 netdev_WARN(dev, "failed to disable LRO!\n");
1500
1501         netdev_for_each_lower_dev(dev, lower_dev, iter)
1502                 dev_disable_lro(lower_dev);
1503 }
1504 EXPORT_SYMBOL(dev_disable_lro);
1505
1506 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1507                                    struct net_device *dev)
1508 {
1509         struct netdev_notifier_info info;
1510
1511         netdev_notifier_info_init(&info, dev);
1512         return nb->notifier_call(nb, val, &info);
1513 }
1514
1515 static int dev_boot_phase = 1;
1516
1517 /**
1518  *      register_netdevice_notifier - register a network notifier block
1519  *      @nb: notifier
1520  *
1521  *      Register a notifier to be called when network device events occur.
1522  *      The notifier passed is linked into the kernel structures and must
1523  *      not be reused until it has been unregistered. A negative errno code
1524  *      is returned on a failure.
1525  *
1526  *      When registered, all registration and up events are replayed
1527  *      to the new notifier so that it has a race-free
1528  *      view of the network device list.
1529  */
1530
1531 int register_netdevice_notifier(struct notifier_block *nb)
1532 {
1533         struct net_device *dev;
1534         struct net_device *last;
1535         struct net *net;
1536         int err;
1537
1538         rtnl_lock();
1539         err = raw_notifier_chain_register(&netdev_chain, nb);
1540         if (err)
1541                 goto unlock;
1542         if (dev_boot_phase)
1543                 goto unlock;
1544         for_each_net(net) {
1545                 for_each_netdev(net, dev) {
1546                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1547                         err = notifier_to_errno(err);
1548                         if (err)
1549                                 goto rollback;
1550
1551                         if (!(dev->flags & IFF_UP))
1552                                 continue;
1553
1554                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1555                 }
1556         }
1557
1558 unlock:
1559         rtnl_unlock();
1560         return err;
1561
1562 rollback:
1563         last = dev;
1564         for_each_net(net) {
1565                 for_each_netdev(net, dev) {
1566                         if (dev == last)
1567                                 goto outroll;
1568
1569                         if (dev->flags & IFF_UP) {
1570                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1571                                                         dev);
1572                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1573                         }
1574                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1575                 }
1576         }
1577
1578 outroll:
1579         raw_notifier_chain_unregister(&netdev_chain, nb);
1580         goto unlock;
1581 }
1582 EXPORT_SYMBOL(register_netdevice_notifier);
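
/*
 * Hypothetical sketch of a netdevice notifier user.  The callback,
 * module hooks and messages are invented for illustration; struct
 * notifier_block, register/unregister_netdevice_notifier() and
 * netdev_notifier_info_to_dev() are the real interfaces used above.
 * Note that NETDEV_REGISTER and NETDEV_UP are replayed for devices
 * that already exist when the notifier is registered.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("example: %s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("example: %s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

static int __init example_notifier_init(void)
{
	return register_netdevice_notifier(&example_netdev_nb);
}
module_init(example_notifier_init);

static void __exit example_notifier_exit(void)
{
	unregister_netdevice_notifier(&example_netdev_nb);
}
module_exit(example_notifier_exit);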
1583
1584 /**
1585  *      unregister_netdevice_notifier - unregister a network notifier block
1586  *      @nb: notifier
1587  *
1588  *      Unregister a notifier previously registered by
1589  *      register_netdevice_notifier(). The notifier is unlinked from the
1590  *      kernel structures and may then be reused. A negative errno code
1591  *      is returned on a failure.
1592  *
1593  *      After unregistering, unregister and down device events are synthesized
1594  *      for all devices on the device list and delivered to the removed notifier,
1595  *      removing the need for special-case cleanup code.
1596  */
1597
1598 int unregister_netdevice_notifier(struct notifier_block *nb)
1599 {
1600         struct net_device *dev;
1601         struct net *net;
1602         int err;
1603
1604         rtnl_lock();
1605         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1606         if (err)
1607                 goto unlock;
1608
1609         for_each_net(net) {
1610                 for_each_netdev(net, dev) {
1611                         if (dev->flags & IFF_UP) {
1612                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1613                                                         dev);
1614                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1615                         }
1616                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1617                 }
1618         }
1619 unlock:
1620         rtnl_unlock();
1621         return err;
1622 }
1623 EXPORT_SYMBOL(unregister_netdevice_notifier);
1624
1625 /**
1626  *      call_netdevice_notifiers_info - call all network notifier blocks
1627  *      @val: value passed unmodified to notifier function
1628  *      @dev: net_device pointer passed unmodified to notifier function
1629  *      @info: notifier information data
1630  *
1631  *      Call all network notifier blocks.  Parameters and return value
1632  *      are as for raw_notifier_call_chain().
1633  */
1634
1635 static int call_netdevice_notifiers_info(unsigned long val,
1636                                          struct net_device *dev,
1637                                          struct netdev_notifier_info *info)
1638 {
1639         ASSERT_RTNL();
1640         netdev_notifier_info_init(info, dev);
1641         return raw_notifier_call_chain(&netdev_chain, val, info);
1642 }
1643
1644 /**
1645  *      call_netdevice_notifiers - call all network notifier blocks
1646  *      @val: value passed unmodified to notifier function
1647  *      @dev: net_device pointer passed unmodified to notifier function
1648  *
1649  *      Call all network notifier blocks.  Parameters and return value
1650  *      are as for raw_notifier_call_chain().
1651  */
1652
1653 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1654 {
1655         struct netdev_notifier_info info;
1656
1657         return call_netdevice_notifiers_info(val, dev, &info);
1658 }
1659 EXPORT_SYMBOL(call_netdevice_notifiers);
1660
1661 #ifdef CONFIG_NET_INGRESS
1662 static struct static_key ingress_needed __read_mostly;
1663
1664 void net_inc_ingress_queue(void)
1665 {
1666         static_key_slow_inc(&ingress_needed);
1667 }
1668 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1669
1670 void net_dec_ingress_queue(void)
1671 {
1672         static_key_slow_dec(&ingress_needed);
1673 }
1674 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1675 #endif
1676
1677 static struct static_key netstamp_needed __read_mostly;
1678 #ifdef HAVE_JUMP_LABEL
1679 /* We are not allowed to call static_key_slow_dec() from irq context
1680  * If net_disable_timestamp() is called from irq context, defer the
1681  * static_key_slow_dec() calls.
1682  */
1683 static atomic_t netstamp_needed_deferred;
1684 #endif
1685
1686 void net_enable_timestamp(void)
1687 {
1688 #ifdef HAVE_JUMP_LABEL
1689         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1690
1691         if (deferred) {
1692                 while (--deferred)
1693                         static_key_slow_dec(&netstamp_needed);
1694                 return;
1695         }
1696 #endif
1697         static_key_slow_inc(&netstamp_needed);
1698 }
1699 EXPORT_SYMBOL(net_enable_timestamp);
1700
1701 void net_disable_timestamp(void)
1702 {
1703 #ifdef HAVE_JUMP_LABEL
1704         if (in_interrupt()) {
1705                 atomic_inc(&netstamp_needed_deferred);
1706                 return;
1707         }
1708 #endif
1709         static_key_slow_dec(&netstamp_needed);
1710 }
1711 EXPORT_SYMBOL(net_disable_timestamp);
1712
1713 static inline void net_timestamp_set(struct sk_buff *skb)
1714 {
1715         skb->tstamp.tv64 = 0;
1716         if (static_key_false(&netstamp_needed))
1717                 __net_timestamp(skb);
1718 }
1719
1720 #define net_timestamp_check(COND, SKB)                  \
1721         if (static_key_false(&netstamp_needed)) {               \
1722                 if ((COND) && !(SKB)->tstamp.tv64)      \
1723                         __net_timestamp(SKB);           \
1724         }                                               \
1725
1726 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1727 {
1728         unsigned int len;
1729
1730         if (!(dev->flags & IFF_UP))
1731                 return false;
1732
1733         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1734         if (skb->len <= len)
1735                 return true;
1736
1737         /* if TSO is enabled, we don't care about the length as the packet
1738          * could be forwarded without being segmented first
1739          */
1740         if (skb_is_gso(skb))
1741                 return true;
1742
1743         return false;
1744 }
1745 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1746
1747 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1748 {
1749         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1750             unlikely(!is_skb_forwardable(dev, skb))) {
1751                 atomic_long_inc(&dev->rx_dropped);
1752                 kfree_skb(skb);
1753                 return NET_RX_DROP;
1754         }
1755
1756         skb_scrub_packet(skb, true);
1757         skb->priority = 0;
1758         skb->protocol = eth_type_trans(skb, dev);
1759         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1760
1761         return 0;
1762 }
1763 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1764
1765 /**
1766  * dev_forward_skb - loopback an skb to another netif
1767  *
1768  * @dev: destination network device
1769  * @skb: buffer to forward
1770  *
1771  * return values:
1772  *      NET_RX_SUCCESS  (no congestion)
1773  *      NET_RX_DROP     (packet was dropped, but freed)
1774  *
1775  * dev_forward_skb can be used for injecting an skb from the
1776  * start_xmit function of one device into the receive queue
1777  * of another device.
1778  *
1779  * The receiving device may be in another namespace, so
1780  * we have to clear all information in the skb that could
1781  * impact namespace isolation.
1782  */
1783 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1784 {
1785         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1786 }
1787 EXPORT_SYMBOL_GPL(dev_forward_skb);
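
/*
 * Hypothetical sketch of the pattern described above: a virtual
 * device's ndo_start_xmit() injects the frame into a peer device's
 * receive path (essentially what a veth-like driver does).
 * "struct example_priv" and its "peer" field are assumptions made for
 * the example only.
 */
struct example_priv {
	struct net_device *peer;
};

static netdev_tx_t example_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	/* dev_forward_skb() scrubs the skb and either queues it on the
	 * peer (NET_RX_SUCCESS) or frees it (NET_RX_DROP).  Either way
	 * the skb is consumed, so NETDEV_TX_OK is always correct here.
	 */
	dev_forward_skb(priv->peer, skb);
	return NETDEV_TX_OK;
}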
1788
1789 static inline int deliver_skb(struct sk_buff *skb,
1790                               struct packet_type *pt_prev,
1791                               struct net_device *orig_dev)
1792 {
1793         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1794                 return -ENOMEM;
1795         atomic_inc(&skb->users);
1796         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1797 }
1798
1799 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1800                                           struct packet_type **pt,
1801                                           struct net_device *orig_dev,
1802                                           __be16 type,
1803                                           struct list_head *ptype_list)
1804 {
1805         struct packet_type *ptype, *pt_prev = *pt;
1806
1807         list_for_each_entry_rcu(ptype, ptype_list, list) {
1808                 if (ptype->type != type)
1809                         continue;
1810                 if (pt_prev)
1811                         deliver_skb(skb, pt_prev, orig_dev);
1812                 pt_prev = ptype;
1813         }
1814         *pt = pt_prev;
1815 }
1816
1817 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1818 {
1819         if (!ptype->af_packet_priv || !skb->sk)
1820                 return false;
1821
1822         if (ptype->id_match)
1823                 return ptype->id_match(ptype, skb->sk);
1824         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1825                 return true;
1826
1827         return false;
1828 }
1829
1830 /*
1831  *      Support routine. Sends outgoing frames to any network
1832  *      taps currently in use.
1833  */
1834
1835 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1836 {
1837         struct packet_type *ptype;
1838         struct sk_buff *skb2 = NULL;
1839         struct packet_type *pt_prev = NULL;
1840         struct list_head *ptype_list = &ptype_all;
1841
1842         rcu_read_lock();
1843 again:
1844         list_for_each_entry_rcu(ptype, ptype_list, list) {
1845                 /* Never send packets back to the socket
1846                  * they originated from - MvS (miquels@drinkel.ow.org)
1847                  */
1848                 if (skb_loop_sk(ptype, skb))
1849                         continue;
1850
1851                 if (pt_prev) {
1852                         deliver_skb(skb2, pt_prev, skb->dev);
1853                         pt_prev = ptype;
1854                         continue;
1855                 }
1856
1857                 /* need to clone skb, done only once */
1858                 skb2 = skb_clone(skb, GFP_ATOMIC);
1859                 if (!skb2)
1860                         goto out_unlock;
1861
1862                 net_timestamp_set(skb2);
1863
1864                 /* The network header should be correctly
1865                  * set by the sender, so the check below is
1866                  * just protection against buggy protocols.
1867                  */
1868                 skb_reset_mac_header(skb2);
1869
1870                 if (skb_network_header(skb2) < skb2->data ||
1871                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1872                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1873                                              ntohs(skb2->protocol),
1874                                              dev->name);
1875                         skb_reset_network_header(skb2);
1876                 }
1877
1878                 skb2->transport_header = skb2->network_header;
1879                 skb2->pkt_type = PACKET_OUTGOING;
1880                 pt_prev = ptype;
1881         }
1882
1883         if (ptype_list == &ptype_all) {
1884                 ptype_list = &dev->ptype_all;
1885                 goto again;
1886         }
1887 out_unlock:
1888         if (pt_prev)
1889                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1890         rcu_read_unlock();
1891 }
1892
1893 /**
1894  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1895  * @dev: Network device
1896  * @txq: number of queues available
1897  *
1898  * If real_num_tx_queues is changed the tc mappings may no longer be
1899  * valid. To resolve this verify that each tc mapping remains valid and,
1900  * if not, reset that priority's mapping to TC0. With no priorities
1901  * mapping to an offset/count pair it will no longer be used. In the
1902  * worst case, if TC0 itself is invalid nothing can be done, so priority
1903  * mappings are disabled entirely. It is expected that drivers will fix
1904  * this mapping if they can before calling netif_set_real_num_tx_queues.
1905  */
1906 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1907 {
1908         int i;
1909         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1910
1911         /* If TC0 is invalidated disable TC mapping */
1912         if (tc->offset + tc->count > txq) {
1913                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1914                 dev->num_tc = 0;
1915                 return;
1916         }
1917
1918         /* Invalidated prio to tc mappings set to TC0 */
1919         for (i = 1; i < TC_BITMASK + 1; i++) {
1920                 int q = netdev_get_prio_tc_map(dev, i);
1921
1922                 tc = &dev->tc_to_txq[q];
1923                 if (tc->offset + tc->count > txq) {
1924                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1925                                 i, q);
1926                         netdev_set_prio_tc_map(dev, i, 0);
1927                 }
1928         }
1929 }
1930
1931 #ifdef CONFIG_XPS
1932 static DEFINE_MUTEX(xps_map_mutex);
1933 #define xmap_dereference(P)             \
1934         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1935
1936 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1937                                         int cpu, u16 index)
1938 {
1939         struct xps_map *map = NULL;
1940         int pos;
1941
1942         if (dev_maps)
1943                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1944
1945         for (pos = 0; map && pos < map->len; pos++) {
1946                 if (map->queues[pos] == index) {
1947                         if (map->len > 1) {
1948                                 map->queues[pos] = map->queues[--map->len];
1949                         } else {
1950                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1951                                 kfree_rcu(map, rcu);
1952                                 map = NULL;
1953                         }
1954                         break;
1955                 }
1956         }
1957
1958         return map;
1959 }
1960
1961 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1962 {
1963         struct xps_dev_maps *dev_maps;
1964         int cpu, i;
1965         bool active = false;
1966
1967         mutex_lock(&xps_map_mutex);
1968         dev_maps = xmap_dereference(dev->xps_maps);
1969
1970         if (!dev_maps)
1971                 goto out_no_maps;
1972
1973         for_each_possible_cpu(cpu) {
1974                 for (i = index; i < dev->num_tx_queues; i++) {
1975                         if (!remove_xps_queue(dev_maps, cpu, i))
1976                                 break;
1977                 }
1978                 if (i == dev->num_tx_queues)
1979                         active = true;
1980         }
1981
1982         if (!active) {
1983                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1984                 kfree_rcu(dev_maps, rcu);
1985         }
1986
1987         for (i = index; i < dev->num_tx_queues; i++)
1988                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1989                                              NUMA_NO_NODE);
1990
1991 out_no_maps:
1992         mutex_unlock(&xps_map_mutex);
1993 }
1994
1995 static struct xps_map *expand_xps_map(struct xps_map *map,
1996                                       int cpu, u16 index)
1997 {
1998         struct xps_map *new_map;
1999         int alloc_len = XPS_MIN_MAP_ALLOC;
2000         int i, pos;
2001
2002         for (pos = 0; map && pos < map->len; pos++) {
2003                 if (map->queues[pos] != index)
2004                         continue;
2005                 return map;
2006         }
2007
2008         /* Need to add queue to this CPU's existing map */
2009         if (map) {
2010                 if (pos < map->alloc_len)
2011                         return map;
2012
2013                 alloc_len = map->alloc_len * 2;
2014         }
2015
2016         /* Need to allocate new map to store queue on this CPU's map */
2017         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2018                                cpu_to_node(cpu));
2019         if (!new_map)
2020                 return NULL;
2021
2022         for (i = 0; i < pos; i++)
2023                 new_map->queues[i] = map->queues[i];
2024         new_map->alloc_len = alloc_len;
2025         new_map->len = pos;
2026
2027         return new_map;
2028 }
2029
2030 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2031                         u16 index)
2032 {
2033         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2034         struct xps_map *map, *new_map;
2035         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2036         int cpu, numa_node_id = -2;
2037         bool active = false;
2038
2039         mutex_lock(&xps_map_mutex);
2040
2041         dev_maps = xmap_dereference(dev->xps_maps);
2042
2043         /* allocate memory for queue storage */
2044         for_each_online_cpu(cpu) {
2045                 if (!cpumask_test_cpu(cpu, mask))
2046                         continue;
2047
2048                 if (!new_dev_maps)
2049                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2050                 if (!new_dev_maps) {
2051                         mutex_unlock(&xps_map_mutex);
2052                         return -ENOMEM;
2053                 }
2054
2055                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2056                                  NULL;
2057
2058                 map = expand_xps_map(map, cpu, index);
2059                 if (!map)
2060                         goto error;
2061
2062                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2063         }
2064
2065         if (!new_dev_maps)
2066                 goto out_no_new_maps;
2067
2068         for_each_possible_cpu(cpu) {
2069                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2070                         /* add queue to CPU maps */
2071                         int pos = 0;
2072
2073                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2074                         while ((pos < map->len) && (map->queues[pos] != index))
2075                                 pos++;
2076
2077                         if (pos == map->len)
2078                                 map->queues[map->len++] = index;
2079 #ifdef CONFIG_NUMA
2080                         if (numa_node_id == -2)
2081                                 numa_node_id = cpu_to_node(cpu);
2082                         else if (numa_node_id != cpu_to_node(cpu))
2083                                 numa_node_id = -1;
2084 #endif
2085                 } else if (dev_maps) {
2086                         /* fill in the new device map from the old device map */
2087                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2088                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2089                 }
2090
2091         }
2092
2093         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2094
2095         /* Cleanup old maps */
2096         if (dev_maps) {
2097                 for_each_possible_cpu(cpu) {
2098                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2099                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2100                         if (map && map != new_map)
2101                                 kfree_rcu(map, rcu);
2102                 }
2103
2104                 kfree_rcu(dev_maps, rcu);
2105         }
2106
2107         dev_maps = new_dev_maps;
2108         active = true;
2109
2110 out_no_new_maps:
2111         /* update Tx queue numa node */
2112         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2113                                      (numa_node_id >= 0) ? numa_node_id :
2114                                      NUMA_NO_NODE);
2115
2116         if (!dev_maps)
2117                 goto out_no_maps;
2118
2119         /* removes queue from unused CPUs */
2120         for_each_possible_cpu(cpu) {
2121                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2122                         continue;
2123
2124                 if (remove_xps_queue(dev_maps, cpu, index))
2125                         active = true;
2126         }
2127
2128         /* free map if not active */
2129         if (!active) {
2130                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2131                 kfree_rcu(dev_maps, rcu);
2132         }
2133
2134 out_no_maps:
2135         mutex_unlock(&xps_map_mutex);
2136
2137         return 0;
2138 error:
2139         /* remove any maps that we added */
2140         for_each_possible_cpu(cpu) {
2141                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2142                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2143                                  NULL;
2144                 if (new_map && new_map != map)
2145                         kfree(new_map);
2146         }
2147
2148         mutex_unlock(&xps_map_mutex);
2149
2150         kfree(new_dev_maps);
2151         return -ENOMEM;
2152 }
2153 EXPORT_SYMBOL(netif_set_xps_queue);
2154
2155 #endif
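
/*
 * Hypothetical sketch of a multiqueue driver using
 * netif_set_xps_queue() above to pin each TX queue to one online CPU
 * in round-robin order.  The one-queue-per-CPU policy and the helper
 * name are assumptions for the example; a failure only loses the
 * affinity hint, so the return value is ignored here.
 */
static void example_setup_xps(struct net_device *dev)
{
	int cpu = cpumask_first(cpu_online_mask);
	u16 q;

	for (q = 0; q < dev->real_num_tx_queues; q++) {
		cpumask_t mask;

		cpumask_clear(&mask);
		cpumask_set_cpu(cpu, &mask);
		netif_set_xps_queue(dev, &mask, q);

		cpu = cpumask_next(cpu, cpu_online_mask);
		if (cpu >= nr_cpu_ids)
			cpu = cpumask_first(cpu_online_mask);
	}
}
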
2156 /*
2157  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2158  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2159  */
2160 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2161 {
2162         int rc;
2163
2164         if (txq < 1 || txq > dev->num_tx_queues)
2165                 return -EINVAL;
2166
2167         if (dev->reg_state == NETREG_REGISTERED ||
2168             dev->reg_state == NETREG_UNREGISTERING) {
2169                 ASSERT_RTNL();
2170
2171                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2172                                                   txq);
2173                 if (rc)
2174                         return rc;
2175
2176                 if (dev->num_tc)
2177                         netif_setup_tc(dev, txq);
2178
2179                 if (txq < dev->real_num_tx_queues) {
2180                         qdisc_reset_all_tx_gt(dev, txq);
2181 #ifdef CONFIG_XPS
2182                         netif_reset_xps_queues_gt(dev, txq);
2183 #endif
2184                 }
2185         }
2186
2187         dev->real_num_tx_queues = txq;
2188         return 0;
2189 }
2190 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2191
2192 #ifdef CONFIG_SYSFS
2193 /**
2194  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2195  *      @dev: Network device
2196  *      @rxq: Actual number of RX queues
2197  *
2198  *      This must be called either with the rtnl_lock held or before
2199  *      registration of the net device.  Returns 0 on success, or a
2200  *      negative error code.  If called before registration, it always
2201  *      succeeds.
2202  */
2203 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2204 {
2205         int rc;
2206
2207         if (rxq < 1 || rxq > dev->num_rx_queues)
2208                 return -EINVAL;
2209
2210         if (dev->reg_state == NETREG_REGISTERED) {
2211                 ASSERT_RTNL();
2212
2213                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2214                                                   rxq);
2215                 if (rc)
2216                         return rc;
2217         }
2218
2219         dev->real_num_rx_queues = rxq;
2220         return 0;
2221 }
2222 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2223 #endif
2224
2225 /**
2226  * netif_get_num_default_rss_queues - default number of RSS queues
2227  *
2228  * This routine should set an upper limit on the number of RSS queues
2229  * used by default by multiqueue devices.
2230  */
2231 int netif_get_num_default_rss_queues(void)
2232 {
2233         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2234 }
2235 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
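
/*
 * Hypothetical probe-time sketch tying the helpers above together:
 * cap the channel count with netif_get_num_default_rss_queues() and
 * then tell the stack how many TX/RX queues are really in use.
 * EXAMPLE_MAX_QUEUES and the assumption that the netdev was allocated
 * with at least that many queues are inventions for the example; when
 * called before register_netdev() these setters only validate the
 * range and cannot otherwise fail.
 */
#define EXAMPLE_MAX_QUEUES 16

static int example_init_queues(struct net_device *dev)
{
	unsigned int n = min_t(unsigned int, EXAMPLE_MAX_QUEUES,
			       netif_get_num_default_rss_queues());
	int err;

	err = netif_set_real_num_tx_queues(dev, n);
	if (err)
		return err;
	return netif_set_real_num_rx_queues(dev, n);
}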
2236
2237 static inline void __netif_reschedule(struct Qdisc *q)
2238 {
2239         struct softnet_data *sd;
2240         unsigned long flags;
2241
2242         local_irq_save(flags);
2243         sd = this_cpu_ptr(&softnet_data);
2244         q->next_sched = NULL;
2245         *sd->output_queue_tailp = q;
2246         sd->output_queue_tailp = &q->next_sched;
2247         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2248         local_irq_restore(flags);
2249 }
2250
2251 void __netif_schedule(struct Qdisc *q)
2252 {
2253         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2254                 __netif_reschedule(q);
2255 }
2256 EXPORT_SYMBOL(__netif_schedule);
2257
2258 struct dev_kfree_skb_cb {
2259         enum skb_free_reason reason;
2260 };
2261
2262 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2263 {
2264         return (struct dev_kfree_skb_cb *)skb->cb;
2265 }
2266
2267 void netif_schedule_queue(struct netdev_queue *txq)
2268 {
2269         rcu_read_lock();
2270         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2271                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2272
2273                 __netif_schedule(q);
2274         }
2275         rcu_read_unlock();
2276 }
2277 EXPORT_SYMBOL(netif_schedule_queue);
2278
2279 /**
2280  *      netif_wake_subqueue - allow sending packets on subqueue
2281  *      @dev: network device
2282  *      @queue_index: sub queue index
2283  *
2284  * Resume individual transmit queue of a device with multiple transmit queues.
2285  */
2286 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2287 {
2288         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2289
2290         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2291                 struct Qdisc *q;
2292
2293                 rcu_read_lock();
2294                 q = rcu_dereference(txq->qdisc);
2295                 __netif_schedule(q);
2296                 rcu_read_unlock();
2297         }
2298 }
2299 EXPORT_SYMBOL(netif_wake_subqueue);
2300
2301 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2302 {
2303         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2304                 struct Qdisc *q;
2305
2306                 rcu_read_lock();
2307                 q = rcu_dereference(dev_queue->qdisc);
2308                 __netif_schedule(q);
2309                 rcu_read_unlock();
2310         }
2311 }
2312 EXPORT_SYMBOL(netif_tx_wake_queue);
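
/*
 * Hypothetical sketch of the usual pairing for the wake helpers above:
 * a driver stops a TX queue when its hardware ring is nearly full and
 * wakes it from the TX-completion path once space is available again.
 * "struct example_ring" and its bookkeeping are invented;
 * netdev_get_tx_queue(), netif_tx_stop_queue(), netif_tx_queue_stopped()
 * and netif_tx_wake_queue() are the real interfaces.
 */
struct example_ring {
	struct net_device *dev;
	u16 queue_index;
	unsigned int free_slots;
};

static void example_tx_maybe_stop(struct example_ring *ring)
{
	if (ring->free_slots < MAX_SKB_FRAGS + 1)
		netif_tx_stop_queue(netdev_get_tx_queue(ring->dev,
							ring->queue_index));
}

static void example_tx_completion(struct example_ring *ring)
{
	struct netdev_queue *txq = netdev_get_tx_queue(ring->dev,
						       ring->queue_index);

	if (netif_tx_queue_stopped(txq) &&
	    ring->free_slots > MAX_SKB_FRAGS + 1)
		netif_tx_wake_queue(txq);
}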
2313
2314 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2315 {
2316         unsigned long flags;
2317
2318         if (likely(atomic_read(&skb->users) == 1)) {
2319                 smp_rmb();
2320                 atomic_set(&skb->users, 0);
2321         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2322                 return;
2323         }
2324         get_kfree_skb_cb(skb)->reason = reason;
2325         local_irq_save(flags);
2326         skb->next = __this_cpu_read(softnet_data.completion_queue);
2327         __this_cpu_write(softnet_data.completion_queue, skb);
2328         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2329         local_irq_restore(flags);
2330 }
2331 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2332
2333 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2334 {
2335         if (in_irq() || irqs_disabled())
2336                 __dev_kfree_skb_irq(skb, reason);
2337         else
2338                 dev_kfree_skb(skb);
2339 }
2340 EXPORT_SYMBOL(__dev_kfree_skb_any);
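
/*
 * Hypothetical sketch of why the *_any variants above exist: a
 * TX-completion handler that may run in hard-IRQ context as well as
 * in process/NAPI context.  dev_consume_skb_any() and
 * dev_kfree_skb_any() both funnel into __dev_kfree_skb_any(), which
 * defers the free to the softirq completion queue when IRQs are off.
 * The "sent" flag is an assumption made for the example.
 */
static void example_tx_clean_one(struct sk_buff *skb, bool sent)
{
	if (sent)
		dev_consume_skb_any(skb);	/* accounted as consumed */
	else
		dev_kfree_skb_any(skb);		/* accounted as dropped */
}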
2341
2342
2343 /**
2344  * netif_device_detach - mark device as removed
2345  * @dev: network device
2346  *
2347  * Mark the device as removed from the system and therefore no longer available.
2348  */
2349 void netif_device_detach(struct net_device *dev)
2350 {
2351         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2352             netif_running(dev)) {
2353                 netif_tx_stop_all_queues(dev);
2354         }
2355 }
2356 EXPORT_SYMBOL(netif_device_detach);
2357
2358 /**
2359  * netif_device_attach - mark device as attached
2360  * @dev: network device
2361  *
2362  * Mark the device as attached to the system and restart queues if needed.
2363  */
2364 void netif_device_attach(struct net_device *dev)
2365 {
2366         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2367             netif_running(dev)) {
2368                 netif_tx_wake_all_queues(dev);
2369                 __netdev_watchdog_up(dev);
2370         }
2371 }
2372 EXPORT_SYMBOL(netif_device_attach);
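
/*
 * Hypothetical suspend/resume sketch using the two helpers above.
 * The dev_pm_ops callbacks and the commented-out hardware hooks are
 * invented; the ordering (detach before touching the hardware, attach
 * after restoring it) is the pattern the helpers exist for.
 */
static int example_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);	/* stops all TX queues if running */
	/* example_hw_power_down(dev); -- hypothetical hardware hook */
	return 0;
}

static int example_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	/* example_hw_power_up(dev); -- hypothetical hardware hook */
	netif_device_attach(dev);	/* restarts queues and the watchdog */
	return 0;
}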
2373
2374 /*
2375  * Returns a Tx hash based on the given packet descriptor and the number of
2376  * Tx queues to be used as a distribution range.
2377  */
2378 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2379                   unsigned int num_tx_queues)
2380 {
2381         u32 hash;
2382         u16 qoffset = 0;
2383         u16 qcount = num_tx_queues;
2384
2385         if (skb_rx_queue_recorded(skb)) {
2386                 hash = skb_get_rx_queue(skb);
2387                 while (unlikely(hash >= num_tx_queues))
2388                         hash -= num_tx_queues;
2389                 return hash;
2390         }
2391
2392         if (dev->num_tc) {
2393                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2394                 qoffset = dev->tc_to_txq[tc].offset;
2395                 qcount = dev->tc_to_txq[tc].count;
2396         }
2397
2398         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2399 }
2400 EXPORT_SYMBOL(__skb_tx_hash);
2401
2402 static void skb_warn_bad_offload(const struct sk_buff *skb)
2403 {
2404         static const netdev_features_t null_features = 0;
2405         struct net_device *dev = skb->dev;
2406         const char *driver = "";
2407
2408         if (!net_ratelimit())
2409                 return;
2410
2411         if (dev && dev->dev.parent)
2412                 driver = dev_driver_string(dev->dev.parent);
2413
2414         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2415              "gso_type=%d ip_summed=%d\n",
2416              driver, dev ? &dev->features : &null_features,
2417              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2418              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2419              skb_shinfo(skb)->gso_type, skb->ip_summed);
2420 }
2421
2422 /*
2423  * Invalidate hardware checksum when packet is to be mangled, and
2424  * complete checksum manually on outgoing path.
2425  */
2426 int skb_checksum_help(struct sk_buff *skb)
2427 {
2428         __wsum csum;
2429         int ret = 0, offset;
2430
2431         if (skb->ip_summed == CHECKSUM_COMPLETE)
2432                 goto out_set_summed;
2433
2434         if (unlikely(skb_shinfo(skb)->gso_size)) {
2435                 skb_warn_bad_offload(skb);
2436                 return -EINVAL;
2437         }
2438
2439         /* Before computing a checksum, we should make sure no frag could
2440          * be modified by an external entity: the checksum could otherwise be wrong.
2441          */
2442         if (skb_has_shared_frag(skb)) {
2443                 ret = __skb_linearize(skb);
2444                 if (ret)
2445                         goto out;
2446         }
2447
2448         offset = skb_checksum_start_offset(skb);
2449         BUG_ON(offset >= skb_headlen(skb));
2450         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2451
2452         offset += skb->csum_offset;
2453         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2454
2455         if (skb_cloned(skb) &&
2456             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2457                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2458                 if (ret)
2459                         goto out;
2460         }
2461
2462         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2463 out_set_summed:
2464         skb->ip_summed = CHECKSUM_NONE;
2465 out:
2466         return ret;
2467 }
2468 EXPORT_SYMBOL(skb_checksum_help);
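
/*
 * Hypothetical sketch of the software fallback skb_checksum_help()
 * provides: a transmit path that completes CHECKSUM_PARTIAL packets
 * in software whenever the hardware cannot checksum that protocol.
 * The "hw_can_csum" predicate is an assumption for the example; the
 * core performs the same fallback in validate_xmit_skb() further down
 * in this file.
 */
static int example_tx_checksum(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
		return skb_checksum_help(skb);	/* leaves ip_summed as CHECKSUM_NONE */
	return 0;
}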
2469
2470 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2471 {
2472         __be16 type = skb->protocol;
2473
2474         /* Tunnel gso handlers can set protocol to ethernet. */
2475         if (type == htons(ETH_P_TEB)) {
2476                 struct ethhdr *eth;
2477
2478                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2479                         return 0;
2480
2481                 eth = (struct ethhdr *)skb_mac_header(skb);
2482                 type = eth->h_proto;
2483         }
2484
2485         return __vlan_get_protocol(skb, type, depth);
2486 }
2487
2488 /**
2489  *      skb_mac_gso_segment - mac layer segmentation handler.
2490  *      @skb: buffer to segment
2491  *      @features: features for the output path (see dev->features)
2492  */
2493 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2494                                     netdev_features_t features)
2495 {
2496         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2497         struct packet_offload *ptype;
2498         int vlan_depth = skb->mac_len;
2499         __be16 type = skb_network_protocol(skb, &vlan_depth);
2500
2501         if (unlikely(!type))
2502                 return ERR_PTR(-EINVAL);
2503
2504         __skb_pull(skb, vlan_depth);
2505
2506         rcu_read_lock();
2507         list_for_each_entry_rcu(ptype, &offload_base, list) {
2508                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2509                         segs = ptype->callbacks.gso_segment(skb, features);
2510                         break;
2511                 }
2512         }
2513         rcu_read_unlock();
2514
2515         __skb_push(skb, skb->data - skb_mac_header(skb));
2516
2517         return segs;
2518 }
2519 EXPORT_SYMBOL(skb_mac_gso_segment);
2520
2521
2522 /* openvswitch calls this on rx path, so we need a different check.
2523  */
2524 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2525 {
2526         if (tx_path)
2527                 return skb->ip_summed != CHECKSUM_PARTIAL;
2528         else
2529                 return skb->ip_summed == CHECKSUM_NONE;
2530 }
2531
2532 /**
2533  *      __skb_gso_segment - Perform segmentation on skb.
2534  *      @skb: buffer to segment
2535  *      @features: features for the output path (see dev->features)
2536  *      @tx_path: whether it is called in TX path
2537  *
2538  *      This function segments the given skb and returns a list of segments.
2539  *
2540  *      It may return NULL if the skb requires no segmentation.  This is
2541  *      only possible when GSO is used for verifying header integrity.
2542  */
2543 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2544                                   netdev_features_t features, bool tx_path)
2545 {
2546         if (unlikely(skb_needs_check(skb, tx_path))) {
2547                 int err;
2548
2549                 skb_warn_bad_offload(skb);
2550
2551                 err = skb_cow_head(skb, 0);
2552                 if (err < 0)
2553                         return ERR_PTR(err);
2554         }
2555
2556         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2557         SKB_GSO_CB(skb)->encap_level = 0;
2558
2559         skb_reset_mac_header(skb);
2560         skb_reset_mac_len(skb);
2561
2562         return skb_mac_gso_segment(skb, features);
2563 }
2564 EXPORT_SYMBOL(__skb_gso_segment);
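
/*
 * Hypothetical sketch of a software GSO fallback built on
 * skb_gso_segment(), the tx_path wrapper of __skb_gso_segment() above.
 * It only illustrates the return convention: an ERR_PTR on failure,
 * NULL when no segmentation was needed, otherwise a ->next linked list
 * of segments.  The per-segment transmit callback is an assumption;
 * the core does a variant of this in validate_xmit_skb() below.
 */
static int example_gso_xmit(struct sk_buff *skb, netdev_features_t features,
			    int (*xmit_one)(struct sk_buff *seg))
{
	struct sk_buff *segs = skb_gso_segment(skb, features);

	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return xmit_one(skb);		/* no segmentation required */

	consume_skb(skb);			/* original skb no longer needed */
	while (segs) {
		struct sk_buff *next = segs->next;

		segs->next = NULL;
		xmit_one(segs);
		segs = next;
	}
	return 0;
}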
2565
2566 /* Take action when hardware reception checksum errors are detected. */
2567 #ifdef CONFIG_BUG
2568 void netdev_rx_csum_fault(struct net_device *dev)
2569 {
2570         if (net_ratelimit()) {
2571                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2572                 dump_stack();
2573         }
2574 }
2575 EXPORT_SYMBOL(netdev_rx_csum_fault);
2576 #endif
2577
2578 /* Actually, we should eliminate this check as soon as we know that:
2579  * 1. An IOMMU is present and can map all of the memory.
2580  * 2. No high memory really exists on this machine.
2581  */
2582
2583 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2584 {
2585 #ifdef CONFIG_HIGHMEM
2586         int i;
2587         if (!(dev->features & NETIF_F_HIGHDMA)) {
2588                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2589                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2590                         if (PageHighMem(skb_frag_page(frag)))
2591                                 return 1;
2592                 }
2593         }
2594
2595         if (PCI_DMA_BUS_IS_PHYS) {
2596                 struct device *pdev = dev->dev.parent;
2597
2598                 if (!pdev)
2599                         return 0;
2600                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2601                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2602                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2603                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2604                                 return 1;
2605                 }
2606         }
2607 #endif
2608         return 0;
2609 }
2610
2611 /* If MPLS offload request, verify we are testing hardware MPLS features
2612  * instead of standard features for the netdev.
2613  */
2614 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2615 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2616                                            netdev_features_t features,
2617                                            __be16 type)
2618 {
2619         if (eth_p_mpls(type))
2620                 features &= skb->dev->mpls_features;
2621
2622         return features;
2623 }
2624 #else
2625 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2626                                            netdev_features_t features,
2627                                            __be16 type)
2628 {
2629         return features;
2630 }
2631 #endif
2632
2633 static netdev_features_t harmonize_features(struct sk_buff *skb,
2634         netdev_features_t features)
2635 {
2636         int tmp;
2637         __be16 type;
2638
2639         type = skb_network_protocol(skb, &tmp);
2640         features = net_mpls_features(skb, features, type);
2641
2642         if (skb->ip_summed != CHECKSUM_NONE &&
2643             !can_checksum_protocol(features, type)) {
2644                 features &= ~NETIF_F_ALL_CSUM;
2645         } else if (illegal_highdma(skb->dev, skb)) {
2646                 features &= ~NETIF_F_SG;
2647         }
2648
2649         return features;
2650 }
2651
2652 netdev_features_t passthru_features_check(struct sk_buff *skb,
2653                                           struct net_device *dev,
2654                                           netdev_features_t features)
2655 {
2656         return features;
2657 }
2658 EXPORT_SYMBOL(passthru_features_check);
2659
2660 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2661                                              struct net_device *dev,
2662                                              netdev_features_t features)
2663 {
2664         return vlan_features_check(skb, features);
2665 }
2666
2667 netdev_features_t netif_skb_features(struct sk_buff *skb)
2668 {
2669         struct net_device *dev = skb->dev;
2670         netdev_features_t features = dev->features;
2671         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2672
2673         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2674                 features &= ~NETIF_F_GSO_MASK;
2675
2676         /* If encapsulation offload request, verify we are testing
2677          * hardware encapsulation features instead of standard
2678          * features for the netdev
2679          */
2680         if (skb->encapsulation)
2681                 features &= dev->hw_enc_features;
2682
2683         if (skb_vlan_tagged(skb))
2684                 features = netdev_intersect_features(features,
2685                                                      dev->vlan_features |
2686                                                      NETIF_F_HW_VLAN_CTAG_TX |
2687                                                      NETIF_F_HW_VLAN_STAG_TX);
2688
2689         if (dev->netdev_ops->ndo_features_check)
2690                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2691                                                                 features);
2692         else
2693                 features &= dflt_features_check(skb, dev, features);
2694
2695         return harmonize_features(skb, features);
2696 }
2697 EXPORT_SYMBOL(netif_skb_features);
2698
2699 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2700                     struct netdev_queue *txq, bool more)
2701 {
2702         unsigned int len;
2703         int rc;
2704
2705         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2706                 dev_queue_xmit_nit(skb, dev);
2707
2708         len = skb->len;
2709         trace_net_dev_start_xmit(skb, dev);
2710         rc = netdev_start_xmit(skb, dev, txq, more);
2711         trace_net_dev_xmit(skb, rc, dev, len);
2712
2713         return rc;
2714 }
2715
2716 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2717                                     struct netdev_queue *txq, int *ret)
2718 {
2719         struct sk_buff *skb = first;
2720         int rc = NETDEV_TX_OK;
2721
2722         while (skb) {
2723                 struct sk_buff *next = skb->next;
2724
2725                 skb->next = NULL;
2726                 rc = xmit_one(skb, dev, txq, next != NULL);
2727                 if (unlikely(!dev_xmit_complete(rc))) {
2728                         skb->next = next;
2729                         goto out;
2730                 }
2731
2732                 skb = next;
2733                 if (netif_xmit_stopped(txq) && skb) {
2734                         rc = NETDEV_TX_BUSY;
2735                         break;
2736                 }
2737         }
2738
2739 out:
2740         *ret = rc;
2741         return skb;
2742 }
2743
2744 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2745                                           netdev_features_t features)
2746 {
2747         if (skb_vlan_tag_present(skb) &&
2748             !vlan_hw_offload_capable(features, skb->vlan_proto))
2749                 skb = __vlan_hwaccel_push_inside(skb);
2750         return skb;
2751 }
2752
2753 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2754 {
2755         netdev_features_t features;
2756
2757         if (skb->next)
2758                 return skb;
2759
2760         features = netif_skb_features(skb);
2761         skb = validate_xmit_vlan(skb, features);
2762         if (unlikely(!skb))
2763                 goto out_null;
2764
2765         if (netif_needs_gso(skb, features)) {
2766                 struct sk_buff *segs;
2767
2768                 segs = skb_gso_segment(skb, features);
2769                 if (IS_ERR(segs)) {
2770                         goto out_kfree_skb;
2771                 } else if (segs) {
2772                         consume_skb(skb);
2773                         skb = segs;
2774                 }
2775         } else {
2776                 if (skb_needs_linearize(skb, features) &&
2777                     __skb_linearize(skb))
2778                         goto out_kfree_skb;
2779
2780                 /* If packet is not checksummed and device does not
2781                  * support checksumming for this protocol, complete
2782                  * checksumming here.
2783                  */
2784                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2785                         if (skb->encapsulation)
2786                                 skb_set_inner_transport_header(skb,
2787                                                                skb_checksum_start_offset(skb));
2788                         else
2789                                 skb_set_transport_header(skb,
2790                                                          skb_checksum_start_offset(skb));
2791                         if (!(features & NETIF_F_ALL_CSUM) &&
2792                             skb_checksum_help(skb))
2793                                 goto out_kfree_skb;
2794                 }
2795         }
2796
2797         return skb;
2798
2799 out_kfree_skb:
2800         kfree_skb(skb);
2801 out_null:
2802         return NULL;
2803 }
2804
2805 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2806 {
2807         struct sk_buff *next, *head = NULL, *tail;
2808
2809         for (; skb != NULL; skb = next) {
2810                 next = skb->next;
2811                 skb->next = NULL;
2812
2813                 /* in case the skb won't be segmented, point prev to the skb itself */
2814                 skb->prev = skb;
2815
2816                 skb = validate_xmit_skb(skb, dev);
2817                 if (!skb)
2818                         continue;
2819
2820                 if (!head)
2821                         head = skb;
2822                 else
2823                         tail->next = skb;
2824                 /* If skb was segmented, skb->prev points to
2825                  * the last segment. If not, it still contains skb.
2826                  */
2827                 tail = skb->prev;
2828         }
2829         return head;
2830 }
2831
2832 static void qdisc_pkt_len_init(struct sk_buff *skb)
2833 {
2834         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2835
2836         qdisc_skb_cb(skb)->pkt_len = skb->len;
2837
2838         /* To get a more precise estimate of the bytes sent on the wire,
2839          * we add to pkt_len the header size of all segments
2840          */
2841         if (shinfo->gso_size)  {
2842                 unsigned int hdr_len;
2843                 u16 gso_segs = shinfo->gso_segs;
2844
2845                 /* mac layer + network layer */
2846                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2847
2848                 /* + transport layer */
2849                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2850                         hdr_len += tcp_hdrlen(skb);
2851                 else
2852                         hdr_len += sizeof(struct udphdr);
2853
2854                 if (shinfo->gso_type & SKB_GSO_DODGY)
2855                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2856                                                 shinfo->gso_size);
2857
2858                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2859         }
2860 }
2861
2862 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2863                                  struct net_device *dev,
2864                                  struct netdev_queue *txq)
2865 {
2866         spinlock_t *root_lock = qdisc_lock(q);
2867         bool contended;
2868         int rc;
2869
2870         qdisc_pkt_len_init(skb);
2871         qdisc_calculate_pkt_len(skb, q);
2872         /*
2873          * Heuristic to force contended enqueues to serialize on a
2874          * separate lock before trying to get qdisc main lock.
2875          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2876          * often and dequeue packets faster.
2877          */
2878         contended = qdisc_is_running(q);
2879         if (unlikely(contended))
2880                 spin_lock(&q->busylock);
2881
2882         spin_lock(root_lock);
2883         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2884                 kfree_skb(skb);
2885                 rc = NET_XMIT_DROP;
2886         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2887                    qdisc_run_begin(q)) {
2888                 /*
2889                  * This is a work-conserving queue; there are no old skbs
2890                  * waiting to be sent out; and the qdisc is not running -
2891                  * xmit the skb directly.
2892                  */
2893
2894                 qdisc_bstats_update(q, skb);
2895
2896                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2897                         if (unlikely(contended)) {
2898                                 spin_unlock(&q->busylock);
2899                                 contended = false;
2900                         }
2901                         __qdisc_run(q);
2902                 } else
2903                         qdisc_run_end(q);
2904
2905                 rc = NET_XMIT_SUCCESS;
2906         } else {
2907                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2908                 if (qdisc_run_begin(q)) {
2909                         if (unlikely(contended)) {
2910                                 spin_unlock(&q->busylock);
2911                                 contended = false;
2912                         }
2913                         __qdisc_run(q);
2914                 }
2915         }
2916         spin_unlock(root_lock);
2917         if (unlikely(contended))
2918                 spin_unlock(&q->busylock);
2919         return rc;
2920 }
2921
2922 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2923 static void skb_update_prio(struct sk_buff *skb)
2924 {
2925         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2926
2927         if (!skb->priority && skb->sk && map) {
2928                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2929
2930                 if (prioidx < map->priomap_len)
2931                         skb->priority = map->priomap[prioidx];
2932         }
2933 }
2934 #else
2935 #define skb_update_prio(skb)
2936 #endif
2937
2938 DEFINE_PER_CPU(int, xmit_recursion);
2939 EXPORT_SYMBOL(xmit_recursion);
2940
2941 #define RECURSION_LIMIT 10
2942
2943 /**
2944  *      dev_loopback_xmit - loop back @skb
2945  *      @net: network namespace this loopback is happening in
2946  *      @sk:  sk needed to be a netfilter okfn
2947  *      @skb: buffer to transmit
2948  */
2949 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
2950 {
2951         skb_reset_mac_header(skb);
2952         __skb_pull(skb, skb_network_offset(skb));
2953         skb->pkt_type = PACKET_LOOPBACK;
2954         skb->ip_summed = CHECKSUM_UNNECESSARY;
2955         WARN_ON(!skb_dst(skb));
2956         skb_dst_force(skb);
2957         netif_rx_ni(skb);
2958         return 0;
2959 }
2960 EXPORT_SYMBOL(dev_loopback_xmit);
2961
2962 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2963 {
2964 #ifdef CONFIG_XPS
2965         struct xps_dev_maps *dev_maps;
2966         struct xps_map *map;
2967         int queue_index = -1;
2968
2969         rcu_read_lock();
2970         dev_maps = rcu_dereference(dev->xps_maps);
2971         if (dev_maps) {
2972                 map = rcu_dereference(
2973                     dev_maps->cpu_map[skb->sender_cpu - 1]);
2974                 if (map) {
2975                         if (map->len == 1)
2976                                 queue_index = map->queues[0];
2977                         else
2978                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
2979                                                                            map->len)];
2980                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2981                                 queue_index = -1;
2982                 }
2983         }
2984         rcu_read_unlock();
2985
2986         return queue_index;
2987 #else
2988         return -1;
2989 #endif
2990 }
2991
2992 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
2993 {
2994         struct sock *sk = skb->sk;
2995         int queue_index = sk_tx_queue_get(sk);
2996
2997         if (queue_index < 0 || skb->ooo_okay ||
2998             queue_index >= dev->real_num_tx_queues) {
2999                 int new_index = get_xps_queue(dev, skb);
3000                 if (new_index < 0)
3001                         new_index = skb_tx_hash(dev, skb);
3002
3003                 if (queue_index != new_index && sk &&
3004                     sk_fullsock(sk) &&
3005                     rcu_access_pointer(sk->sk_dst_cache))
3006                         sk_tx_queue_set(sk, new_index);
3007
3008                 queue_index = new_index;
3009         }
3010
3011         return queue_index;
3012 }
3013
3014 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3015                                     struct sk_buff *skb,
3016                                     void *accel_priv)
3017 {
3018         int queue_index = 0;
3019
3020 #ifdef CONFIG_XPS
3021         if (skb->sender_cpu == 0)
3022                 skb->sender_cpu = raw_smp_processor_id() + 1;
3023 #endif
3024
3025         if (dev->real_num_tx_queues != 1) {
3026                 const struct net_device_ops *ops = dev->netdev_ops;
3027                 if (ops->ndo_select_queue)
3028                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3029                                                             __netdev_pick_tx);
3030                 else
3031                         queue_index = __netdev_pick_tx(dev, skb);
3032
3033                 if (!accel_priv)
3034                         queue_index = netdev_cap_txqueue(dev, queue_index);
3035         }
3036
3037         skb_set_queue_mapping(skb, queue_index);
3038         return netdev_get_tx_queue(dev, queue_index);
3039 }
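
/*
 * Hypothetical sketch of the ndo_select_queue() hook that
 * netdev_pick_tx() above calls when a device provides one.  A driver
 * that only wants to special-case some traffic can steer it explicitly
 * and delegate everything else to the supplied fallback (which is
 * __netdev_pick_tx() in the call above).  The "control traffic on
 * queue 0" policy is invented for the example.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv,
				select_queue_fallback_t fallback)
{
	if (skb->priority == TC_PRIO_CONTROL)
		return 0;

	return fallback(dev, skb);
}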
3040
3041 /**
3042  *      __dev_queue_xmit - transmit a buffer
3043  *      @skb: buffer to transmit
3044  *      @accel_priv: private data used for L2 forwarding offload
3045  *
3046  *      Queue a buffer for transmission to a network device. The caller must
3047  *      have set the device and priority and built the buffer before calling
3048  *      this function. The function can be called from an interrupt.
3049  *
3050  *      A negative errno code is returned on a failure. A success does not
3051  *      guarantee the frame will be transmitted as it may be dropped due
3052  *      to congestion or traffic shaping.
3053  *
3054  * -----------------------------------------------------------------------------------
3055  *      I notice this method can also return errors from the queue disciplines,
3056  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3057  *      be positive.
3058  *
3059  *      Regardless of the return value, the skb is consumed, so it is currently
3060  *      difficult to retry a send to this method.  (You can bump the ref count
3061  *      before sending to hold a reference for retry if you are careful.)
3062  *
3063  *      When calling this method, interrupts MUST be enabled.  This is because
3064  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3065  *          --BLG
3066  */
3067 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3068 {
3069         struct net_device *dev = skb->dev;
3070         struct netdev_queue *txq;
3071         struct Qdisc *q;
3072         int rc = -ENOMEM;
3073
3074         skb_reset_mac_header(skb);
3075
3076         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3077                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3078
3079         /* Disable soft irqs for various locks below. Also
3080          * stops preemption for RCU.
3081          */
3082         rcu_read_lock_bh();
3083
3084         skb_update_prio(skb);
3085
3086         /* If device/qdisc don't need skb->dst, release it right now while
3087          * it's hot in this CPU's cache.
3088          */
3089         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3090                 skb_dst_drop(skb);
3091         else
3092                 skb_dst_force(skb);
3093
3094 #ifdef CONFIG_NET_SWITCHDEV
3095         /* Don't forward if offload device already forwarded */
3096         if (skb->offload_fwd_mark &&
3097             skb->offload_fwd_mark == dev->offload_fwd_mark) {
3098                 consume_skb(skb);
3099                 rc = NET_XMIT_SUCCESS;
3100                 goto out;
3101         }
3102 #endif
3103
3104         txq = netdev_pick_tx(dev, skb, accel_priv);
3105         q = rcu_dereference_bh(txq->qdisc);
3106
3107 #ifdef CONFIG_NET_CLS_ACT
3108         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3109 #endif
3110         trace_net_dev_queue(skb);
3111         if (q->enqueue) {
3112                 rc = __dev_xmit_skb(skb, q, dev, txq);
3113                 goto out;
3114         }
3115
3116         /* The device has no queue. This is the common case for software
3117            devices: loopback and all sorts of tunnels...
3118
3119            Really, it is unlikely that netif_tx_lock protection is necessary
3120            here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3121            counters.)
3122            However, it is possible that they rely on the protection
3123            we provide here.
3124
3125            Check this and take the lock; it is not prone to deadlocks.
3126            Or just use the noqueue qdisc path; it is even simpler 8)
3127          */
3128         if (dev->flags & IFF_UP) {
3129                 int cpu = smp_processor_id(); /* ok because BHs are off */
3130
3131                 if (txq->xmit_lock_owner != cpu) {
3132
3133                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3134                                 goto recursion_alert;
3135
3136                         skb = validate_xmit_skb(skb, dev);
3137                         if (!skb)
3138                                 goto drop;
3139
3140                         HARD_TX_LOCK(dev, txq, cpu);
3141
3142                         if (!netif_xmit_stopped(txq)) {
3143                                 __this_cpu_inc(xmit_recursion);
3144                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3145                                 __this_cpu_dec(xmit_recursion);
3146                                 if (dev_xmit_complete(rc)) {
3147                                         HARD_TX_UNLOCK(dev, txq);
3148                                         goto out;
3149                                 }
3150                         }
3151                         HARD_TX_UNLOCK(dev, txq);
3152                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3153                                              dev->name);
3154                 } else {
3155                         /* Recursion is detected! It is possible,
3156                          * unfortunately
3157                          */
3158 recursion_alert:
3159                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3160                                              dev->name);
3161                 }
3162         }
3163
3164         rc = -ENETDOWN;
3165 drop:
3166         rcu_read_unlock_bh();
3167
3168         atomic_long_inc(&dev->tx_dropped);
3169         kfree_skb_list(skb);
3170         return rc;
3171 out:
3172         rcu_read_unlock_bh();
3173         return rc;
3174 }
3175
3176 int dev_queue_xmit(struct sk_buff *skb)
3177 {
3178         return __dev_queue_xmit(skb, NULL);
3179 }
3180 EXPORT_SYMBOL(dev_queue_xmit);
3181
3182 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3183 {
3184         return __dev_queue_xmit(skb, accel_priv);
3185 }
3186 EXPORT_SYMBOL(dev_queue_xmit_accel);
3187
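/* Illustrative sketch (not compiled): how a hypothetical caller treats
 * dev_queue_xmit(), following the rules documented above __dev_queue_xmit():
 * the skb is consumed whatever the outcome, and "errors" can be positive
 * NET_XMIT_* codes as well as negative errnos.  foo_xmit_one() is an
 * assumption; dev_queue_xmit() and the NET_XMIT_* codes are the real API.
 */
#if 0
static void foo_xmit_one(struct net_device *dev, struct sk_buff *skb)
{
        int rc;

        skb->dev = dev;
        rc = dev_queue_xmit(skb);       /* consumes skb, even on failure */
        if (rc != NET_XMIT_SUCCESS && rc != NET_XMIT_CN)
                pr_debug("%s: xmit failed: %d\n", dev->name, rc);
        /* Do not touch skb here: it has already been consumed */
}
#endif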
3188
3189 /*=======================================================================
3190                         Receiver routines
3191   =======================================================================*/
3192
3193 int netdev_max_backlog __read_mostly = 1000;
3194 EXPORT_SYMBOL(netdev_max_backlog);
3195
3196 int netdev_tstamp_prequeue __read_mostly = 1;
3197 int netdev_budget __read_mostly = 300;
3198 int weight_p __read_mostly = 64;            /* old backlog weight */
3199
3200 /* Called with irq disabled */
3201 static inline void ____napi_schedule(struct softnet_data *sd,
3202                                      struct napi_struct *napi)
3203 {
3204         list_add_tail(&napi->poll_list, &sd->poll_list);
3205         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3206 }
3207
3208 #ifdef CONFIG_RPS
3209
3210 /* One global table that all flow-based protocols share. */
3211 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3212 EXPORT_SYMBOL(rps_sock_flow_table);
3213 u32 rps_cpu_mask __read_mostly;
3214 EXPORT_SYMBOL(rps_cpu_mask);
3215
3216 struct static_key rps_needed __read_mostly;
3217
3218 static struct rps_dev_flow *
3219 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3220             struct rps_dev_flow *rflow, u16 next_cpu)
3221 {
3222         if (next_cpu < nr_cpu_ids) {
3223 #ifdef CONFIG_RFS_ACCEL
3224                 struct netdev_rx_queue *rxqueue;
3225                 struct rps_dev_flow_table *flow_table;
3226                 struct rps_dev_flow *old_rflow;
3227                 u32 flow_id;
3228                 u16 rxq_index;
3229                 int rc;
3230
3231                 /* Should we steer this flow to a different hardware queue? */
3232                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3233                     !(dev->features & NETIF_F_NTUPLE))
3234                         goto out;
3235                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3236                 if (rxq_index == skb_get_rx_queue(skb))
3237                         goto out;
3238
3239                 rxqueue = dev->_rx + rxq_index;
3240                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3241                 if (!flow_table)
3242                         goto out;
3243                 flow_id = skb_get_hash(skb) & flow_table->mask;
3244                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3245                                                         rxq_index, flow_id);
3246                 if (rc < 0)
3247                         goto out;
3248                 old_rflow = rflow;
3249                 rflow = &flow_table->flows[flow_id];
3250                 rflow->filter = rc;
3251                 if (old_rflow->filter == rflow->filter)
3252                         old_rflow->filter = RPS_NO_FILTER;
3253         out:
3254 #endif
3255                 rflow->last_qtail =
3256                         per_cpu(softnet_data, next_cpu).input_queue_head;
3257         }
3258
3259         rflow->cpu = next_cpu;
3260         return rflow;
3261 }
3262
3263 /*
3264  * get_rps_cpu is called from netif_receive_skb and returns the target
3265  * CPU from the RPS map of the receiving queue for a given skb.
3266  * rcu_read_lock must be held on entry.
3267  */
3268 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3269                        struct rps_dev_flow **rflowp)
3270 {
3271         const struct rps_sock_flow_table *sock_flow_table;
3272         struct netdev_rx_queue *rxqueue = dev->_rx;
3273         struct rps_dev_flow_table *flow_table;
3274         struct rps_map *map;
3275         int cpu = -1;
3276         u32 tcpu;
3277         u32 hash;
3278
3279         if (skb_rx_queue_recorded(skb)) {
3280                 u16 index = skb_get_rx_queue(skb);
3281
3282                 if (unlikely(index >= dev->real_num_rx_queues)) {
3283                         WARN_ONCE(dev->real_num_rx_queues > 1,
3284                                   "%s received packet on queue %u, but number "
3285                                   "of RX queues is %u\n",
3286                                   dev->name, index, dev->real_num_rx_queues);
3287                         goto done;
3288                 }
3289                 rxqueue += index;
3290         }
3291
3292         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3293
3294         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3295         map = rcu_dereference(rxqueue->rps_map);
3296         if (!flow_table && !map)
3297                 goto done;
3298
3299         skb_reset_network_header(skb);
3300         hash = skb_get_hash(skb);
3301         if (!hash)
3302                 goto done;
3303
3304         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3305         if (flow_table && sock_flow_table) {
3306                 struct rps_dev_flow *rflow;
3307                 u32 next_cpu;
3308                 u32 ident;
3309
3310                 /* First check the global flow table for a match */
3311                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3312                 if ((ident ^ hash) & ~rps_cpu_mask)
3313                         goto try_rps;
3314
3315                 next_cpu = ident & rps_cpu_mask;
3316
3317                 /* OK, now we know there is a match,
3318                  * we can look at the local (per receive queue) flow table
3319                  */
3320                 rflow = &flow_table->flows[hash & flow_table->mask];
3321                 tcpu = rflow->cpu;
3322
3323                 /*
3324                  * If the desired CPU (where last recvmsg was done) is
3325                  * different from current CPU (one in the rx-queue flow
3326                  * table entry), switch if one of the following holds:
3327                  *   - Current CPU is unset (>= nr_cpu_ids).
3328                  *   - Current CPU is offline.
3329                  *   - The current CPU's queue tail has advanced beyond the
3330                  *     last packet that was enqueued using this table entry.
3331                  *     This guarantees that all previous packets for the flow
3332                  *     have been dequeued, thus preserving in order delivery.
3333                  */
3334                 if (unlikely(tcpu != next_cpu) &&
3335                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3336                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3337                       rflow->last_qtail)) >= 0)) {
3338                         tcpu = next_cpu;
3339                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3340                 }
3341
3342                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3343                         *rflowp = rflow;
3344                         cpu = tcpu;
3345                         goto done;
3346                 }
3347         }
3348
3349 try_rps:
3350
3351         if (map) {
3352                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3353                 if (cpu_online(tcpu)) {
3354                         cpu = tcpu;
3355                         goto done;
3356                 }
3357         }
3358
3359 done:
3360         return cpu;
3361 }
3362
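/* Illustrative sketch (not compiled): the recvmsg()-side counterpart of the
 * lookup in get_rps_cpu() above.  This mirrors what rps_record_sock_flow()
 * does when a socket records its flow hint: the upper bits of the entry keep
 * the flow hash for the (ident ^ hash) & ~rps_cpu_mask validation, while the
 * low rps_cpu_mask bits hold the CPU that last ran recvmsg().
 * record_flow_hint() is an assumed name used only for this sketch.
 */
#if 0
static void record_flow_hint(struct rps_sock_flow_table *table, u32 hash)
{
        if (table && hash) {
                unsigned int index = hash & table->mask;
                u32 val = (hash & ~rps_cpu_mask) | raw_smp_processor_id();

                /* Only a hint: preemption can move us to another CPU */
                if (table->ents[index] != val)
                        table->ents[index] = val;
        }
}
#endif
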
3363 #ifdef CONFIG_RFS_ACCEL
3364
3365 /**
3366  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3367  * @dev: Device on which the filter was set
3368  * @rxq_index: RX queue index
3369  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3370  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3371  *
3372  * Drivers that implement ndo_rx_flow_steer() should periodically call
3373  * this function for each installed filter and remove the filters for
3374  * which it returns %true.
3375  */
3376 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3377                          u32 flow_id, u16 filter_id)
3378 {
3379         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3380         struct rps_dev_flow_table *flow_table;
3381         struct rps_dev_flow *rflow;
3382         bool expire = true;
3383         unsigned int cpu;
3384
3385         rcu_read_lock();
3386         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3387         if (flow_table && flow_id <= flow_table->mask) {
3388                 rflow = &flow_table->flows[flow_id];
3389                 cpu = ACCESS_ONCE(rflow->cpu);
3390                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3391                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3392                            rflow->last_qtail) <
3393                      (int)(10 * flow_table->mask)))
3394                         expire = false;
3395         }
3396         rcu_read_unlock();
3397         return expire;
3398 }
3399 EXPORT_SYMBOL(rps_may_expire_flow);
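
/* Illustrative sketch (not compiled): the periodic expiry scan that the
 * kerneldoc above asks drivers to run.  foo_priv, its filter bookkeeping and
 * foo_remove_flow_filter() are assumptions; rps_may_expire_flow() is the
 * real API being demonstrated.
 */
#if 0
static void foo_expire_rx_filters(struct foo_priv *fp, u16 rxq_index)
{
        unsigned int i;

        for (i = 0; i < fp->num_filters; i++) {
                u32 flow_id = fp->rx_flow_ids[i];
                u16 filter_id = fp->rx_filter_ids[i];

                if (rps_may_expire_flow(fp->netdev, rxq_index,
                                        flow_id, filter_id))
                        foo_remove_flow_filter(fp, rxq_index, filter_id);
        }
}
#endif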
3400
3401 #endif /* CONFIG_RFS_ACCEL */
3402
3403 /* Called from hardirq (IPI) context */
3404 static void rps_trigger_softirq(void *data)
3405 {
3406         struct softnet_data *sd = data;
3407
3408         ____napi_schedule(sd, &sd->backlog);
3409         sd->received_rps++;
3410 }
3411
3412 #endif /* CONFIG_RPS */
3413
3414 /*
3415  * Check whether this softnet_data structure belongs to another CPU.
3416  * If so, queue it on our IPI list and return 1;
3417  * otherwise, return 0.
3418  */
3419 static int rps_ipi_queued(struct softnet_data *sd)
3420 {
3421 #ifdef CONFIG_RPS
3422         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3423
3424         if (sd != mysd) {
3425                 sd->rps_ipi_next = mysd->rps_ipi_list;
3426                 mysd->rps_ipi_list = sd;
3427
3428                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3429                 return 1;
3430         }
3431 #endif /* CONFIG_RPS */
3432         return 0;
3433 }
3434
3435 #ifdef CONFIG_NET_FLOW_LIMIT
3436 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3437 #endif
3438
3439 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3440 {
3441 #ifdef CONFIG_NET_FLOW_LIMIT
3442         struct sd_flow_limit *fl;
3443         struct softnet_data *sd;
3444         unsigned int old_flow, new_flow;
3445
3446         if (qlen < (netdev_max_backlog >> 1))
3447                 return false;
3448
3449         sd = this_cpu_ptr(&softnet_data);
3450
3451         rcu_read_lock();
3452         fl = rcu_dereference(sd->flow_limit);
3453         if (fl) {
3454                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3455                 old_flow = fl->history[fl->history_head];
3456                 fl->history[fl->history_head] = new_flow;
3457
3458                 fl->history_head++;
3459                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3460
3461                 if (likely(fl->buckets[old_flow]))
3462                         fl->buckets[old_flow]--;
3463
3464                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3465                         fl->count++;
3466                         rcu_read_unlock();
3467                         return true;
3468                 }
3469         }
3470         rcu_read_unlock();
3471 #endif
3472         return false;
3473 }
3474
3475 /*
3476  * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3477  * queue (which may be a remote CPU's queue).
3478  */
3479 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3480                               unsigned int *qtail)
3481 {
3482         struct softnet_data *sd;
3483         unsigned long flags;
3484         unsigned int qlen;
3485
3486         sd = &per_cpu(softnet_data, cpu);
3487
3488         local_irq_save(flags);
3489
3490         rps_lock(sd);
3491         if (!netif_running(skb->dev))
3492                 goto drop;
3493         qlen = skb_queue_len(&sd->input_pkt_queue);
3494         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3495                 if (qlen) {
3496 enqueue:
3497                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3498                         input_queue_tail_incr_save(sd, qtail);
3499                         rps_unlock(sd);
3500                         local_irq_restore(flags);
3501                         return NET_RX_SUCCESS;
3502                 }
3503
3504                 /* Schedule NAPI for the backlog device.
3505                  * We can use a non-atomic operation since we own the queue lock.
3506                  */
3507                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3508                         if (!rps_ipi_queued(sd))
3509                                 ____napi_schedule(sd, &sd->backlog);
3510                 }
3511                 goto enqueue;
3512         }
3513
3514 drop:
3515         sd->dropped++;
3516         rps_unlock(sd);
3517
3518         local_irq_restore(flags);
3519
3520         atomic_long_inc(&skb->dev->rx_dropped);
3521         kfree_skb(skb);
3522         return NET_RX_DROP;
3523 }
3524
3525 static int netif_rx_internal(struct sk_buff *skb)
3526 {
3527         int ret;
3528
3529         net_timestamp_check(netdev_tstamp_prequeue, skb);
3530
3531         trace_netif_rx(skb);
3532 #ifdef CONFIG_RPS
3533         if (static_key_false(&rps_needed)) {
3534                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3535                 int cpu;
3536
3537                 preempt_disable();
3538                 rcu_read_lock();
3539
3540                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3541                 if (cpu < 0)
3542                         cpu = smp_processor_id();
3543
3544                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3545
3546                 rcu_read_unlock();
3547                 preempt_enable();
3548         } else
3549 #endif
3550         {
3551                 unsigned int qtail;
3552                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3553                 put_cpu();
3554         }
3555         return ret;
3556 }
3557
3558 /**
3559  *      netif_rx        -       post buffer to the network code
3560  *      @skb: buffer to post
3561  *
3562  *      This function receives a packet from a device driver and queues it for
3563  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3564  *      may be dropped during processing for congestion control or by the
3565  *      protocol layers.
3566  *
3567  *      return values:
3568  *      NET_RX_SUCCESS  (no congestion)
3569  *      NET_RX_DROP     (packet was dropped)
3570  *
3571  */
3572
3573 int netif_rx(struct sk_buff *skb)
3574 {
3575         trace_netif_rx_entry(skb);
3576
3577         return netif_rx_internal(skb);
3578 }
3579 EXPORT_SYMBOL(netif_rx);
3580
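/* Illustrative sketch (not compiled): the classic non-NAPI receive path of a
 * hypothetical driver, showing what "post buffer to the network code" means
 * in practice.  foo_read_frame() and the interrupt plumbing are assumptions;
 * netdev_alloc_skb(), eth_type_trans() and netif_rx() are the real APIs.
 */
#if 0
static void foo_rx_one_frame(struct net_device *dev, unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }
        skb_reserve(skb, NET_IP_ALIGN);
        foo_read_frame(dev, skb_put(skb, len), len); /* copy from hardware */
        skb->protocol = eth_type_trans(skb, dev);
        netif_rx(skb);  /* queue to the per-CPU backlog, may run from IRQ */
}
#endif
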
3581 int netif_rx_ni(struct sk_buff *skb)
3582 {
3583         int err;
3584
3585         trace_netif_rx_ni_entry(skb);
3586
3587         preempt_disable();
3588         err = netif_rx_internal(skb);
3589         if (local_softirq_pending())
3590                 do_softirq();
3591         preempt_enable();
3592
3593         return err;
3594 }
3595 EXPORT_SYMBOL(netif_rx_ni);
3596
3597 static void net_tx_action(struct softirq_action *h)
3598 {
3599         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3600
3601         if (sd->completion_queue) {
3602                 struct sk_buff *clist;
3603
3604                 local_irq_disable();
3605                 clist = sd->completion_queue;
3606                 sd->completion_queue = NULL;
3607                 local_irq_enable();
3608
3609                 while (clist) {
3610                         struct sk_buff *skb = clist;
3611                         clist = clist->next;
3612
3613                         WARN_ON(atomic_read(&skb->users));
3614                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3615                                 trace_consume_skb(skb);
3616                         else
3617                                 trace_kfree_skb(skb, net_tx_action);
3618                         __kfree_skb(skb);
3619                 }
3620         }
3621
3622         if (sd->output_queue) {
3623                 struct Qdisc *head;
3624
3625                 local_irq_disable();
3626                 head = sd->output_queue;
3627                 sd->output_queue = NULL;
3628                 sd->output_queue_tailp = &sd->output_queue;
3629                 local_irq_enable();
3630
3631                 while (head) {
3632                         struct Qdisc *q = head;
3633                         spinlock_t *root_lock;
3634
3635                         head = head->next_sched;
3636
3637                         root_lock = qdisc_lock(q);
3638                         if (spin_trylock(root_lock)) {
3639                                 smp_mb__before_atomic();
3640                                 clear_bit(__QDISC_STATE_SCHED,
3641                                           &q->state);
3642                                 qdisc_run(q);
3643                                 spin_unlock(root_lock);
3644                         } else {
3645                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3646                                               &q->state)) {
3647                                         __netif_reschedule(q);
3648                                 } else {
3649                                         smp_mb__before_atomic();
3650                                         clear_bit(__QDISC_STATE_SCHED,
3651                                                   &q->state);
3652                                 }
3653                         }
3654                 }
3655         }
3656 }
3657
3658 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3659     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3660 /* This hook is defined here for ATM LANE */
3661 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3662                              unsigned char *addr) __read_mostly;
3663 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3664 #endif
3665
3666 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3667                                          struct packet_type **pt_prev,
3668                                          int *ret, struct net_device *orig_dev)
3669 {
3670 #ifdef CONFIG_NET_CLS_ACT
3671         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3672         struct tcf_result cl_res;
3673
3674         /* If there's at least one ingress present somewhere (so
3675          * we get here via enabled static key), remaining devices
3676          * that are not configured with an ingress qdisc will bail
3677          * out here.
3678          */
3679         if (!cl)
3680                 return skb;
3681         if (*pt_prev) {
3682                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3683                 *pt_prev = NULL;
3684         }
3685
3686         qdisc_skb_cb(skb)->pkt_len = skb->len;
3687         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3688         qdisc_bstats_cpu_update(cl->q, skb);
3689
3690         switch (tc_classify(skb, cl, &cl_res, false)) {
3691         case TC_ACT_OK:
3692         case TC_ACT_RECLASSIFY:
3693                 skb->tc_index = TC_H_MIN(cl_res.classid);
3694                 break;
3695         case TC_ACT_SHOT:
3696                 qdisc_qstats_cpu_drop(cl->q);
3697         case TC_ACT_STOLEN:
3698         case TC_ACT_QUEUED:
3699                 kfree_skb(skb);
3700                 return NULL;
3701         case TC_ACT_REDIRECT:
3702                 /* skb_mac_header check was done by cls/act_bpf, so
3703                  * we can safely push the L2 header back before
3704                  * redirecting to another netdev
3705                  */
3706                 __skb_push(skb, skb->mac_len);
3707                 skb_do_redirect(skb);
3708                 return NULL;
3709         default:
3710                 break;
3711         }
3712 #endif /* CONFIG_NET_CLS_ACT */
3713         return skb;
3714 }
3715
3716 /**
3717  *      netdev_rx_handler_register - register receive handler
3718  *      @dev: device to register a handler for
3719  *      @rx_handler: receive handler to register
3720  *      @rx_handler_data: data pointer that is used by rx handler
3721  *
3722  *      Register a receive handler for a device. This handler will then be
3723  *      called from __netif_receive_skb. A negative errno code is returned
3724  *      on a failure.
3725  *
3726  *      The caller must hold the rtnl_mutex.
3727  *
3728  *      For a general description of rx_handler, see enum rx_handler_result.
3729  */
3730 int netdev_rx_handler_register(struct net_device *dev,
3731                                rx_handler_func_t *rx_handler,
3732                                void *rx_handler_data)
3733 {
3734         ASSERT_RTNL();
3735
3736         if (dev->rx_handler)
3737                 return -EBUSY;
3738
3739         /* Note: rx_handler_data must be set before rx_handler */
3740         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3741         rcu_assign_pointer(dev->rx_handler, rx_handler);
3742
3743         return 0;
3744 }
3745 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3746
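/* Illustrative sketch (not compiled): the usual rx_handler pattern, in the
 * style of bridge/team/macvlan ports.  foo_port, its upper_dev field and
 * foo_port_attach() are assumptions; netdev_rx_handler_register(),
 * rx_handler_result_t and RX_HANDLER_ANOTHER are the real interfaces.
 */
#if 0
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);

        /* Hand the frame to the upper device and rerun the receive path */
        skb->dev = port->upper_dev;
        return RX_HANDLER_ANOTHER;
}

static int foo_port_attach(struct foo_port *port, struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(dev, foo_handle_frame, port);
        rtnl_unlock();
        return err;
}
#endif
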
3747 /**
3748  *      netdev_rx_handler_unregister - unregister receive handler
3749  *      @dev: device to unregister a handler from
3750  *
3751  *      Unregister a receive handler from a device.
3752  *
3753  *      The caller must hold the rtnl_mutex.
3754  */
3755 void netdev_rx_handler_unregister(struct net_device *dev)
3756 {
3757
3758         ASSERT_RTNL();
3759         RCU_INIT_POINTER(dev->rx_handler, NULL);
3760         /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3761          * section is guaranteed to see a non-NULL rx_handler_data
3762          * as well.
3763          */
3764         synchronize_net();
3765         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3766 }
3767 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3768
3769 /*
3770  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3771  * the special handling of PFMEMALLOC skbs.
3772  */
3773 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3774 {
3775         switch (skb->protocol) {
3776         case htons(ETH_P_ARP):
3777         case htons(ETH_P_IP):
3778         case htons(ETH_P_IPV6):
3779         case htons(ETH_P_8021Q):
3780         case htons(ETH_P_8021AD):
3781                 return true;
3782         default:
3783                 return false;
3784         }
3785 }
3786
3787 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3788                              int *ret, struct net_device *orig_dev)
3789 {
3790 #ifdef CONFIG_NETFILTER_INGRESS
3791         if (nf_hook_ingress_active(skb)) {
3792                 if (*pt_prev) {
3793                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3794                         *pt_prev = NULL;
3795                 }
3796
3797                 return nf_hook_ingress(skb);
3798         }
3799 #endif /* CONFIG_NETFILTER_INGRESS */
3800         return 0;
3801 }
3802
3803 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3804 {
3805         struct packet_type *ptype, *pt_prev;
3806         rx_handler_func_t *rx_handler;
3807         struct net_device *orig_dev;
3808         bool deliver_exact = false;
3809         int ret = NET_RX_DROP;
3810         __be16 type;
3811
3812         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3813
3814         trace_netif_receive_skb(skb);
3815
3816         orig_dev = skb->dev;
3817
3818         skb_reset_network_header(skb);
3819         if (!skb_transport_header_was_set(skb))
3820                 skb_reset_transport_header(skb);
3821         skb_reset_mac_len(skb);
3822
3823         pt_prev = NULL;
3824
3825 another_round:
3826         skb->skb_iif = skb->dev->ifindex;
3827
3828         __this_cpu_inc(softnet_data.processed);
3829
3830         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3831             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3832                 skb = skb_vlan_untag(skb);
3833                 if (unlikely(!skb))
3834                         goto out;
3835         }
3836
3837 #ifdef CONFIG_NET_CLS_ACT
3838         if (skb->tc_verd & TC_NCLS) {
3839                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3840                 goto ncls;
3841         }
3842 #endif
3843
3844         if (pfmemalloc)
3845                 goto skip_taps;
3846
3847         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3848                 if (pt_prev)
3849                         ret = deliver_skb(skb, pt_prev, orig_dev);
3850                 pt_prev = ptype;
3851         }
3852
3853         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3854                 if (pt_prev)
3855                         ret = deliver_skb(skb, pt_prev, orig_dev);
3856                 pt_prev = ptype;
3857         }
3858
3859 skip_taps:
3860 #ifdef CONFIG_NET_INGRESS
3861         if (static_key_false(&ingress_needed)) {
3862                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3863                 if (!skb)
3864                         goto out;
3865
3866                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3867                         goto out;
3868         }
3869 #endif
3870 #ifdef CONFIG_NET_CLS_ACT
3871         skb->tc_verd = 0;
3872 ncls:
3873 #endif
3874         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3875                 goto drop;
3876
3877         if (skb_vlan_tag_present(skb)) {
3878                 if (pt_prev) {
3879                         ret = deliver_skb(skb, pt_prev, orig_dev);
3880                         pt_prev = NULL;
3881                 }
3882                 if (vlan_do_receive(&skb))
3883                         goto another_round;
3884                 else if (unlikely(!skb))
3885                         goto out;
3886         }
3887
3888         rx_handler = rcu_dereference(skb->dev->rx_handler);
3889         if (rx_handler) {
3890                 if (pt_prev) {
3891                         ret = deliver_skb(skb, pt_prev, orig_dev);
3892                         pt_prev = NULL;
3893                 }
3894                 switch (rx_handler(&skb)) {
3895                 case RX_HANDLER_CONSUMED:
3896                         ret = NET_RX_SUCCESS;
3897                         goto out;
3898                 case RX_HANDLER_ANOTHER:
3899                         goto another_round;
3900                 case RX_HANDLER_EXACT:
3901                         deliver_exact = true;
3902                 case RX_HANDLER_PASS:
3903                         break;
3904                 default:
3905                         BUG();
3906                 }
3907         }
3908
3909         if (unlikely(skb_vlan_tag_present(skb))) {
3910                 if (skb_vlan_tag_get_id(skb))
3911                         skb->pkt_type = PACKET_OTHERHOST;
3912                 /* Note: we might in the future use prio bits
3913                  * and set skb->priority like in vlan_do_receive().
3914                  * For the time being, just ignore the Priority Code Point.
3915                  */
3916                 skb->vlan_tci = 0;
3917         }
3918
3919         type = skb->protocol;
3920
3921         /* deliver only exact match when indicated */
3922         if (likely(!deliver_exact)) {
3923                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3924                                        &ptype_base[ntohs(type) &
3925                                                    PTYPE_HASH_MASK]);
3926         }
3927
3928         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3929                                &orig_dev->ptype_specific);
3930
3931         if (unlikely(skb->dev != orig_dev)) {
3932                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3933                                        &skb->dev->ptype_specific);
3934         }
3935
3936         if (pt_prev) {
3937                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3938                         goto drop;
3939                 else
3940                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3941         } else {
3942 drop:
3943                 atomic_long_inc(&skb->dev->rx_dropped);
3944                 kfree_skb(skb);
3945                 /* Jamal, now you will not be able to escape explaining
3946                  * to me how you were going to use this. :-)
3947                  */
3948                 ret = NET_RX_DROP;
3949         }
3950
3951 out:
3952         return ret;
3953 }
3954
3955 static int __netif_receive_skb(struct sk_buff *skb)
3956 {
3957         int ret;
3958
3959         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3960                 unsigned long pflags = current->flags;
3961
3962                 /*
3963                  * PFMEMALLOC skbs are special, they should
3964                  * - be delivered to SOCK_MEMALLOC sockets only
3965                  * - stay away from userspace
3966                  * - have bounded memory usage
3967                  *
3968                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3969                  * context down to all allocation sites.
3970                  */
3971                 current->flags |= PF_MEMALLOC;
3972                 ret = __netif_receive_skb_core(skb, true);
3973                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3974         } else
3975                 ret = __netif_receive_skb_core(skb, false);
3976
3977         return ret;
3978 }
3979
3980 static int netif_receive_skb_internal(struct sk_buff *skb)
3981 {
3982         int ret;
3983
3984         net_timestamp_check(netdev_tstamp_prequeue, skb);
3985
3986         if (skb_defer_rx_timestamp(skb))
3987                 return NET_RX_SUCCESS;
3988
3989         rcu_read_lock();
3990
3991 #ifdef CONFIG_RPS
3992         if (static_key_false(&rps_needed)) {
3993                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3994                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3995
3996                 if (cpu >= 0) {
3997                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3998                         rcu_read_unlock();
3999                         return ret;
4000                 }
4001         }
4002 #endif
4003         ret = __netif_receive_skb(skb);
4004         rcu_read_unlock();
4005         return ret;
4006 }
4007
4008 /**
4009  *      netif_receive_skb - process receive buffer from network
4010  *      @skb: buffer to process
4011  *
4012  *      netif_receive_skb() is the main receive data processing function.
4013  *      It always succeeds. The buffer may be dropped during processing
4014  *      for congestion control or by the protocol layers.
4015  *
4016  *      This function may only be called from softirq context and interrupts
4017  *      should be enabled.
4018  *
4019  *      Return values (usually ignored):
4020  *      NET_RX_SUCCESS: no congestion
4021  *      NET_RX_DROP: packet was dropped
4022  */
4023 int netif_receive_skb(struct sk_buff *skb)
4024 {
4025         trace_netif_receive_skb_entry(skb);
4026
4027         return netif_receive_skb_internal(skb);
4028 }
4029 EXPORT_SYMBOL(netif_receive_skb);
4030
4031 /* Network device is going away, flush any packets still pending.
4032  * Called with irqs disabled.
4033  */
4034 static void flush_backlog(void *arg)
4035 {
4036         struct net_device *dev = arg;
4037         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4038         struct sk_buff *skb, *tmp;
4039
4040         rps_lock(sd);
4041         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4042                 if (skb->dev == dev) {
4043                         __skb_unlink(skb, &sd->input_pkt_queue);
4044                         kfree_skb(skb);
4045                         input_queue_head_incr(sd);
4046                 }
4047         }
4048         rps_unlock(sd);
4049
4050         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4051                 if (skb->dev == dev) {
4052                         __skb_unlink(skb, &sd->process_queue);
4053                         kfree_skb(skb);
4054                         input_queue_head_incr(sd);
4055                 }
4056         }
4057 }
4058
4059 static int napi_gro_complete(struct sk_buff *skb)
4060 {
4061         struct packet_offload *ptype;
4062         __be16 type = skb->protocol;
4063         struct list_head *head = &offload_base;
4064         int err = -ENOENT;
4065
4066         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4067
4068         if (NAPI_GRO_CB(skb)->count == 1) {
4069                 skb_shinfo(skb)->gso_size = 0;
4070                 goto out;
4071         }
4072
4073         rcu_read_lock();
4074         list_for_each_entry_rcu(ptype, head, list) {
4075                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4076                         continue;
4077
4078                 err = ptype->callbacks.gro_complete(skb, 0);
4079                 break;
4080         }
4081         rcu_read_unlock();
4082
4083         if (err) {
4084                 WARN_ON(&ptype->list == head);
4085                 kfree_skb(skb);
4086                 return NET_RX_SUCCESS;
4087         }
4088
4089 out:
4090         return netif_receive_skb_internal(skb);
4091 }
4092
4093 /* napi->gro_list contains packets ordered by age, with the
4094  * youngest packets at the head.
4095  * Complete skbs in reverse order to reduce latencies.
4096  */
4097 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4098 {
4099         struct sk_buff *skb, *prev = NULL;
4100
4101         /* scan list and build reverse chain */
4102         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4103                 skb->prev = prev;
4104                 prev = skb;
4105         }
4106
4107         for (skb = prev; skb; skb = prev) {
4108                 skb->next = NULL;
4109
4110                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4111                         return;
4112
4113                 prev = skb->prev;
4114                 napi_gro_complete(skb);
4115                 napi->gro_count--;
4116         }
4117
4118         napi->gro_list = NULL;
4119 }
4120 EXPORT_SYMBOL(napi_gro_flush);
4121
4122 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4123 {
4124         struct sk_buff *p;
4125         unsigned int maclen = skb->dev->hard_header_len;
4126         u32 hash = skb_get_hash_raw(skb);
4127
4128         for (p = napi->gro_list; p; p = p->next) {
4129                 unsigned long diffs;
4130
4131                 NAPI_GRO_CB(p)->flush = 0;
4132
4133                 if (hash != skb_get_hash_raw(p)) {
4134                         NAPI_GRO_CB(p)->same_flow = 0;
4135                         continue;
4136                 }
4137
4138                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4139                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4140                 if (maclen == ETH_HLEN)
4141                         diffs |= compare_ether_header(skb_mac_header(p),
4142                                                       skb_mac_header(skb));
4143                 else if (!diffs)
4144                         diffs = memcmp(skb_mac_header(p),
4145                                        skb_mac_header(skb),
4146                                        maclen);
4147                 NAPI_GRO_CB(p)->same_flow = !diffs;
4148         }
4149 }
4150
4151 static void skb_gro_reset_offset(struct sk_buff *skb)
4152 {
4153         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4154         const skb_frag_t *frag0 = &pinfo->frags[0];
4155
4156         NAPI_GRO_CB(skb)->data_offset = 0;
4157         NAPI_GRO_CB(skb)->frag0 = NULL;
4158         NAPI_GRO_CB(skb)->frag0_len = 0;
4159
4160         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4161             pinfo->nr_frags &&
4162             !PageHighMem(skb_frag_page(frag0))) {
4163                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4164                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4165         }
4166 }
4167
4168 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4169 {
4170         struct skb_shared_info *pinfo = skb_shinfo(skb);
4171
4172         BUG_ON(skb->end - skb->tail < grow);
4173
4174         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4175
4176         skb->data_len -= grow;
4177         skb->tail += grow;
4178
4179         pinfo->frags[0].page_offset += grow;
4180         skb_frag_size_sub(&pinfo->frags[0], grow);
4181
4182         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4183                 skb_frag_unref(skb, 0);
4184                 memmove(pinfo->frags, pinfo->frags + 1,
4185                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4186         }
4187 }
4188
4189 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4190 {
4191         struct sk_buff **pp = NULL;
4192         struct packet_offload *ptype;
4193         __be16 type = skb->protocol;
4194         struct list_head *head = &offload_base;
4195         int same_flow;
4196         enum gro_result ret;
4197         int grow;
4198
4199         if (!(skb->dev->features & NETIF_F_GRO))
4200                 goto normal;
4201
4202         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4203                 goto normal;
4204
4205         gro_list_prepare(napi, skb);
4206
4207         rcu_read_lock();
4208         list_for_each_entry_rcu(ptype, head, list) {
4209                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4210                         continue;
4211
4212                 skb_set_network_header(skb, skb_gro_offset(skb));
4213                 skb_reset_mac_len(skb);
4214                 NAPI_GRO_CB(skb)->same_flow = 0;
4215                 NAPI_GRO_CB(skb)->flush = 0;
4216                 NAPI_GRO_CB(skb)->free = 0;
4217                 NAPI_GRO_CB(skb)->udp_mark = 0;
4218                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4219
4220                 /* Setup for GRO checksum validation */
4221                 switch (skb->ip_summed) {
4222                 case CHECKSUM_COMPLETE:
4223                         NAPI_GRO_CB(skb)->csum = skb->csum;
4224                         NAPI_GRO_CB(skb)->csum_valid = 1;
4225                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4226                         break;
4227                 case CHECKSUM_UNNECESSARY:
4228                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4229                         NAPI_GRO_CB(skb)->csum_valid = 0;
4230                         break;
4231                 default:
4232                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4233                         NAPI_GRO_CB(skb)->csum_valid = 0;
4234                 }
4235
4236                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4237                 break;
4238         }
4239         rcu_read_unlock();
4240
4241         if (&ptype->list == head)
4242                 goto normal;
4243
4244         same_flow = NAPI_GRO_CB(skb)->same_flow;
4245         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4246
4247         if (pp) {
4248                 struct sk_buff *nskb = *pp;
4249
4250                 *pp = nskb->next;
4251                 nskb->next = NULL;
4252                 napi_gro_complete(nskb);
4253                 napi->gro_count--;
4254         }
4255
4256         if (same_flow)
4257                 goto ok;
4258
4259         if (NAPI_GRO_CB(skb)->flush)
4260                 goto normal;
4261
4262         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4263                 struct sk_buff *nskb = napi->gro_list;
4264
4265                 /* locate the end of the list to select the 'oldest' flow */
4266                 while (nskb->next) {
4267                         pp = &nskb->next;
4268                         nskb = *pp;
4269                 }
4270                 *pp = NULL;
4271                 nskb->next = NULL;
4272                 napi_gro_complete(nskb);
4273         } else {
4274                 napi->gro_count++;
4275         }
4276         NAPI_GRO_CB(skb)->count = 1;
4277         NAPI_GRO_CB(skb)->age = jiffies;
4278         NAPI_GRO_CB(skb)->last = skb;
4279         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4280         skb->next = napi->gro_list;
4281         napi->gro_list = skb;
4282         ret = GRO_HELD;
4283
4284 pull:
4285         grow = skb_gro_offset(skb) - skb_headlen(skb);
4286         if (grow > 0)
4287                 gro_pull_from_frag0(skb, grow);
4288 ok:
4289         return ret;
4290
4291 normal:
4292         ret = GRO_NORMAL;
4293         goto pull;
4294 }
4295
4296 struct packet_offload *gro_find_receive_by_type(__be16 type)
4297 {
4298         struct list_head *offload_head = &offload_base;
4299         struct packet_offload *ptype;
4300
4301         list_for_each_entry_rcu(ptype, offload_head, list) {
4302                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4303                         continue;
4304                 return ptype;
4305         }
4306         return NULL;
4307 }
4308 EXPORT_SYMBOL(gro_find_receive_by_type);
4309
4310 struct packet_offload *gro_find_complete_by_type(__be16 type)
4311 {
4312         struct list_head *offload_head = &offload_base;
4313         struct packet_offload *ptype;
4314
4315         list_for_each_entry_rcu(ptype, offload_head, list) {
4316                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4317                         continue;
4318                 return ptype;
4319         }
4320         return NULL;
4321 }
4322 EXPORT_SYMBOL(gro_find_complete_by_type);
4323
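/* Illustrative sketch (not compiled): how an encapsulation offload typically
 * uses the lookup helpers above from its gro_complete() callback, in the
 * style of eth_gro_complete().  struct foo_hdr and foo_gro_complete() are
 * assumptions; gro_find_complete_by_type() and the gro_complete() callback
 * signature are the real interfaces.
 */
#if 0
struct foo_hdr {
        __be16  proto;
        __be16  flags;
};

static int foo_gro_complete(struct sk_buff *skb, int nhoff)
{
        struct foo_hdr *fh = (struct foo_hdr *)(skb->data + nhoff);
        struct packet_offload *ptype;
        int err = -ENOENT;

        rcu_read_lock();
        ptype = gro_find_complete_by_type(fh->proto);
        if (ptype)
                err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*fh));
        rcu_read_unlock();
        return err;
}
#endif
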
4324 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4325 {
4326         switch (ret) {
4327         case GRO_NORMAL:
4328                 if (netif_receive_skb_internal(skb))
4329                         ret = GRO_DROP;
4330                 break;
4331
4332         case GRO_DROP:
4333                 kfree_skb(skb);
4334                 break;
4335
4336         case GRO_MERGED_FREE:
4337                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4338                         kmem_cache_free(skbuff_head_cache, skb);
4339                 else
4340                         __kfree_skb(skb);
4341                 break;
4342
4343         case GRO_HELD:
4344         case GRO_MERGED:
4345                 break;
4346         }
4347
4348         return ret;
4349 }
4350
4351 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4352 {
4353         trace_napi_gro_receive_entry(skb);
4354
4355         skb_gro_reset_offset(skb);
4356
4357         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4358 }
4359 EXPORT_SYMBOL(napi_gro_receive);
4360
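/* Illustrative sketch (not compiled): a hypothetical NAPI poll routine
 * feeding received frames through GRO.  foo_priv, foo_fetch_skb() and
 * foo_enable_rx_irq() are assumptions; napi_gro_receive() and
 * napi_complete_done() are the real APIs.
 */
#if 0
static int foo_poll(struct napi_struct *napi, int budget)
{
        struct foo_priv *fp = container_of(napi, struct foo_priv, napi);
        int work_done = 0;

        while (work_done < budget) {
                struct sk_buff *skb = foo_fetch_skb(fp);

                if (!skb)
                        break;
                napi_gro_receive(napi, skb);
                work_done++;
        }

        if (work_done < budget) {
                napi_complete_done(napi, work_done);
                foo_enable_rx_irq(fp);
        }
        return work_done;
}
#endif
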
4361 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4362 {
4363         if (unlikely(skb->pfmemalloc)) {
4364                 consume_skb(skb);
4365                 return;
4366         }
4367         __skb_pull(skb, skb_headlen(skb));
4368         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4369         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4370         skb->vlan_tci = 0;
4371         skb->dev = napi->dev;
4372         skb->skb_iif = 0;
4373         skb->encapsulation = 0;
4374         skb_shinfo(skb)->gso_type = 0;
4375         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4376
4377         napi->skb = skb;
4378 }
4379
4380 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4381 {
4382         struct sk_buff *skb = napi->skb;
4383
4384         if (!skb) {
4385                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4386                 napi->skb = skb;
4387         }
4388         return skb;
4389 }
4390 EXPORT_SYMBOL(napi_get_frags);
4391
4392 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4393                                       struct sk_buff *skb,
4394                                       gro_result_t ret)
4395 {
4396         switch (ret) {
4397         case GRO_NORMAL:
4398         case GRO_HELD:
4399                 __skb_push(skb, ETH_HLEN);
4400                 skb->protocol = eth_type_trans(skb, skb->dev);
4401                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4402                         ret = GRO_DROP;
4403                 break;
4404
4405         case GRO_DROP:
4406         case GRO_MERGED_FREE:
4407                 napi_reuse_skb(napi, skb);
4408                 break;
4409
4410         case GRO_MERGED:
4411                 break;
4412         }
4413
4414         return ret;
4415 }
4416
4417 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4418  * Drivers may call both napi_gro_frags() and napi_gro_receive(), so
4419  * we copy the ethernet header into skb->data to have a common layout.
4420  */
4421 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4422 {
4423         struct sk_buff *skb = napi->skb;
4424         const struct ethhdr *eth;
4425         unsigned int hlen = sizeof(*eth);
4426
4427         napi->skb = NULL;
4428
4429         skb_reset_mac_header(skb);
4430         skb_gro_reset_offset(skb);
4431
4432         eth = skb_gro_header_fast(skb, 0);
4433         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4434                 eth = skb_gro_header_slow(skb, hlen, 0);
4435                 if (unlikely(!eth)) {
4436                         napi_reuse_skb(napi, skb);
4437                         return NULL;
4438                 }
4439         } else {
4440                 gro_pull_from_frag0(skb, hlen);
4441                 NAPI_GRO_CB(skb)->frag0 += hlen;
4442                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4443         }
4444         __skb_pull(skb, hlen);
4445
4446         /*
4447          * This works because the only protocols we care about don't require
4448          * special handling.
4449          * We'll fix it up properly in napi_frags_finish()
4450          */
4451         skb->protocol = eth->h_proto;
4452
4453         return skb;
4454 }
4455
4456 gro_result_t napi_gro_frags(struct napi_struct *napi)
4457 {
4458         struct sk_buff *skb = napi_frags_skb(napi);
4459
4460         if (!skb)
4461                 return GRO_DROP;
4462
4463         trace_napi_gro_frags_entry(skb);
4464
4465         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4466 }
4467 EXPORT_SYMBOL(napi_gro_frags);
4468
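/* Illustrative sketch (not compiled): the frag-based receive path paired
 * with napi_get_frags()/napi_gro_frags(), for drivers that place frames in
 * pages rather than a linear buffer.  foo_rx_page() and the page bookkeeping
 * are assumptions; napi_get_frags(), skb_fill_page_desc() and
 * napi_gro_frags() are the real APIs.
 */
#if 0
static void foo_rx_page(struct napi_struct *napi, struct page *page,
                        unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (unlikely(!skb))
                return;

        skb_fill_page_desc(skb, 0, page, offset, len);
        skb->len += len;
        skb->data_len += len;
        skb->truesize += PAGE_SIZE;

        napi_gro_frags(napi);   /* pulls the Ethernet header via frag0 */
}
#endif
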
4469 /* Compute the checksum from gro_offset and return the folded value
4470  * after adding in any pseudo checksum.
4471  */
4472 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4473 {
4474         __wsum wsum;
4475         __sum16 sum;
4476
4477         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4478
4479         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4480         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4481         if (likely(!sum)) {
4482                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4483                     !skb->csum_complete_sw)
4484                         netdev_rx_csum_fault(skb->dev);
4485         }
4486
4487         NAPI_GRO_CB(skb)->csum = wsum;
4488         NAPI_GRO_CB(skb)->csum_valid = 1;
4489
4490         return sum;
4491 }
4492 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4493
4494 /*
4495  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4496  * Note: called with local irq disabled, but exits with local irq enabled.
4497  */
4498 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4499 {
4500 #ifdef CONFIG_RPS
4501         struct softnet_data *remsd = sd->rps_ipi_list;
4502
4503         if (remsd) {
4504                 sd->rps_ipi_list = NULL;
4505
4506                 local_irq_enable();
4507
4508                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4509                 while (remsd) {
4510                         struct softnet_data *next = remsd->rps_ipi_next;
4511
4512                         if (cpu_online(remsd->cpu))
4513                                 smp_call_function_single_async(remsd->cpu,
4514                                                            &remsd->csd);
4515                         remsd = next;
4516                 }
4517         } else
4518 #endif
4519                 local_irq_enable();
4520 }
4521
4522 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4523 {
4524 #ifdef CONFIG_RPS
4525         return sd->rps_ipi_list != NULL;
4526 #else
4527         return false;
4528 #endif
4529 }
4530
4531 static int process_backlog(struct napi_struct *napi, int quota)
4532 {
4533         int work = 0;
4534         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4535
4536         /* Check if we have pending IPIs; it's better to send them now
4537          * rather than waiting for net_rx_action() to end.
4538          */
4539         if (sd_has_rps_ipi_waiting(sd)) {
4540                 local_irq_disable();
4541                 net_rps_action_and_irq_enable(sd);
4542         }
4543
4544         napi->weight = weight_p;
4545         local_irq_disable();
4546         while (1) {
4547                 struct sk_buff *skb;
4548
4549                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4550                         rcu_read_lock();
4551                         local_irq_enable();
4552                         __netif_receive_skb(skb);
4553                         rcu_read_unlock();
4554                         local_irq_disable();
4555                         input_queue_head_incr(sd);
4556                         if (++work >= quota) {
4557                                 local_irq_enable();
4558                                 return work;
4559                         }
4560                 }
4561
4562                 rps_lock(sd);
4563                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4564                         /*
4565                          * Inline a custom version of __napi_complete().
4566                          * Only the current CPU owns and manipulates this napi,
4567                          * and NAPI_STATE_SCHED is the only possible flag set
4568                          * on backlog.
4569                          * We can use a plain write instead of clear_bit(),
4570                          * and we don't need an smp_mb() memory barrier.
4571                          */
4572                         napi->state = 0;
4573                         rps_unlock(sd);
4574
4575                         break;
4576                 }
4577
4578                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4579                                            &sd->process_queue);
4580                 rps_unlock(sd);
4581         }
4582         local_irq_enable();
4583
4584         return work;
4585 }
4586
4587 /**
4588  * __napi_schedule - schedule for receive
4589  * @n: entry to schedule
4590  *
4591  * The entry's receive function will be scheduled to run.
4592  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4593  */
4594 void __napi_schedule(struct napi_struct *n)
4595 {
4596         unsigned long flags;
4597
4598         local_irq_save(flags);
4599         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4600         local_irq_restore(flags);
4601 }
4602 EXPORT_SYMBOL(__napi_schedule);
4603
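/* Illustrative sketch (not compiled): the interrupt-handler half of NAPI
 * scheduling in a hypothetical driver.  foo_priv and foo_disable_rx_irq()
 * are assumptions; napi_schedule_prep() and __napi_schedule() are the real
 * APIs (a handler that runs with hard irqs masked could use
 * __napi_schedule_irqoff() below instead).
 */
#if 0
static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
        struct foo_priv *fp = dev_id;

        if (napi_schedule_prep(&fp->napi)) {
                foo_disable_rx_irq(fp); /* no more RX interrupts until poll */
                __napi_schedule(&fp->napi);
        }
        return IRQ_HANDLED;
}
#endif
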
4604 /**
4605  * __napi_schedule_irqoff - schedule for receive
4606  * @n: entry to schedule
4607  *
4608  * Variant of __napi_schedule() assuming hard irqs are masked
4609  */
4610 void __napi_schedule_irqoff(struct napi_struct *n)
4611 {
4612         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4613 }
4614 EXPORT_SYMBOL(__napi_schedule_irqoff);
4615
4616 void __napi_complete(struct napi_struct *n)
4617 {
4618         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4619
4620         list_del_init(&n->poll_list);
4621         smp_mb__before_atomic();
4622         clear_bit(NAPI_STATE_SCHED, &n->state);
4623 }
4624 EXPORT_SYMBOL(__napi_complete);
4625
4626 void napi_complete_done(struct napi_struct *n, int work_done)
4627 {
4628         unsigned long flags;
4629
4630         /*
4631          * don't let napi dequeue from the cpu poll list
4632          * just in case it's running on a different cpu
4633          */
4634         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4635                 return;
4636
4637         if (n->gro_list) {
4638                 unsigned long timeout = 0;
4639
4640                 if (work_done)
4641                         timeout = n->dev->gro_flush_timeout;
4642
4643                 if (timeout)
4644                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4645                                       HRTIMER_MODE_REL_PINNED);
4646                 else
4647                         napi_gro_flush(n, false);
4648         }
4649         if (likely(list_empty(&n->poll_list))) {
4650                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4651         } else {
4652                 /* If n->poll_list is not empty, we need to mask irqs */
4653                 local_irq_save(flags);
4654                 __napi_complete(n);
4655                 local_irq_restore(flags);
4656         }
4657 }
4658 EXPORT_SYMBOL(napi_complete_done);
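
/*
 * Example usage: a minimal sketch of a conforming driver poll routine
 * ("my_poll", "struct my_priv" and the my_hw_*() helpers are
 * hypothetical).  It returns the number of packets processed and only
 * calls napi_complete_done() when less than the budget was consumed,
 * which is the contract enforced by napi_poll() later in this file.
 */
#if 0
static int my_poll(struct napi_struct *napi, int budget)
{
        struct my_priv *priv = container_of(napi, struct my_priv, napi);
        int work = 0;

        while (work < budget && my_hw_rx_pending(priv)) {
                my_hw_receive_one(priv);        /* builds an skb and hands it to GRO */
                work++;
        }

        if (work < budget) {
                /* Everything pending was handled: leave polled mode and
                 * let the device interrupt us again.
                 */
                napi_complete_done(napi, work);
                my_hw_unmask_rx_irq(priv);
        }

        return work;
}
#endif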
4659
4660 /* must be called under rcu_read_lock(), as we don't take a reference */
4661 struct napi_struct *napi_by_id(unsigned int napi_id)
4662 {
4663         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4664         struct napi_struct *napi;
4665
4666         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4667                 if (napi->napi_id == napi_id)
4668                         return napi;
4669
4670         return NULL;
4671 }
4672 EXPORT_SYMBOL_GPL(napi_by_id);
4673
4674 void napi_hash_add(struct napi_struct *napi)
4675 {
4676         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4677
4678                 spin_lock(&napi_hash_lock);
4679
4680                 /* 0 is not a valid id; we also skip an id that is already taken.
4681                  * We expect both events to be extremely rare.
4682                  */
4683                 napi->napi_id = 0;
4684                 while (!napi->napi_id) {
4685                         napi->napi_id = ++napi_gen_id;
4686                         if (napi_by_id(napi->napi_id))
4687                                 napi->napi_id = 0;
4688                 }
4689
4690                 hlist_add_head_rcu(&napi->napi_hash_node,
4691                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4692
4693                 spin_unlock(&napi_hash_lock);
4694         }
4695 }
4696 EXPORT_SYMBOL_GPL(napi_hash_add);
4697
4698 /* Warning: the caller is responsible for making sure an rcu grace period
4699  * has elapsed before freeing the memory containing @napi.
4700  */
4701 void napi_hash_del(struct napi_struct *napi)
4702 {
4703         spin_lock(&napi_hash_lock);
4704
4705         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4706                 hlist_del_rcu(&napi->napi_hash_node);
4707
4708         spin_unlock(&napi_hash_lock);
4709 }
4710 EXPORT_SYMBOL_GPL(napi_hash_del);
4711
4712 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4713 {
4714         struct napi_struct *napi;
4715
4716         napi = container_of(timer, struct napi_struct, timer);
4717         if (napi->gro_list)
4718                 napi_schedule(napi);
4719
4720         return HRTIMER_NORESTART;
4721 }
4722
4723 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4724                     int (*poll)(struct napi_struct *, int), int weight)
4725 {
4726         INIT_LIST_HEAD(&napi->poll_list);
4727         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4728         napi->timer.function = napi_watchdog;
4729         napi->gro_count = 0;
4730         napi->gro_list = NULL;
4731         napi->skb = NULL;
4732         napi->poll = poll;
4733         if (weight > NAPI_POLL_WEIGHT)
4734                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4735                             weight, dev->name);
4736         napi->weight = weight;
4737         list_add(&napi->dev_list, &dev->napi_list);
4738         napi->dev = dev;
4739 #ifdef CONFIG_NETPOLL
4740         spin_lock_init(&napi->poll_lock);
4741         napi->poll_owner = -1;
4742 #endif
4743         set_bit(NAPI_STATE_SCHED, &napi->state);
4744 }
4745 EXPORT_SYMBOL(netif_napi_add);
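
/*
 * Example usage: a minimal sketch of the usual registration sequence in
 * a driver open path ("my_open", "struct my_priv" and "my_poll" are
 * hypothetical).  netif_napi_add() leaves NAPI_STATE_SCHED set, so the
 * instance cannot be scheduled until napi_enable() clears it.
 */
#if 0
static int my_open(struct net_device *netdev)
{
        struct my_priv *priv = netdev_priv(netdev);

        netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
        napi_enable(&priv->napi);
        return 0;
}
#endif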
4746
4747 void napi_disable(struct napi_struct *n)
4748 {
4749         might_sleep();
4750         set_bit(NAPI_STATE_DISABLE, &n->state);
4751
4752         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4753                 msleep(1);
4754         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4755                 msleep(1);
4756
4757         hrtimer_cancel(&n->timer);
4758
4759         clear_bit(NAPI_STATE_DISABLE, &n->state);
4760 }
4761 EXPORT_SYMBOL(napi_disable);
4762
4763 void netif_napi_del(struct napi_struct *napi)
4764 {
4765         list_del_init(&napi->dev_list);
4766         napi_free_frags(napi);
4767
4768         kfree_skb_list(napi->gro_list);
4769         napi->gro_list = NULL;
4770         napi->gro_count = 0;
4771 }
4772 EXPORT_SYMBOL(netif_napi_del);
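
/*
 * Example usage: a minimal sketch of the matching teardown (hypothetical
 * driver code).  napi_disable() must come first so that no poll is still
 * running when netif_napi_del() releases the GRO state.
 */
#if 0
static void my_stop(struct my_priv *priv)
{
        napi_disable(&priv->napi);      /* waits for an in-flight poll */
        netif_napi_del(&priv->napi);    /* unlink and free pending GRO packets */
}
#endif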
4773
4774 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4775 {
4776         void *have;
4777         int work, weight;
4778
4779         list_del_init(&n->poll_list);
4780
4781         have = netpoll_poll_lock(n);
4782
4783         weight = n->weight;
4784
4785         /* This NAPI_STATE_SCHED test is for avoiding a race
4786          * with netpoll's poll_napi().  Only the entity which
4787          * obtains the lock and sees NAPI_STATE_SCHED set will
4788          * actually make the ->poll() call.  Therefore we avoid
4789          * accidentally calling ->poll() when NAPI is not scheduled.
4790          */
4791         work = 0;
4792         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4793                 work = n->poll(n, weight);
4794                 trace_napi_poll(n);
4795         }
4796
4797         WARN_ON_ONCE(work > weight);
4798
4799         if (likely(work < weight))
4800                 goto out_unlock;
4801
4802         /* Drivers must not modify the NAPI state if they
4803          * consume the entire weight.  In such cases this code
4804          * still "owns" the NAPI instance and therefore can
4805          * move the instance around on the list at-will.
4806          */
4807         if (unlikely(napi_disable_pending(n))) {
4808                 napi_complete(n);
4809                 goto out_unlock;
4810         }
4811
4812         if (n->gro_list) {
4813                 /* Flush packets that are too old.
4814                  * If HZ < 1000, flush all packets.
4815                  */
4816                 napi_gro_flush(n, HZ >= 1000);
4817         }
4818
4819         /* Some drivers may have called napi_schedule
4820          * prior to exhausting their budget.
4821          */
4822         if (unlikely(!list_empty(&n->poll_list))) {
4823                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4824                              n->dev ? n->dev->name : "backlog");
4825                 goto out_unlock;
4826         }
4827
4828         list_add_tail(&n->poll_list, repoll);
4829
4830 out_unlock:
4831         netpoll_poll_unlock(have);
4832
4833         return work;
4834 }
4835
4836 static void net_rx_action(struct softirq_action *h)
4837 {
4838         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4839         unsigned long time_limit = jiffies + 2;
4840         int budget = netdev_budget;
4841         LIST_HEAD(list);
4842         LIST_HEAD(repoll);
4843
4844         local_irq_disable();
4845         list_splice_init(&sd->poll_list, &list);
4846         local_irq_enable();
4847
4848         for (;;) {
4849                 struct napi_struct *n;
4850
4851                 if (list_empty(&list)) {
4852                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4853                                 return;
4854                         break;
4855                 }
4856
4857                 n = list_first_entry(&list, struct napi_struct, poll_list);
4858                 budget -= napi_poll(n, &repoll);
4859
4860                 /* If softirq window is exhausted then punt.
4861                  * Allow this to run for 2 jiffies, which allows
4862                  * an average latency of 1.5/HZ.
4863                  */
4864                 if (unlikely(budget <= 0 ||
4865                              time_after_eq(jiffies, time_limit))) {
4866                         sd->time_squeeze++;
4867                         break;
4868                 }
4869         }
4870
4871         local_irq_disable();
4872
4873         list_splice_tail_init(&sd->poll_list, &list);
4874         list_splice_tail(&repoll, &list);
4875         list_splice(&list, &sd->poll_list);
4876         if (!list_empty(&sd->poll_list))
4877                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4878
4879         net_rps_action_and_irq_enable(sd);
4880 }
4881
4882 struct netdev_adjacent {
4883         struct net_device *dev;
4884
4885         /* upper master flag, there can only be one master device per list */
4886         bool master;
4887
4888         /* counter for the number of times this device was added to us */
4889         u16 ref_nr;
4890
4891         /* private field for the users */
4892         void *private;
4893
4894         struct list_head list;
4895         struct rcu_head rcu;
4896 };
4897
4898 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
4899                                                  struct list_head *adj_list)
4900 {
4901         struct netdev_adjacent *adj;
4902
4903         list_for_each_entry(adj, adj_list, list) {
4904                 if (adj->dev == adj_dev)
4905                         return adj;
4906         }
4907         return NULL;
4908 }
4909
4910 /**
4911  * netdev_has_upper_dev - Check if device is linked to an upper device
4912  * @dev: device
4913  * @upper_dev: upper device to check
4914  *
4915  * Find out if a device is linked to the specified upper device and return true
4916  * in case it is. Note that this checks only the immediate upper device,
4917  * not through a complete stack of devices. The caller must hold the RTNL lock.
4918  */
4919 bool netdev_has_upper_dev(struct net_device *dev,
4920                           struct net_device *upper_dev)
4921 {
4922         ASSERT_RTNL();
4923
4924         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
4925 }
4926 EXPORT_SYMBOL(netdev_has_upper_dev);
4927
4928 /**
4929  * netdev_has_any_upper_dev - Check if device is linked to some device
4930  * @dev: device
4931  *
4932  * Find out if a device is linked to an upper device and return true in case
4933  * it is. The caller must hold the RTNL lock.
4934  */
4935 static bool netdev_has_any_upper_dev(struct net_device *dev)
4936 {
4937         ASSERT_RTNL();
4938
4939         return !list_empty(&dev->all_adj_list.upper);
4940 }
4941
4942 /**
4943  * netdev_master_upper_dev_get - Get master upper device
4944  * @dev: device
4945  *
4946  * Find a master upper device and return pointer to it or NULL in case
4947  * it's not there. The caller must hold the RTNL lock.
4948  */
4949 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4950 {
4951         struct netdev_adjacent *upper;
4952
4953         ASSERT_RTNL();
4954
4955         if (list_empty(&dev->adj_list.upper))
4956                 return NULL;
4957
4958         upper = list_first_entry(&dev->adj_list.upper,
4959                                  struct netdev_adjacent, list);
4960         if (likely(upper->master))
4961                 return upper->dev;
4962         return NULL;
4963 }
4964 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4965
4966 void *netdev_adjacent_get_private(struct list_head *adj_list)
4967 {
4968         struct netdev_adjacent *adj;
4969
4970         adj = list_entry(adj_list, struct netdev_adjacent, list);
4971
4972         return adj->private;
4973 }
4974 EXPORT_SYMBOL(netdev_adjacent_get_private);
4975
4976 /**
4977  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4978  * @dev: device
4979  * @iter: list_head ** of the current position
4980  *
4981  * Gets the next device from the dev's upper list, starting from iter
4982  * position. The caller must hold RCU read lock.
4983  */
4984 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4985                                                  struct list_head **iter)
4986 {
4987         struct netdev_adjacent *upper;
4988
4989         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4990
4991         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4992
4993         if (&upper->list == &dev->adj_list.upper)
4994                 return NULL;
4995
4996         *iter = &upper->list;
4997
4998         return upper->dev;
4999 }
5000 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5001
5002 /**
5003  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5004  * @dev: device
5005  * @iter: list_head ** of the current position
5006  *
5007  * Gets the next device from the dev's upper list, starting from iter
5008  * position. The caller must hold RCU read lock.
5009  */
5010 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5011                                                      struct list_head **iter)
5012 {
5013         struct netdev_adjacent *upper;
5014
5015         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5016
5017         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5018
5019         if (&upper->list == &dev->all_adj_list.upper)
5020                 return NULL;
5021
5022         *iter = &upper->list;
5023
5024         return upper->dev;
5025 }
5026 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5027
5028 /**
5029  * netdev_lower_get_next_private - Get the next ->private from the
5030  *                                 lower neighbour list
5031  * @dev: device
5032  * @iter: list_head ** of the current position
5033  *
5034  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5035  * list, starting from iter position. The caller must either hold the
5036  * RTNL lock or its own locking that guarantees that the neighbour lower
5037  * list will remain unchanged.
5038  */
5039 void *netdev_lower_get_next_private(struct net_device *dev,
5040                                     struct list_head **iter)
5041 {
5042         struct netdev_adjacent *lower;
5043
5044         lower = list_entry(*iter, struct netdev_adjacent, list);
5045
5046         if (&lower->list == &dev->adj_list.lower)
5047                 return NULL;
5048
5049         *iter = lower->list.next;
5050
5051         return lower->private;
5052 }
5053 EXPORT_SYMBOL(netdev_lower_get_next_private);
5054
5055 /**
5056  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5057  *                                     lower neighbour list, RCU
5058  *                                     variant
5059  * @dev: device
5060  * @iter: list_head ** of the current position
5061  *
5062  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5063  * list, starting from iter position. The caller must hold RCU read lock.
5064  */
5065 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5066                                         struct list_head **iter)
5067 {
5068         struct netdev_adjacent *lower;
5069
5070         WARN_ON_ONCE(!rcu_read_lock_held());
5071
5072         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5073
5074         if (&lower->list == &dev->adj_list.lower)
5075                 return NULL;
5076
5077         *iter = &lower->list;
5078
5079         return lower->private;
5080 }
5081 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5082
5083 /**
5084  * netdev_lower_get_next - Get the next device from the lower neighbour
5085  *                         list
5086  * @dev: device
5087  * @iter: list_head ** of the current position
5088  *
5089  * Gets the next netdev_adjacent from the dev's lower neighbour
5090  * list, starting from iter position. The caller must hold RTNL lock or
5091  * its own locking that guarantees that the neighbour lower
5092  * list will remain unchanged.
5093  */
5094 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5095 {
5096         struct netdev_adjacent *lower;
5097
5098         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5099
5100         if (&lower->list == &dev->adj_list.lower)
5101                 return NULL;
5102
5103         *iter = &lower->list;
5104
5105         return lower->dev;
5106 }
5107 EXPORT_SYMBOL(netdev_lower_get_next);
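
/*
 * Example usage: a minimal sketch of walking the immediate lower devices
 * with the netdev_for_each_lower_dev() helper from <linux/netdevice.h>,
 * which is built on netdev_lower_get_next() above; dev_get_nest_level()
 * later in this file uses the same pattern.  The caller ("my_dump_lowers",
 * hypothetical) holds RTNL.
 */
#if 0
static void my_dump_lowers(struct net_device *dev)
{
        struct net_device *lower;
        struct list_head *iter;

        ASSERT_RTNL();
        netdev_for_each_lower_dev(dev, lower, iter)
                netdev_info(dev, "lower device: %s\n", lower->name);
}
#endif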
5108
5109 /**
5110  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5111  *                                     lower neighbour list, RCU
5112  *                                     variant
5113  * @dev: device
5114  *
5115  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5116  * list. The caller must hold RCU read lock.
5117  */
5118 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5119 {
5120         struct netdev_adjacent *lower;
5121
5122         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5123                         struct netdev_adjacent, list);
5124         if (lower)
5125                 return lower->private;
5126         return NULL;
5127 }
5128 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5129
5130 /**
5131  * netdev_master_upper_dev_get_rcu - Get master upper device
5132  * @dev: device
5133  *
5134  * Find a master upper device and return pointer to it or NULL in case
5135  * it's not there. The caller must hold the RCU read lock.
5136  */
5137 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5138 {
5139         struct netdev_adjacent *upper;
5140
5141         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5142                                        struct netdev_adjacent, list);
5143         if (upper && likely(upper->master))
5144                 return upper->dev;
5145         return NULL;
5146 }
5147 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
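
/*
 * Example usage: a minimal sketch of the lockless lookup (the caller,
 * "my_has_master", is hypothetical).  The RCU variant only requires
 * rcu_read_lock(), unlike netdev_master_upper_dev_get() which asserts
 * RTNL; the returned pointer is only compared against NULL here, not
 * dereferenced after the read section ends.
 */
#if 0
static bool my_has_master(struct net_device *dev)
{
        struct net_device *master;

        rcu_read_lock();
        master = netdev_master_upper_dev_get_rcu(dev);
        rcu_read_unlock();

        return master != NULL;
}
#endif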
5148
5149 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5150                               struct net_device *adj_dev,
5151                               struct list_head *dev_list)
5152 {
5153         char linkname[IFNAMSIZ+7];
5154         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5155                 "upper_%s" : "lower_%s", adj_dev->name);
5156         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5157                                  linkname);
5158 }
5159 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5160                                char *name,
5161                                struct list_head *dev_list)
5162 {
5163         char linkname[IFNAMSIZ+7];
5164         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5165                 "upper_%s" : "lower_%s", name);
5166         sysfs_remove_link(&(dev->dev.kobj), linkname);
5167 }
5168
5169 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5170                                                  struct net_device *adj_dev,
5171                                                  struct list_head *dev_list)
5172 {
5173         return (dev_list == &dev->adj_list.upper ||
5174                 dev_list == &dev->adj_list.lower) &&
5175                 net_eq(dev_net(dev), dev_net(adj_dev));
5176 }
5177
5178 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5179                                         struct net_device *adj_dev,
5180                                         struct list_head *dev_list,
5181                                         void *private, bool master)
5182 {
5183         struct netdev_adjacent *adj;
5184         int ret;
5185
5186         adj = __netdev_find_adj(adj_dev, dev_list);
5187
5188         if (adj) {
5189                 adj->ref_nr++;
5190                 return 0;
5191         }
5192
5193         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5194         if (!adj)
5195                 return -ENOMEM;
5196
5197         adj->dev = adj_dev;
5198         adj->master = master;
5199         adj->ref_nr = 1;
5200         adj->private = private;
5201         dev_hold(adj_dev);
5202
5203         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5204                  adj_dev->name, dev->name, adj_dev->name);
5205
5206         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5207                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5208                 if (ret)
5209                         goto free_adj;
5210         }
5211
5212         /* Ensure that master link is always the first item in list. */
5213         if (master) {
5214                 ret = sysfs_create_link(&(dev->dev.kobj),
5215                                         &(adj_dev->dev.kobj), "master");
5216                 if (ret)
5217                         goto remove_symlinks;
5218
5219                 list_add_rcu(&adj->list, dev_list);
5220         } else {
5221                 list_add_tail_rcu(&adj->list, dev_list);
5222         }
5223
5224         return 0;
5225
5226 remove_symlinks:
5227         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5228                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5229 free_adj:
5230         kfree(adj);
5231         dev_put(adj_dev);
5232
5233         return ret;
5234 }
5235
5236 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5237                                          struct net_device *adj_dev,
5238                                          struct list_head *dev_list)
5239 {
5240         struct netdev_adjacent *adj;
5241
5242         adj = __netdev_find_adj(adj_dev, dev_list);
5243
5244         if (!adj) {
5245                 pr_err("tried to remove device %s from %s\n",
5246                        dev->name, adj_dev->name);
5247                 BUG();
5248         }
5249
5250         if (adj->ref_nr > 1) {
5251                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5252                          adj->ref_nr-1);
5253                 adj->ref_nr--;
5254                 return;
5255         }
5256
5257         if (adj->master)
5258                 sysfs_remove_link(&(dev->dev.kobj), "master");
5259
5260         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5261                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5262
5263         list_del_rcu(&adj->list);
5264         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5265                  adj_dev->name, dev->name, adj_dev->name);
5266         dev_put(adj_dev);
5267         kfree_rcu(adj, rcu);
5268 }
5269
5270 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5271                                             struct net_device *upper_dev,
5272                                             struct list_head *up_list,
5273                                             struct list_head *down_list,
5274                                             void *private, bool master)
5275 {
5276         int ret;
5277
5278         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5279                                            master);
5280         if (ret)
5281                 return ret;
5282
5283         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5284                                            false);
5285         if (ret) {
5286                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5287                 return ret;
5288         }
5289
5290         return 0;
5291 }
5292
5293 static int __netdev_adjacent_dev_link(struct net_device *dev,
5294                                       struct net_device *upper_dev)
5295 {
5296         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5297                                                 &dev->all_adj_list.upper,
5298                                                 &upper_dev->all_adj_list.lower,
5299                                                 NULL, false);
5300 }
5301
5302 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5303                                                struct net_device *upper_dev,
5304                                                struct list_head *up_list,
5305                                                struct list_head *down_list)
5306 {
5307         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5308         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5309 }
5310
5311 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5312                                          struct net_device *upper_dev)
5313 {
5314         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5315                                            &dev->all_adj_list.upper,
5316                                            &upper_dev->all_adj_list.lower);
5317 }
5318
5319 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5320                                                 struct net_device *upper_dev,
5321                                                 void *private, bool master)
5322 {
5323         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5324
5325         if (ret)
5326                 return ret;
5327
5328         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5329                                                &dev->adj_list.upper,
5330                                                &upper_dev->adj_list.lower,
5331                                                private, master);
5332         if (ret) {
5333                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5334                 return ret;
5335         }
5336
5337         return 0;
5338 }
5339
5340 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5341                                                    struct net_device *upper_dev)
5342 {
5343         __netdev_adjacent_dev_unlink(dev, upper_dev);
5344         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5345                                            &dev->adj_list.upper,
5346                                            &upper_dev->adj_list.lower);
5347 }
5348
5349 static int __netdev_upper_dev_link(struct net_device *dev,
5350                                    struct net_device *upper_dev, bool master,
5351                                    void *private)
5352 {
5353         struct netdev_notifier_changeupper_info changeupper_info;
5354         struct netdev_adjacent *i, *j, *to_i, *to_j;
5355         int ret = 0;
5356
5357         ASSERT_RTNL();
5358
5359         if (dev == upper_dev)
5360                 return -EBUSY;
5361
5362         /* To prevent loops, check that dev is not an upper device of upper_dev. */
5363         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5364                 return -EBUSY;
5365
5366         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5367                 return -EEXIST;
5368
5369         if (master && netdev_master_upper_dev_get(dev))
5370                 return -EBUSY;
5371
5372         changeupper_info.upper_dev = upper_dev;
5373         changeupper_info.master = master;
5374         changeupper_info.linking = true;
5375
5376         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5377                                             &changeupper_info.info);
5378         ret = notifier_to_errno(ret);
5379         if (ret)
5380                 return ret;
5381
5382         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5383                                                    master);
5384         if (ret)
5385                 return ret;
5386
5387         /* Now that we have linked these devs, make all of upper_dev's
5388          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5389          * vice versa, and don't forget the devices themselves. All of these
5390          * links are non-neighbours.
5391          */
5392         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5393                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5394                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5395                                  i->dev->name, j->dev->name);
5396                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5397                         if (ret)
5398                                 goto rollback_mesh;
5399                 }
5400         }
5401
5402         /* add dev to every upper_dev's upper device */
5403         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5404                 pr_debug("linking %s's upper device %s with %s\n",
5405                          upper_dev->name, i->dev->name, dev->name);
5406                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5407                 if (ret)
5408                         goto rollback_upper_mesh;
5409         }
5410
5411         /* add upper_dev to every dev's lower device */
5412         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5413                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5414                          i->dev->name, upper_dev->name);
5415                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5416                 if (ret)
5417                         goto rollback_lower_mesh;
5418         }
5419
5420         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5421                                       &changeupper_info.info);
5422         return 0;
5423
5424 rollback_lower_mesh:
5425         to_i = i;
5426         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5427                 if (i == to_i)
5428                         break;
5429                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5430         }
5431
5432         i = NULL;
5433
5434 rollback_upper_mesh:
5435         to_i = i;
5436         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5437                 if (i == to_i)
5438                         break;
5439                 __netdev_adjacent_dev_unlink(dev, i->dev);
5440         }
5441
5442         i = j = NULL;
5443
5444 rollback_mesh:
5445         to_i = i;
5446         to_j = j;
5447         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5448                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5449                         if (i == to_i && j == to_j)
5450                                 break;
5451                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5452                 }
5453                 if (i == to_i)
5454                         break;
5455         }
5456
5457         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5458
5459         return ret;
5460 }
5461
5462 /**
5463  * netdev_upper_dev_link - Add a link to the upper device
5464  * @dev: device
5465  * @upper_dev: new upper device
5466  *
5467  * Adds a link to device which is upper to this one. The caller must hold
5468  * the RTNL lock. On a failure a negative errno code is returned.
5469  * On success the reference counts are adjusted and the function
5470  * returns zero.
5471  */
5472 int netdev_upper_dev_link(struct net_device *dev,
5473                           struct net_device *upper_dev)
5474 {
5475         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5476 }
5477 EXPORT_SYMBOL(netdev_upper_dev_link);
5478
5479 /**
5480  * netdev_master_upper_dev_link - Add a master link to the upper device
5481  * @dev: device
5482  * @upper_dev: new upper device
5483  *
5484  * Adds a link to device which is upper to this one. In this case, only
5485  * one master upper device can be linked, although other non-master devices
5486  * might be linked as well. The caller must hold the RTNL lock.
5487  * On a failure a negative errno code is returned. On success the reference
5488  * counts are adjusted and the function returns zero.
5489  */
5490 int netdev_master_upper_dev_link(struct net_device *dev,
5491                                  struct net_device *upper_dev)
5492 {
5493         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5494 }
5495 EXPORT_SYMBOL(netdev_master_upper_dev_link);
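
/*
 * Example usage: a minimal sketch of how a master device (a bond- or
 * team-like driver; "my_enslave" and the surrounding driver are
 * hypothetical) attaches a slave.  RTNL must be held; on failure a
 * negative errno is returned and no adjacency is left behind.
 */
#if 0
static int my_enslave(struct net_device *master_dev, struct net_device *slave)
{
        int err;

        ASSERT_RTNL();
        err = netdev_master_upper_dev_link(slave, master_dev);
        if (err)
                return err;

        /* ... program driver/hardware state for the new slave here ... */
        return 0;
}
#endif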
5496
5497 int netdev_master_upper_dev_link_private(struct net_device *dev,
5498                                          struct net_device *upper_dev,
5499                                          void *private)
5500 {
5501         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5502 }
5503 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5504
5505 /**
5506  * netdev_upper_dev_unlink - Removes a link to upper device
5507  * @dev: device
5508  * @upper_dev: upper device to remove
5509  *
5510  * Removes a link to device which is upper to this one. The caller must hold
5511  * the RTNL lock.
5512  */
5513 void netdev_upper_dev_unlink(struct net_device *dev,
5514                              struct net_device *upper_dev)
5515 {
5516         struct netdev_notifier_changeupper_info changeupper_info;
5517         struct netdev_adjacent *i, *j;
5518         ASSERT_RTNL();
5519
5520         changeupper_info.upper_dev = upper_dev;
5521         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5522         changeupper_info.linking = false;
5523
5524         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5525                                       &changeupper_info.info);
5526
5527         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5528
5529         /* Here is the tricky part. We must remove all dev's lower
5530          * devices from all upper_dev's upper devices and vice
5531          * versa, to maintain the graph relationship.
5532          */
5533         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5534                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5535                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5536
5537         /* Also remove the devices themselves from each other's
5538          * lower/upper device lists.
5539          */
5540         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5541                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5542
5543         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5544                 __netdev_adjacent_dev_unlink(dev, i->dev);
5545
5546         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5547                                       &changeupper_info.info);
5548 }
5549 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5550
5551 /**
5552  * netdev_bonding_info_change - Dispatch event about slave change
5553  * @dev: device
5554  * @bonding_info: info to dispatch
5555  *
5556  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5557  * The caller must hold the RTNL lock.
5558  */
5559 void netdev_bonding_info_change(struct net_device *dev,
5560                                 struct netdev_bonding_info *bonding_info)
5561 {
5562         struct netdev_notifier_bonding_info     info;
5563
5564         memcpy(&info.bonding_info, bonding_info,
5565                sizeof(struct netdev_bonding_info));
5566         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5567                                       &info.info);
5568 }
5569 EXPORT_SYMBOL(netdev_bonding_info_change);
5570
5571 static void netdev_adjacent_add_links(struct net_device *dev)
5572 {
5573         struct netdev_adjacent *iter;
5574
5575         struct net *net = dev_net(dev);
5576
5577         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5578                 if (!net_eq(net, dev_net(iter->dev)))
5579                         continue;
5580                 netdev_adjacent_sysfs_add(iter->dev, dev,
5581                                           &iter->dev->adj_list.lower);
5582                 netdev_adjacent_sysfs_add(dev, iter->dev,
5583                                           &dev->adj_list.upper);
5584         }
5585
5586         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5587                 if (!net_eq(net, dev_net(iter->dev)))
5588                         continue;
5589                 netdev_adjacent_sysfs_add(iter->dev, dev,
5590                                           &iter->dev->adj_list.upper);
5591                 netdev_adjacent_sysfs_add(dev, iter->dev,
5592                                           &dev->adj_list.lower);
5593         }
5594 }
5595
5596 static void netdev_adjacent_del_links(struct net_device *dev)
5597 {
5598         struct netdev_adjacent *iter;
5599
5600         struct net *net = dev_net(dev);
5601
5602         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5603                 if (!net_eq(net, dev_net(iter->dev)))
5604                         continue;
5605                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5606                                           &iter->dev->adj_list.lower);
5607                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5608                                           &dev->adj_list.upper);
5609         }
5610
5611         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5612                 if (!net_eq(net, dev_net(iter->dev)))
5613                         continue;
5614                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5615                                           &iter->dev->adj_list.upper);
5616                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5617                                           &dev->adj_list.lower);
5618         }
5619 }
5620
5621 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5622 {
5623         struct netdev_adjacent *iter;
5624
5625         struct net *net = dev_net(dev);
5626
5627         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5628                 if (!net_eq(net, dev_net(iter->dev)))
5629                         continue;
5630                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5631                                           &iter->dev->adj_list.lower);
5632                 netdev_adjacent_sysfs_add(iter->dev, dev,
5633                                           &iter->dev->adj_list.lower);
5634         }
5635
5636         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5637                 if (!net_eq(net, dev_net(iter->dev)))
5638                         continue;
5639                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5640                                           &iter->dev->adj_list.upper);
5641                 netdev_adjacent_sysfs_add(iter->dev, dev,
5642                                           &iter->dev->adj_list.upper);
5643         }
5644 }
5645
5646 void *netdev_lower_dev_get_private(struct net_device *dev,
5647                                    struct net_device *lower_dev)
5648 {
5649         struct netdev_adjacent *lower;
5650
5651         if (!lower_dev)
5652                 return NULL;
5653         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5654         if (!lower)
5655                 return NULL;
5656
5657         return lower->private;
5658 }
5659 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5660
5661
5662 int dev_get_nest_level(struct net_device *dev,
5663                        bool (*type_check)(struct net_device *dev))
5664 {
5665         struct net_device *lower = NULL;
5666         struct list_head *iter;
5667         int max_nest = -1;
5668         int nest;
5669
5670         ASSERT_RTNL();
5671
5672         netdev_for_each_lower_dev(dev, lower, iter) {
5673                 nest = dev_get_nest_level(lower, type_check);
5674                 if (max_nest < nest)
5675                         max_nest = nest;
5676         }
5677
5678         if (type_check(dev))
5679                 max_nest++;
5680
5681         return max_nest;
5682 }
5683 EXPORT_SYMBOL(dev_get_nest_level);
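
/*
 * Example usage: a minimal sketch of how a stacking driver might use the
 * nesting level ("is_my_stacked_dev" and "my_get_lock_subclass" are
 * hypothetical; drivers such as vlan follow this pattern with their own
 * type-check functions).  The result is typically used to pick a lockdep
 * subclass for the per-device locks of stacked devices.
 */
#if 0
static bool is_my_stacked_dev(struct net_device *dev);

static int my_get_lock_subclass(struct net_device *dev)
{
        return dev_get_nest_level(dev, is_my_stacked_dev);
}
#endif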
5684
5685 static void dev_change_rx_flags(struct net_device *dev, int flags)
5686 {
5687         const struct net_device_ops *ops = dev->netdev_ops;
5688
5689         if (ops->ndo_change_rx_flags)
5690                 ops->ndo_change_rx_flags(dev, flags);
5691 }
5692
5693 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5694 {
5695         unsigned int old_flags = dev->flags;
5696         kuid_t uid;
5697         kgid_t gid;
5698
5699         ASSERT_RTNL();
5700
5701         dev->flags |= IFF_PROMISC;
5702         dev->promiscuity += inc;
5703         if (dev->promiscuity == 0) {
5704                 /*
5705                  * Avoid overflow.
5706                  * If inc would overflow, leave promiscuity untouched and return an error.
5707                  */
5708                 if (inc < 0)
5709                         dev->flags &= ~IFF_PROMISC;
5710                 else {
5711                         dev->promiscuity -= inc;
5712                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5713                                 dev->name);
5714                         return -EOVERFLOW;
5715                 }
5716         }
5717         if (dev->flags != old_flags) {
5718                 pr_info("device %s %s promiscuous mode\n",
5719                         dev->name,
5720                         dev->flags & IFF_PROMISC ? "entered" : "left");
5721                 if (audit_enabled) {
5722                         current_uid_gid(&uid, &gid);
5723                         audit_log(current->audit_context, GFP_ATOMIC,
5724                                 AUDIT_ANOM_PROMISCUOUS,
5725                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5726                                 dev->name, (dev->flags & IFF_PROMISC),
5727                                 (old_flags & IFF_PROMISC),
5728                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5729                                 from_kuid(&init_user_ns, uid),
5730                                 from_kgid(&init_user_ns, gid),
5731                                 audit_get_sessionid(current));
5732                 }
5733
5734                 dev_change_rx_flags(dev, IFF_PROMISC);
5735         }
5736         if (notify)
5737                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5738         return 0;
5739 }
5740
5741 /**
5742  *      dev_set_promiscuity     - update promiscuity count on a device
5743  *      @dev: device
5744  *      @inc: modifier
5745  *
5746  *      Add or remove promiscuity from a device. While the count in the device
5747  *      remains above zero the interface remains promiscuous. Once it hits zero
5748  *      the device reverts back to normal filtering operation. A negative inc
5749  *      value is used to drop promiscuity on the device.
5750  *      Return 0 if successful or a negative errno code on error.
5751  */
5752 int dev_set_promiscuity(struct net_device *dev, int inc)
5753 {
5754         unsigned int old_flags = dev->flags;
5755         int err;
5756
5757         err = __dev_set_promiscuity(dev, inc, true);
5758         if (err < 0)
5759                 return err;
5760         if (dev->flags != old_flags)
5761                 dev_set_rx_mode(dev);
5762         return err;
5763 }
5764 EXPORT_SYMBOL(dev_set_promiscuity);
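
/*
 * Example usage: a minimal sketch of the counted usage pattern (the
 * capture-style caller is hypothetical): bump promiscuity while
 * capturing and drop it again when done, so other users of the device
 * are unaffected.  RTNL must be held.
 */
#if 0
static int my_start_capture(struct net_device *dev)
{
        ASSERT_RTNL();
        return dev_set_promiscuity(dev, 1);
}

static void my_stop_capture(struct net_device *dev)
{
        ASSERT_RTNL();
        dev_set_promiscuity(dev, -1);
}
#endif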
5765
5766 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5767 {
5768         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5769
5770         ASSERT_RTNL();
5771
5772         dev->flags |= IFF_ALLMULTI;
5773         dev->allmulti += inc;
5774         if (dev->allmulti == 0) {
5775                 /*
5776                  * Avoid overflow.
5777                  * If inc would overflow, leave allmulti untouched and return an error.
5778                  */
5779                 if (inc < 0)
5780                         dev->flags &= ~IFF_ALLMULTI;
5781                 else {
5782                         dev->allmulti -= inc;
5783                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5784                                 dev->name);
5785                         return -EOVERFLOW;
5786                 }
5787         }
5788         if (dev->flags ^ old_flags) {
5789                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5790                 dev_set_rx_mode(dev);
5791                 if (notify)
5792                         __dev_notify_flags(dev, old_flags,
5793                                            dev->gflags ^ old_gflags);
5794         }
5795         return 0;
5796 }
5797
5798 /**
5799  *      dev_set_allmulti        - update allmulti count on a device
5800  *      @dev: device
5801  *      @inc: modifier
5802  *
5803  *      Add or remove reception of all multicast frames to a device. While the
5804  *      count in the device remains above zero the interface remains listening
5805  *      to all multicast frames. Once it hits zero the device reverts back to normal
5806  *      filtering operation. A negative @inc value is used to drop the counter
5807  *      when releasing a resource needing all multicasts.
5808  *      Return 0 if successful or a negative errno code on error.
5809  */
5810
5811 int dev_set_allmulti(struct net_device *dev, int inc)
5812 {
5813         return __dev_set_allmulti(dev, inc, true);
5814 }
5815 EXPORT_SYMBOL(dev_set_allmulti);
5816
5817 /*
5818  *      Upload unicast and multicast address lists to device and
5819  *      configure RX filtering. When the device doesn't support unicast
5820  *      filtering it is put in promiscuous mode while unicast addresses
5821  *      are present.
5822  */
5823 void __dev_set_rx_mode(struct net_device *dev)
5824 {
5825         const struct net_device_ops *ops = dev->netdev_ops;
5826
5827         /* dev_open will call this function so the list will stay sane. */
5828         if (!(dev->flags&IFF_UP))
5829                 return;
5830
5831         if (!netif_device_present(dev))
5832                 return;
5833
5834         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5835                 /* Unicast address changes may only happen under the rtnl,
5836                  * therefore calling __dev_set_promiscuity here is safe.
5837                  */
5838                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5839                         __dev_set_promiscuity(dev, 1, false);
5840                         dev->uc_promisc = true;
5841                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5842                         __dev_set_promiscuity(dev, -1, false);
5843                         dev->uc_promisc = false;
5844                 }
5845         }
5846
5847         if (ops->ndo_set_rx_mode)
5848                 ops->ndo_set_rx_mode(dev);
5849 }
5850
5851 void dev_set_rx_mode(struct net_device *dev)
5852 {
5853         netif_addr_lock_bh(dev);
5854         __dev_set_rx_mode(dev);
5855         netif_addr_unlock_bh(dev);
5856 }
5857
5858 /**
5859  *      dev_get_flags - get flags reported to userspace
5860  *      @dev: device
5861  *
5862  *      Get the combination of flag bits exported through APIs to userspace.
5863  */
5864 unsigned int dev_get_flags(const struct net_device *dev)
5865 {
5866         unsigned int flags;
5867
5868         flags = (dev->flags & ~(IFF_PROMISC |
5869                                 IFF_ALLMULTI |
5870                                 IFF_RUNNING |
5871                                 IFF_LOWER_UP |
5872                                 IFF_DORMANT)) |
5873                 (dev->gflags & (IFF_PROMISC |
5874                                 IFF_ALLMULTI));
5875
5876         if (netif_running(dev)) {
5877                 if (netif_oper_up(dev))
5878                         flags |= IFF_RUNNING;
5879                 if (netif_carrier_ok(dev))
5880                         flags |= IFF_LOWER_UP;
5881                 if (netif_dormant(dev))
5882                         flags |= IFF_DORMANT;
5883         }
5884
5885         return flags;
5886 }
5887 EXPORT_SYMBOL(dev_get_flags);
5888
5889 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5890 {
5891         unsigned int old_flags = dev->flags;
5892         int ret;
5893
5894         ASSERT_RTNL();
5895
5896         /*
5897          *      Set the flags on our device.
5898          */
5899
5900         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5901                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5902                                IFF_AUTOMEDIA)) |
5903                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5904                                     IFF_ALLMULTI));
5905
5906         /*
5907          *      Load in the correct multicast list now the flags have changed.
5908          */
5909
5910         if ((old_flags ^ flags) & IFF_MULTICAST)
5911                 dev_change_rx_flags(dev, IFF_MULTICAST);
5912
5913         dev_set_rx_mode(dev);
5914
5915         /*
5916          *      Have we downed the interface? We handle IFF_UP ourselves
5917          *      according to user attempts to set it, rather than blindly
5918          *      setting it.
5919          */
5920
5921         ret = 0;
5922         if ((old_flags ^ flags) & IFF_UP)
5923                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5924
5925         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5926                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5927                 unsigned int old_flags = dev->flags;
5928
5929                 dev->gflags ^= IFF_PROMISC;
5930
5931                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5932                         if (dev->flags != old_flags)
5933                                 dev_set_rx_mode(dev);
5934         }
5935
5936         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5937            is important. Some (broken) drivers set IFF_PROMISC when
5938            IFF_ALLMULTI is requested, without asking us and without reporting it.
5939          */
5940         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5941                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5942
5943                 dev->gflags ^= IFF_ALLMULTI;
5944                 __dev_set_allmulti(dev, inc, false);
5945         }
5946
5947         return ret;
5948 }
5949
5950 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5951                         unsigned int gchanges)
5952 {
5953         unsigned int changes = dev->flags ^ old_flags;
5954
5955         if (gchanges)
5956                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5957
5958         if (changes & IFF_UP) {
5959                 if (dev->flags & IFF_UP)
5960                         call_netdevice_notifiers(NETDEV_UP, dev);
5961                 else
5962                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5963         }
5964
5965         if (dev->flags & IFF_UP &&
5966             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5967                 struct netdev_notifier_change_info change_info;
5968
5969                 change_info.flags_changed = changes;
5970                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5971                                               &change_info.info);
5972         }
5973 }
5974
5975 /**
5976  *      dev_change_flags - change device settings
5977  *      @dev: device
5978  *      @flags: device state flags
5979  *
5980  *      Change settings on device based state flags. The flags are
5981  *      in the userspace exported format.
5982  */
5983 int dev_change_flags(struct net_device *dev, unsigned int flags)
5984 {
5985         int ret;
5986         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5987
5988         ret = __dev_change_flags(dev, flags);
5989         if (ret < 0)
5990                 return ret;
5991
5992         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5993         __dev_notify_flags(dev, old_flags, changes);
5994         return ret;
5995 }
5996 EXPORT_SYMBOL(dev_change_flags);
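
/*
 * Example usage: a minimal sketch of bringing an interface
 * administratively up from kernel code (the caller is hypothetical);
 * the flags are passed in the userspace-exported format, as the
 * SIOCSIFFLAGS ioctl path does.
 */
#if 0
static int my_bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_change_flags(dev, dev->flags | IFF_UP);
        rtnl_unlock();

        return err;
}
#endif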
5997
5998 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5999 {
6000         const struct net_device_ops *ops = dev->netdev_ops;
6001
6002         if (ops->ndo_change_mtu)
6003                 return ops->ndo_change_mtu(dev, new_mtu);
6004
6005         dev->mtu = new_mtu;
6006         return 0;
6007 }
6008
6009 /**
6010  *      dev_set_mtu - Change maximum transfer unit
6011  *      @dev: device
6012  *      @new_mtu: new transfer unit
6013  *
6014  *      Change the maximum transfer size of the network device.
6015  */
6016 int dev_set_mtu(struct net_device *dev, int new_mtu)
6017 {
6018         int err, orig_mtu;
6019
6020         if (new_mtu == dev->mtu)
6021                 return 0;
6022
6023         /*      MTU must be positive.    */
6024         if (new_mtu < 0)
6025                 return -EINVAL;
6026
6027         if (!netif_device_present(dev))
6028                 return -ENODEV;
6029
6030         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6031         err = notifier_to_errno(err);
6032         if (err)
6033                 return err;
6034
6035         orig_mtu = dev->mtu;
6036         err = __dev_set_mtu(dev, new_mtu);
6037
6038         if (!err) {
6039                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6040                 err = notifier_to_errno(err);
6041                 if (err) {
6042                         /* setting mtu back and notifying everyone again,
6043                          * so that they have a chance to revert changes.
6044                          */
6045                         __dev_set_mtu(dev, orig_mtu);
6046                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6047                 }
6048         }
6049         return err;
6050 }
6051 EXPORT_SYMBOL(dev_set_mtu);
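
/*
 * Example usage: a minimal sketch (hypothetical caller) of changing the
 * MTU under RTNL, letting the NETDEV_PRECHANGEMTU/NETDEV_CHANGEMTU
 * notifiers veto or react to the change.
 */
#if 0
static int my_set_jumbo_mtu(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);   /* a typical jumbo-frame MTU */
        rtnl_unlock();

        return err;
}
#endif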
6052
6053 /**
6054  *      dev_set_group - Change group this device belongs to
6055  *      @dev: device
6056  *      @new_group: group this device should belong to
6057  */
6058 void dev_set_group(struct net_device *dev, int new_group)
6059 {
6060         dev->group = new_group;
6061 }
6062 EXPORT_SYMBOL(dev_set_group);
6063
6064 /**
6065  *      dev_set_mac_address - Change Media Access Control Address
6066  *      @dev: device
6067  *      @sa: new address
6068  *
6069  *      Change the hardware (MAC) address of the device
6070  */
6071 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6072 {
6073         const struct net_device_ops *ops = dev->netdev_ops;
6074         int err;
6075
6076         if (!ops->ndo_set_mac_address)
6077                 return -EOPNOTSUPP;
6078         if (sa->sa_family != dev->type)
6079                 return -EINVAL;
6080         if (!netif_device_present(dev))
6081                 return -ENODEV;
6082         err = ops->ndo_set_mac_address(dev, sa);
6083         if (err)
6084                 return err;
6085         dev->addr_assign_type = NET_ADDR_SET;
6086         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6087         add_device_randomness(dev->dev_addr, dev->addr_len);
6088         return 0;
6089 }
6090 EXPORT_SYMBOL(dev_set_mac_address);
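
/*
 * Example usage: a minimal sketch (hypothetical caller) of setting a new
 * hardware address: build a struct sockaddr whose sa_family matches
 * dev->type and hand it to dev_set_mac_address() under RTNL.
 */
#if 0
static int my_set_mac(struct net_device *dev, const u8 *new_mac)
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, new_mac, dev->addr_len);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();

        return err;
}
#endif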
6091
6092 /**
6093  *      dev_change_carrier - Change device carrier
6094  *      @dev: device
6095  *      @new_carrier: new value
6096  *
6097  *      Change device carrier
6098  */
6099 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6100 {
6101         const struct net_device_ops *ops = dev->netdev_ops;
6102
6103         if (!ops->ndo_change_carrier)
6104                 return -EOPNOTSUPP;
6105         if (!netif_device_present(dev))
6106                 return -ENODEV;
6107         return ops->ndo_change_carrier(dev, new_carrier);
6108 }
6109 EXPORT_SYMBOL(dev_change_carrier);
6110
6111 /**
6112  *      dev_get_phys_port_id - Get device physical port ID
6113  *      @dev: device
6114  *      @ppid: port ID
6115  *
6116  *      Get device physical port ID
6117  */
6118 int dev_get_phys_port_id(struct net_device *dev,
6119                          struct netdev_phys_item_id *ppid)
6120 {
6121         const struct net_device_ops *ops = dev->netdev_ops;
6122
6123         if (!ops->ndo_get_phys_port_id)
6124                 return -EOPNOTSUPP;
6125         return ops->ndo_get_phys_port_id(dev, ppid);
6126 }
6127 EXPORT_SYMBOL(dev_get_phys_port_id);
6128
6129 /**
6130  *      dev_get_phys_port_name - Get device physical port name
6131  *      @dev: device
6132  *      @name: buffer for the port name (at most @len bytes)
6133  *
6134  *      Get device physical port name
6135  */
6136 int dev_get_phys_port_name(struct net_device *dev,
6137                            char *name, size_t len)
6138 {
6139         const struct net_device_ops *ops = dev->netdev_ops;
6140
6141         if (!ops->ndo_get_phys_port_name)
6142                 return -EOPNOTSUPP;
6143         return ops->ndo_get_phys_port_name(dev, name, len);
6144 }
6145 EXPORT_SYMBOL(dev_get_phys_port_name);
6146
6147 /**
6148  *      dev_change_proto_down - update protocol port state information
6149  *      @dev: device
6150  *      @proto_down: new value
6151  *
6152  *      This info can be used by switch drivers to set the phys state of the
6153  *      port.
6154  */
6155 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6156 {
6157         const struct net_device_ops *ops = dev->netdev_ops;
6158
6159         if (!ops->ndo_change_proto_down)
6160                 return -EOPNOTSUPP;
6161         if (!netif_device_present(dev))
6162                 return -ENODEV;
6163         return ops->ndo_change_proto_down(dev, proto_down);
6164 }
6165 EXPORT_SYMBOL(dev_change_proto_down);
6166
6167 /**
6168  *      dev_new_index   -       allocate an ifindex
6169  *      @net: the applicable net namespace
6170  *
6171  *      Returns a suitable unique value for a new device interface
6172  *      number.  The caller must hold the rtnl semaphore or the
6173  *      dev_base_lock to be sure it remains unique.
6174  */
6175 static int dev_new_index(struct net *net)
6176 {
6177         int ifindex = net->ifindex;
6178         for (;;) {
6179                 if (++ifindex <= 0)
6180                         ifindex = 1;
6181                 if (!__dev_get_by_index(net, ifindex))
6182                         return net->ifindex = ifindex;
6183         }
6184 }
6185
6186 /* Delayed registration/unregistration */
6187 static LIST_HEAD(net_todo_list);
6188 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6189
6190 static void net_set_todo(struct net_device *dev)
6191 {
6192         list_add_tail(&dev->todo_list, &net_todo_list);
6193         dev_net(dev)->dev_unreg_count++;
6194 }
6195
6196 static void rollback_registered_many(struct list_head *head)
6197 {
6198         struct net_device *dev, *tmp;
6199         LIST_HEAD(close_head);
6200
6201         BUG_ON(dev_boot_phase);
6202         ASSERT_RTNL();
6203
6204         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6205                 /* Some devices call this without having been registered,
6206                  * as part of initialization unwind. Remove those
6207                  * devices and proceed with the remaining ones.
6208                  */
6209                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6210                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6211                                  dev->name, dev);
6212
6213                         WARN_ON(1);
6214                         list_del(&dev->unreg_list);
6215                         continue;
6216                 }
6217                 dev->dismantle = true;
6218                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6219         }
6220
6221         /* If device is running, close it first. */
6222         list_for_each_entry(dev, head, unreg_list)
6223                 list_add_tail(&dev->close_list, &close_head);
6224         dev_close_many(&close_head, true);
6225
6226         list_for_each_entry(dev, head, unreg_list) {
6227                 /* And unlink it from device chain. */
6228                 unlist_netdevice(dev);
6229
6230                 dev->reg_state = NETREG_UNREGISTERING;
6231                 on_each_cpu(flush_backlog, dev, 1);
6232         }
6233
6234         synchronize_net();
6235
6236         list_for_each_entry(dev, head, unreg_list) {
6237                 struct sk_buff *skb = NULL;
6238
6239                 /* Shutdown queueing discipline. */
6240                 dev_shutdown(dev);
6241
6242
6243                 /* Notify protocols that we are about to destroy
6244                    this device. They should clean up all their state.
6245                 */
6246                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6247
6248                 if (!dev->rtnl_link_ops ||
6249                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6250                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6251                                                      GFP_KERNEL);
6252
6253                 /*
6254                  *      Flush the unicast and multicast chains
6255                  */
6256                 dev_uc_flush(dev);
6257                 dev_mc_flush(dev);
6258
6259                 if (dev->netdev_ops->ndo_uninit)
6260                         dev->netdev_ops->ndo_uninit(dev);
6261
6262                 if (skb)
6263                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6264
6265                 /* The notifier chain MUST detach all upper devices from us. */
6266                 WARN_ON(netdev_has_any_upper_dev(dev));
6267
6268                 /* Remove entries from kobject tree */
6269                 netdev_unregister_kobject(dev);
6270 #ifdef CONFIG_XPS
6271                 /* Remove XPS queueing entries */
6272                 netif_reset_xps_queues_gt(dev, 0);
6273 #endif
6274         }
6275
6276         synchronize_net();
6277
6278         list_for_each_entry(dev, head, unreg_list)
6279                 dev_put(dev);
6280 }
6281
6282 static void rollback_registered(struct net_device *dev)
6283 {
6284         LIST_HEAD(single);
6285
6286         list_add(&dev->unreg_list, &single);
6287         rollback_registered_many(&single);
6288         list_del(&single);
6289 }
6290
6291 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6292         struct net_device *upper, netdev_features_t features)
6293 {
6294         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6295         netdev_features_t feature;
6296         int feature_bit;
6297
6298         for_each_netdev_feature(&upper_disables, feature_bit) {
6299                 feature = __NETIF_F_BIT(feature_bit);
6300                 if (!(upper->wanted_features & feature)
6301                     && (features & feature)) {
6302                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6303                                    &feature, upper->name);
6304                         features &= ~feature;
6305                 }
6306         }
6307
6308         return features;
6309 }
6310
6311 static void netdev_sync_lower_features(struct net_device *upper,
6312         struct net_device *lower, netdev_features_t features)
6313 {
6314         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6315         netdev_features_t feature;
6316         int feature_bit;
6317
6318         for_each_netdev_feature(&upper_disables, feature_bit) {
6319                 feature = __NETIF_F_BIT(feature_bit);
6320                 if (!(features & feature) && (lower->features & feature)) {
6321                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6322                                    &feature, lower->name);
6323                         lower->wanted_features &= ~feature;
6324                         netdev_update_features(lower);
6325
6326                         if (unlikely(lower->features & feature))
6327                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6328                                             &feature, lower->name);
6329                 }
6330         }
6331 }
6332
6333 static netdev_features_t netdev_fix_features(struct net_device *dev,
6334         netdev_features_t features)
6335 {
6336         /* Fix illegal checksum combinations */
6337         if ((features & NETIF_F_HW_CSUM) &&
6338             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6339                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6340                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6341         }
6342
6343         /* TSO requires that SG is present as well. */
6344         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6345                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6346                 features &= ~NETIF_F_ALL_TSO;
6347         }
6348
6349         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6350                                         !(features & NETIF_F_IP_CSUM)) {
6351                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6352                 features &= ~NETIF_F_TSO;
6353                 features &= ~NETIF_F_TSO_ECN;
6354         }
6355
6356         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6357                                          !(features & NETIF_F_IPV6_CSUM)) {
6358                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6359                 features &= ~NETIF_F_TSO6;
6360         }
6361
6362         /* TSO ECN requires that TSO is present as well. */
6363         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6364                 features &= ~NETIF_F_TSO_ECN;
6365
6366         /* Software GSO depends on SG. */
6367         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6368                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6369                 features &= ~NETIF_F_GSO;
6370         }
6371
6372         /* UFO needs SG and checksumming */
6373         if (features & NETIF_F_UFO) {
6374                 /* maybe split UFO into V4 and V6? */
6375                 if (!((features & NETIF_F_GEN_CSUM) ||
6376                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6377                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6378                         netdev_dbg(dev,
6379                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6380                         features &= ~NETIF_F_UFO;
6381                 }
6382
6383                 if (!(features & NETIF_F_SG)) {
6384                         netdev_dbg(dev,
6385                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6386                         features &= ~NETIF_F_UFO;
6387                 }
6388         }
6389
6390 #ifdef CONFIG_NET_RX_BUSY_POLL
6391         if (dev->netdev_ops->ndo_busy_poll)
6392                 features |= NETIF_F_BUSY_POLL;
6393         else
6394 #endif
6395                 features &= ~NETIF_F_BUSY_POLL;
6396
6397         return features;
6398 }
6399
6400 int __netdev_update_features(struct net_device *dev)
6401 {
6402         struct net_device *upper, *lower;
6403         netdev_features_t features;
6404         struct list_head *iter;
6405         int err = 0;
6406
6407         ASSERT_RTNL();
6408
6409         features = netdev_get_wanted_features(dev);
6410
6411         if (dev->netdev_ops->ndo_fix_features)
6412                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6413
6414         /* driver might be less strict about feature dependencies */
6415         features = netdev_fix_features(dev, features);
6416
6417         /* some features can't be enabled if they're off on an upper device */
6418         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6419                 features = netdev_sync_upper_features(dev, upper, features);
6420
6421         if (dev->features == features)
6422                 return 0;
6423
6424         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6425                 &dev->features, &features);
6426
6427         if (dev->netdev_ops->ndo_set_features)
6428                 err = dev->netdev_ops->ndo_set_features(dev, features);
6429
6430         if (unlikely(err < 0)) {
6431                 netdev_err(dev,
6432                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6433                         err, &features, &dev->features);
6434                 return -1;
6435         }
6436
6437         /* some features must be disabled on lower devices when disabled
6438          * on an upper device (think: bonding master or bridge)
6439          */
6440         netdev_for_each_lower_dev(dev, lower, iter)
6441                 netdev_sync_lower_features(dev, lower, features);
6442
6443         if (!err)
6444                 dev->features = features;
6445
6446         return 1;
6447 }
6448
6449 /**
6450  *      netdev_update_features - recalculate device features
6451  *      @dev: the device to check
6452  *
6453  *      Recalculate dev->features set and send notifications if it
6454  *      has changed. Should be called after driver or hardware dependent
6455  *      conditions might have changed that influence the features.
6456  */
6457 void netdev_update_features(struct net_device *dev)
6458 {
6459         if (__netdev_update_features(dev))
6460                 netdev_features_change(dev);
6461 }
6462 EXPORT_SYMBOL(netdev_update_features);
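
/* Illustrative sketch of the intended calling pattern: a driver that has
 * just toggled a hardware capability re-runs the feature computation under
 * RTNL.  "example_enable_rxcsum" and the choice of NETIF_F_RXCSUM are
 * assumptions for the example only.
 */
static void example_enable_rxcsum(struct net_device *dev)
{
        ASSERT_RTNL();

        dev->hw_features |= NETIF_F_RXCSUM;
        netdev_update_features(dev);    /* recomputes dev->features */
}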
6463
6464 /**
6465  *      netdev_change_features - recalculate device features
6466  *      @dev: the device to check
6467  *
6468  *      Recalculate dev->features set and send notifications even
6469  *      if they have not changed. Should be called instead of
6470  *      netdev_update_features() if also dev->vlan_features might
6471  *      have changed to allow the changes to be propagated to stacked
6472  *      VLAN devices.
6473  */
6474 void netdev_change_features(struct net_device *dev)
6475 {
6476         __netdev_update_features(dev);
6477         netdev_features_change(dev);
6478 }
6479 EXPORT_SYMBOL(netdev_change_features);
6480
6481 /**
6482  *      netif_stacked_transfer_operstate -      transfer operstate
6483  *      @rootdev: the root or lower level device to transfer state from
6484  *      @dev: the device to transfer operstate to
6485  *
6486  *      Transfer operational state from root to device. This is normally
6487  *      called when a stacking relationship exists between the root
6488  *      device and the device (a leaf device).
6489  */
6490 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6491                                         struct net_device *dev)
6492 {
6493         if (rootdev->operstate == IF_OPER_DORMANT)
6494                 netif_dormant_on(dev);
6495         else
6496                 netif_dormant_off(dev);
6497
6498         if (netif_carrier_ok(rootdev)) {
6499                 if (!netif_carrier_ok(dev))
6500                         netif_carrier_on(dev);
6501         } else {
6502                 if (netif_carrier_ok(dev))
6503                         netif_carrier_off(dev);
6504         }
6505 }
6506 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
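
/* A sketch of how a stacked (e.g. VLAN-like) driver might use the helper,
 * typically from its handling of a NETDEV_CHANGE event on the lower device:
 * mirror carrier and dormant state onto the virtual device.  The function
 * name and device pairing are illustrative assumptions.
 */
static void example_sync_operstate(struct net_device *lower,
                                   struct net_device *stacked)
{
        netif_stacked_transfer_operstate(lower, stacked);
}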
6507
6508 #ifdef CONFIG_SYSFS
6509 static int netif_alloc_rx_queues(struct net_device *dev)
6510 {
6511         unsigned int i, count = dev->num_rx_queues;
6512         struct netdev_rx_queue *rx;
6513         size_t sz = count * sizeof(*rx);
6514
6515         BUG_ON(count < 1);
6516
6517         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6518         if (!rx) {
6519                 rx = vzalloc(sz);
6520                 if (!rx)
6521                         return -ENOMEM;
6522         }
6523         dev->_rx = rx;
6524
6525         for (i = 0; i < count; i++)
6526                 rx[i].dev = dev;
6527         return 0;
6528 }
6529 #endif
6530
6531 static void netdev_init_one_queue(struct net_device *dev,
6532                                   struct netdev_queue *queue, void *_unused)
6533 {
6534         /* Initialize queue lock */
6535         spin_lock_init(&queue->_xmit_lock);
6536         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6537         queue->xmit_lock_owner = -1;
6538         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6539         queue->dev = dev;
6540 #ifdef CONFIG_BQL
6541         dql_init(&queue->dql, HZ);
6542 #endif
6543 }
6544
6545 static void netif_free_tx_queues(struct net_device *dev)
6546 {
6547         kvfree(dev->_tx);
6548 }
6549
6550 static int netif_alloc_netdev_queues(struct net_device *dev)
6551 {
6552         unsigned int count = dev->num_tx_queues;
6553         struct netdev_queue *tx;
6554         size_t sz = count * sizeof(*tx);
6555
6556         if (count < 1 || count > 0xffff)
6557                 return -EINVAL;
6558
6559         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6560         if (!tx) {
6561                 tx = vzalloc(sz);
6562                 if (!tx)
6563                         return -ENOMEM;
6564         }
6565         dev->_tx = tx;
6566
6567         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6568         spin_lock_init(&dev->tx_global_lock);
6569
6570         return 0;
6571 }
6572
6573 void netif_tx_stop_all_queues(struct net_device *dev)
6574 {
6575         unsigned int i;
6576
6577         for (i = 0; i < dev->num_tx_queues; i++) {
6578                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6579                 netif_tx_stop_queue(txq);
6580         }
6581 }
6582 EXPORT_SYMBOL(netif_tx_stop_all_queues);
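
/* Sketch of typical use in a driver's ndo_stop()/close path: stop all TX
 * queues before quiescing the hardware so the stack stops handing us
 * packets.  "example_ndo_stop" is an illustrative name; real drivers would
 * also disable interrupts, NAPI, etc. here.
 */
static int example_ndo_stop(struct net_device *dev)
{
        netif_tx_stop_all_queues(dev);
        netif_carrier_off(dev);

        /* hardware-specific quiescing would follow here */

        return 0;
}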
6583
6584 /**
6585  *      register_netdevice      - register a network device
6586  *      @dev: device to register
6587  *
6588  *      Take a completed network device structure and add it to the kernel
6589  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6590  *      chain. 0 is returned on success. A negative errno code is returned
6591  *      on a failure to set up the device, or if the name is a duplicate.
6592  *
6593  *      Callers must hold the rtnl semaphore. You may want
6594  *      register_netdev() instead of this.
6595  *
6596  *      BUGS:
6597  *      The locking appears insufficient to guarantee two parallel registers
6598  *      will not get the same name.
6599  */
6600
6601 int register_netdevice(struct net_device *dev)
6602 {
6603         int ret;
6604         struct net *net = dev_net(dev);
6605
6606         BUG_ON(dev_boot_phase);
6607         ASSERT_RTNL();
6608
6609         might_sleep();
6610
6611         /* When net_devices are persistent, this will be fatal. */
6612         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6613         BUG_ON(!net);
6614
6615         spin_lock_init(&dev->addr_list_lock);
6616         netdev_set_addr_lockdep_class(dev);
6617
6618         ret = dev_get_valid_name(net, dev, dev->name);
6619         if (ret < 0)
6620                 goto out;
6621
6622         /* Init, if this function is available */
6623         if (dev->netdev_ops->ndo_init) {
6624                 ret = dev->netdev_ops->ndo_init(dev);
6625                 if (ret) {
6626                         if (ret > 0)
6627                                 ret = -EIO;
6628                         goto out;
6629                 }
6630         }
6631
6632         if (((dev->hw_features | dev->features) &
6633              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6634             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6635              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6636                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6637                 ret = -EINVAL;
6638                 goto err_uninit;
6639         }
6640
6641         ret = -EBUSY;
6642         if (!dev->ifindex)
6643                 dev->ifindex = dev_new_index(net);
6644         else if (__dev_get_by_index(net, dev->ifindex))
6645                 goto err_uninit;
6646
6647         /* Transfer changeable features to wanted_features and enable
6648          * software offloads (GSO and GRO).
6649          */
6650         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6651         dev->features |= NETIF_F_SOFT_FEATURES;
6652         dev->wanted_features = dev->features & dev->hw_features;
6653
6654         if (!(dev->flags & IFF_LOOPBACK)) {
6655                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6656         }
6657
6658         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6659          */
6660         dev->vlan_features |= NETIF_F_HIGHDMA;
6661
6662         /* Make NETIF_F_SG inheritable to tunnel devices.
6663          */
6664         dev->hw_enc_features |= NETIF_F_SG;
6665
6666         /* Make NETIF_F_SG inheritable to MPLS.
6667          */
6668         dev->mpls_features |= NETIF_F_SG;
6669
6670         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6671         ret = notifier_to_errno(ret);
6672         if (ret)
6673                 goto err_uninit;
6674
6675         ret = netdev_register_kobject(dev);
6676         if (ret)
6677                 goto err_uninit;
6678         dev->reg_state = NETREG_REGISTERED;
6679
6680         __netdev_update_features(dev);
6681
6682         /*
6683          *      Default initial state at registration is that the
6684          *      device is present.
6685          */
6686
6687         set_bit(__LINK_STATE_PRESENT, &dev->state);
6688
6689         linkwatch_init_dev(dev);
6690
6691         dev_init_scheduler(dev);
6692         dev_hold(dev);
6693         list_netdevice(dev);
6694         add_device_randomness(dev->dev_addr, dev->addr_len);
6695
6696         /* If the device has a permanent device address, the driver should
6697          * have set dev_addr and left addr_assign_type at
6698          * NET_ADDR_PERM (the default value).
6699          */
6700         if (dev->addr_assign_type == NET_ADDR_PERM)
6701                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6702
6703         /* Notify protocols, that a new device appeared. */
6704         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6705         ret = notifier_to_errno(ret);
6706         if (ret) {
6707                 rollback_registered(dev);
6708                 dev->reg_state = NETREG_UNREGISTERED;
6709         }
6710         /*
6711          *      Prevent userspace races by waiting until the network
6712          *      device is fully set up before sending notifications.
6713          */
6714         if (!dev->rtnl_link_ops ||
6715             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6716                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6717
6718 out:
6719         return ret;
6720
6721 err_uninit:
6722         if (dev->netdev_ops->ndo_uninit)
6723                 dev->netdev_ops->ndo_uninit(dev);
6724         goto out;
6725 }
6726 EXPORT_SYMBOL(register_netdevice);
6727
6728 /**
6729  *      init_dummy_netdev       - init a dummy network device for NAPI
6730  *      @dev: device to init
6731  *
6732  *      This takes a network device structure and initializes the minimum
6733  *      number of fields so it can be used to schedule NAPI polls without
6734  *      registering a full-blown interface. This is to be used by drivers
6735  *      that need to tie several hardware interfaces to a single NAPI
6736  *      poll scheduler due to HW limitations.
6737  */
6738 int init_dummy_netdev(struct net_device *dev)
6739 {
6740         /* Clear everything. Note we don't initialize spinlocks
6741          * as they aren't supposed to be taken by any of the
6742          * NAPI code and this dummy netdev is supposed to be
6743          * only ever used for NAPI polls
6744          */
6745         memset(dev, 0, sizeof(struct net_device));
6746
6747         /* make sure we BUG if trying to hit standard
6748          * register/unregister code path
6749          */
6750         dev->reg_state = NETREG_DUMMY;
6751
6752         /* NAPI wants this */
6753         INIT_LIST_HEAD(&dev->napi_list);
6754
6755         /* a dummy interface is started by default */
6756         set_bit(__LINK_STATE_PRESENT, &dev->state);
6757         set_bit(__LINK_STATE_START, &dev->state);
6758
6759         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6760          * because users of this 'device' don't need to change
6761          * its refcount.
6762          */
6763
6764         return 0;
6765 }
6766 EXPORT_SYMBOL_GPL(init_dummy_netdev);
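
/* A sketch of the intended use: a driver with several hardware interfaces
 * behind one interrupt embeds a dummy netdev purely to host its NAPI
 * context.  "struct example_hw", the poll callback parameter and the use of
 * NAPI_POLL_WEIGHT are assumptions for illustration.
 */
struct example_hw {
        struct net_device napi_dev;     /* dummy device, never registered */
        struct napi_struct napi;
};

static void example_hw_napi_init(struct example_hw *hw,
                                 int (*poll)(struct napi_struct *, int))
{
        init_dummy_netdev(&hw->napi_dev);
        netif_napi_add(&hw->napi_dev, &hw->napi, poll, NAPI_POLL_WEIGHT);
}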
6767
6768
6769 /**
6770  *      register_netdev - register a network device
6771  *      @dev: device to register
6772  *
6773  *      Take a completed network device structure and add it to the kernel
6774  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6775  *      chain. 0 is returned on success. A negative errno code is returned
6776  *      on a failure to set up the device, or if the name is a duplicate.
6777  *
6778  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6779  *      and expands the device name if you passed a format string to
6780  *      alloc_netdev.
6781  */
6782 int register_netdev(struct net_device *dev)
6783 {
6784         int err;
6785
6786         rtnl_lock();
6787         err = register_netdevice(dev);
6788         rtnl_unlock();
6789         return err;
6790 }
6791 EXPORT_SYMBOL(register_netdev);
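
/* A minimal probe-path sketch using the wrapper above: allocate an
 * Ethernet-style device, register it, and free it again if registration
 * fails.  The "example%d" name template and the absence of private data
 * are illustrative choices, not requirements.
 */
static struct net_device *example_probe_one(void)
{
        struct net_device *dev;

        dev = alloc_netdev(0, "example%d", NET_NAME_UNKNOWN, ether_setup);
        if (!dev)
                return NULL;

        if (register_netdev(dev)) {
                free_netdev(dev);
                return NULL;
        }

        return dev;
}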
6792
6793 int netdev_refcnt_read(const struct net_device *dev)
6794 {
6795         int i, refcnt = 0;
6796
6797         for_each_possible_cpu(i)
6798                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6799         return refcnt;
6800 }
6801 EXPORT_SYMBOL(netdev_refcnt_read);
6802
6803 /**
6804  * netdev_wait_allrefs - wait until all references are gone.
6805  * @dev: target net_device
6806  *
6807  * This is called when unregistering network devices.
6808  *
6809  * Any protocol or device that holds a reference should register
6810  * for netdevice notification, and cleanup and put back the
6811  * reference if they receive an UNREGISTER event.
6812  * We can get stuck here if buggy protocols don't correctly
6813  * call dev_put.
6814  */
6815 static void netdev_wait_allrefs(struct net_device *dev)
6816 {
6817         unsigned long rebroadcast_time, warning_time;
6818         int refcnt;
6819
6820         linkwatch_forget_dev(dev);
6821
6822         rebroadcast_time = warning_time = jiffies;
6823         refcnt = netdev_refcnt_read(dev);
6824
6825         while (refcnt != 0) {
6826                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6827                         rtnl_lock();
6828
6829                         /* Rebroadcast unregister notification */
6830                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6831
6832                         __rtnl_unlock();
6833                         rcu_barrier();
6834                         rtnl_lock();
6835
6836                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6837                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6838                                      &dev->state)) {
6839                                 /* We must not have linkwatch events
6840                                  * pending on unregister. If this
6841                                  * happens, we simply run the queue
6842                                  * unscheduled, resulting in a noop
6843                                  * for this device.
6844                                  */
6845                                 linkwatch_run_queue();
6846                         }
6847
6848                         __rtnl_unlock();
6849
6850                         rebroadcast_time = jiffies;
6851                 }
6852
6853                 msleep(250);
6854
6855                 refcnt = netdev_refcnt_read(dev);
6856
6857                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6858                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6859                                  dev->name, refcnt);
6860                         warning_time = jiffies;
6861                 }
6862         }
6863 }
6864
6865 /* The sequence is:
6866  *
6867  *      rtnl_lock();
6868  *      ...
6869  *      register_netdevice(x1);
6870  *      register_netdevice(x2);
6871  *      ...
6872  *      unregister_netdevice(y1);
6873  *      unregister_netdevice(y2);
6874  *      ...
6875  *      rtnl_unlock();
6876  *      free_netdev(y1);
6877  *      free_netdev(y2);
6878  *
6879  * We are invoked by rtnl_unlock().
6880  * This allows us to deal with problems:
6881  * 1) We can delete sysfs objects which invoke hotplug
6882  *    without deadlocking with linkwatch via keventd.
6883  * 2) Since we run with the RTNL semaphore not held, we can sleep
6884  *    safely in order to wait for the netdev refcnt to drop to zero.
6885  *
6886  * We must not return until all unregister events added during
6887  * the interval the lock was held have been completed.
6888  */
6889 void netdev_run_todo(void)
6890 {
6891         struct list_head list;
6892
6893         /* Snapshot list, allow later requests */
6894         list_replace_init(&net_todo_list, &list);
6895
6896         __rtnl_unlock();
6897
6898
6899         /* Wait for rcu callbacks to finish before next phase */
6900         if (!list_empty(&list))
6901                 rcu_barrier();
6902
6903         while (!list_empty(&list)) {
6904                 struct net_device *dev
6905                         = list_first_entry(&list, struct net_device, todo_list);
6906                 list_del(&dev->todo_list);
6907
6908                 rtnl_lock();
6909                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6910                 __rtnl_unlock();
6911
6912                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6913                         pr_err("network todo '%s' but state %d\n",
6914                                dev->name, dev->reg_state);
6915                         dump_stack();
6916                         continue;
6917                 }
6918
6919                 dev->reg_state = NETREG_UNREGISTERED;
6920
6921                 netdev_wait_allrefs(dev);
6922
6923                 /* paranoia */
6924                 BUG_ON(netdev_refcnt_read(dev));
6925                 BUG_ON(!list_empty(&dev->ptype_all));
6926                 BUG_ON(!list_empty(&dev->ptype_specific));
6927                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6928                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6929                 WARN_ON(dev->dn_ptr);
6930
6931                 if (dev->destructor)
6932                         dev->destructor(dev);
6933
6934                 /* Report a network device has been unregistered */
6935                 rtnl_lock();
6936                 dev_net(dev)->dev_unreg_count--;
6937                 __rtnl_unlock();
6938                 wake_up(&netdev_unregistering_wq);
6939
6940                 /* Free network device */
6941                 kobject_put(&dev->dev.kobj);
6942         }
6943 }
6944
6945 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6946  * fields in the same order, with only the type differing.
6947  */
6948 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6949                              const struct net_device_stats *netdev_stats)
6950 {
6951 #if BITS_PER_LONG == 64
6952         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6953         memcpy(stats64, netdev_stats, sizeof(*stats64));
6954 #else
6955         size_t i, n = sizeof(*stats64) / sizeof(u64);
6956         const unsigned long *src = (const unsigned long *)netdev_stats;
6957         u64 *dst = (u64 *)stats64;
6958
6959         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6960                      sizeof(*stats64) / sizeof(u64));
6961         for (i = 0; i < n; i++)
6962                 dst[i] = src[i];
6963 #endif
6964 }
6965 EXPORT_SYMBOL(netdev_stats_to_stats64);
6966
6967 /**
6968  *      dev_get_stats   - get network device statistics
6969  *      @dev: device to get statistics from
6970  *      @storage: place to store stats
6971  *
6972  *      Get network statistics from device. Return @storage.
6973  *      The device driver may provide its own method by setting
6974  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6975  *      otherwise the internal statistics structure is used.
6976  */
6977 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6978                                         struct rtnl_link_stats64 *storage)
6979 {
6980         const struct net_device_ops *ops = dev->netdev_ops;
6981
6982         if (ops->ndo_get_stats64) {
6983                 memset(storage, 0, sizeof(*storage));
6984                 ops->ndo_get_stats64(dev, storage);
6985         } else if (ops->ndo_get_stats) {
6986                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6987         } else {
6988                 netdev_stats_to_stats64(storage, &dev->stats);
6989         }
6990         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6991         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6992         return storage;
6993 }
6994 EXPORT_SYMBOL(dev_get_stats);
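
/* Usage sketch: a reader that only wants a consistent 64-bit snapshot can
 * pass a stack rtnl_link_stats64 and let dev_get_stats() pick whichever
 * driver method is available.  The log message and field choice are
 * illustrative.
 */
static void example_log_rx_packets(struct net_device *dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(dev, &stats);
        netdev_info(dev, "rx_packets=%llu rx_dropped=%llu\n",
                    (unsigned long long)stats.rx_packets,
                    (unsigned long long)stats.rx_dropped);
}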
6995
6996 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6997 {
6998         struct netdev_queue *queue = dev_ingress_queue(dev);
6999
7000 #ifdef CONFIG_NET_CLS_ACT
7001         if (queue)
7002                 return queue;
7003         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7004         if (!queue)
7005                 return NULL;
7006         netdev_init_one_queue(dev, queue, NULL);
7007         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7008         queue->qdisc_sleeping = &noop_qdisc;
7009         rcu_assign_pointer(dev->ingress_queue, queue);
7010 #endif
7011         return queue;
7012 }
7013
7014 static const struct ethtool_ops default_ethtool_ops;
7015
7016 void netdev_set_default_ethtool_ops(struct net_device *dev,
7017                                     const struct ethtool_ops *ops)
7018 {
7019         if (dev->ethtool_ops == &default_ethtool_ops)
7020                 dev->ethtool_ops = ops;
7021 }
7022 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7023
7024 void netdev_freemem(struct net_device *dev)
7025 {
7026         char *addr = (char *)dev - dev->padded;
7027
7028         kvfree(addr);
7029 }
7030
7031 /**
7032  *      alloc_netdev_mqs - allocate network device
7033  *      @sizeof_priv:           size of private data to allocate space for
7034  *      @name:                  device name format string
7035  *      @name_assign_type:      origin of device name
7036  *      @setup:                 callback to initialize device
7037  *      @txqs:                  the number of TX subqueues to allocate
7038  *      @rxqs:                  the number of RX subqueues to allocate
7039  *
7040  *      Allocates a struct net_device with private data area for driver use
7041  *      and performs basic initialization.  Also allocates subqueue structs
7042  *      for each queue on the device.
7043  */
7044 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7045                 unsigned char name_assign_type,
7046                 void (*setup)(struct net_device *),
7047                 unsigned int txqs, unsigned int rxqs)
7048 {
7049         struct net_device *dev;
7050         size_t alloc_size;
7051         struct net_device *p;
7052
7053         BUG_ON(strlen(name) >= sizeof(dev->name));
7054
7055         if (txqs < 1) {
7056                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7057                 return NULL;
7058         }
7059
7060 #ifdef CONFIG_SYSFS
7061         if (rxqs < 1) {
7062                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7063                 return NULL;
7064         }
7065 #endif
7066
7067         alloc_size = sizeof(struct net_device);
7068         if (sizeof_priv) {
7069                 /* ensure 32-byte alignment of private area */
7070                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7071                 alloc_size += sizeof_priv;
7072         }
7073         /* ensure 32-byte alignment of whole construct */
7074         alloc_size += NETDEV_ALIGN - 1;
7075
7076         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7077         if (!p)
7078                 p = vzalloc(alloc_size);
7079         if (!p)
7080                 return NULL;
7081
7082         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7083         dev->padded = (char *)dev - (char *)p;
7084
7085         dev->pcpu_refcnt = alloc_percpu(int);
7086         if (!dev->pcpu_refcnt)
7087                 goto free_dev;
7088
7089         if (dev_addr_init(dev))
7090                 goto free_pcpu;
7091
7092         dev_mc_init(dev);
7093         dev_uc_init(dev);
7094
7095         dev_net_set(dev, &init_net);
7096
7097         dev->gso_max_size = GSO_MAX_SIZE;
7098         dev->gso_max_segs = GSO_MAX_SEGS;
7099         dev->gso_min_segs = 0;
7100
7101         INIT_LIST_HEAD(&dev->napi_list);
7102         INIT_LIST_HEAD(&dev->unreg_list);
7103         INIT_LIST_HEAD(&dev->close_list);
7104         INIT_LIST_HEAD(&dev->link_watch_list);
7105         INIT_LIST_HEAD(&dev->adj_list.upper);
7106         INIT_LIST_HEAD(&dev->adj_list.lower);
7107         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7108         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7109         INIT_LIST_HEAD(&dev->ptype_all);
7110         INIT_LIST_HEAD(&dev->ptype_specific);
7111         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7112         setup(dev);
7113
7114         if (!dev->tx_queue_len)
7115                 dev->priv_flags |= IFF_NO_QUEUE;
7116
7117         dev->num_tx_queues = txqs;
7118         dev->real_num_tx_queues = txqs;
7119         if (netif_alloc_netdev_queues(dev))
7120                 goto free_all;
7121
7122 #ifdef CONFIG_SYSFS
7123         dev->num_rx_queues = rxqs;
7124         dev->real_num_rx_queues = rxqs;
7125         if (netif_alloc_rx_queues(dev))
7126                 goto free_all;
7127 #endif
7128
7129         strcpy(dev->name, name);
7130         dev->name_assign_type = name_assign_type;
7131         dev->group = INIT_NETDEV_GROUP;
7132         if (!dev->ethtool_ops)
7133                 dev->ethtool_ops = &default_ethtool_ops;
7134
7135         nf_hook_ingress_init(dev);
7136
7137         return dev;
7138
7139 free_all:
7140         free_netdev(dev);
7141         return NULL;
7142
7143 free_pcpu:
7144         free_percpu(dev->pcpu_refcnt);
7145 free_dev:
7146         netdev_freemem(dev);
7147         return NULL;
7148 }
7149 EXPORT_SYMBOL(alloc_netdev_mqs);
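
/* A sketch of a multiqueue allocation: four TX and four RX queues, no
 * private area, Ethernet-style defaults.  The queue counts, the name
 * template and the use of ether_setup() are arbitrary example choices.
 */
static struct net_device *example_alloc_mq(void)
{
        return alloc_netdev_mqs(0, "example%d", NET_NAME_UNKNOWN,
                                ether_setup, 4, 4);
}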
7150
7151 /**
7152  *      free_netdev - free network device
7153  *      @dev: device
7154  *
7155  *      This function does the last stage of destroying an allocated device
7156  *      interface. The reference to the device object is released.
7157  *      If this is the last reference then it will be freed.
7158  */
7159 void free_netdev(struct net_device *dev)
7160 {
7161         struct napi_struct *p, *n;
7162
7163         netif_free_tx_queues(dev);
7164 #ifdef CONFIG_SYSFS
7165         kvfree(dev->_rx);
7166 #endif
7167
7168         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7169
7170         /* Flush device addresses */
7171         dev_addr_flush(dev);
7172
7173         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7174                 netif_napi_del(p);
7175
7176         free_percpu(dev->pcpu_refcnt);
7177         dev->pcpu_refcnt = NULL;
7178
7179         /*  Compatibility with error handling in drivers */
7180         if (dev->reg_state == NETREG_UNINITIALIZED) {
7181                 netdev_freemem(dev);
7182                 return;
7183         }
7184
7185         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7186         dev->reg_state = NETREG_RELEASED;
7187
7188         /* will free via device release */
7189         put_device(&dev->dev);
7190 }
7191 EXPORT_SYMBOL(free_netdev);
7192
7193 /**
7194  *      synchronize_net -  Synchronize with packet receive processing
7195  *
7196  *      Wait for packets currently being received to be done.
7197  *      Does not block later packets from starting.
7198  */
7199 void synchronize_net(void)
7200 {
7201         might_sleep();
7202         if (rtnl_is_locked())
7203                 synchronize_rcu_expedited();
7204         else
7205                 synchronize_rcu();
7206 }
7207 EXPORT_SYMBOL(synchronize_net);
7208
7209 /**
7210  *      unregister_netdevice_queue - remove device from the kernel
7211  *      @dev: device
7212  *      @head: list
7213  *
7214  *      This function shuts down a device interface and removes it
7215  *      from the kernel tables.
7216  *      If head not NULL, device is queued to be unregistered later.
7217  *
7218  *      Callers must hold the rtnl semaphore.  You may want
7219  *      unregister_netdev() instead of this.
7220  */
7221
7222 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7223 {
7224         ASSERT_RTNL();
7225
7226         if (head) {
7227                 list_move_tail(&dev->unreg_list, head);
7228         } else {
7229                 rollback_registered(dev);
7230                 /* Finish processing unregister after unlock */
7231                 net_set_todo(dev);
7232         }
7233 }
7234 EXPORT_SYMBOL(unregister_netdevice_queue);
7235
7236 /**
7237  *      unregister_netdevice_many - unregister many devices
7238  *      @head: list of devices
7239  *
7240  *  Note: As most callers use a stack-allocated list_head,
7241  *  we force a list_del() to make sure the stack won't be corrupted later.
7242  */
7243 void unregister_netdevice_many(struct list_head *head)
7244 {
7245         struct net_device *dev;
7246
7247         if (!list_empty(head)) {
7248                 rollback_registered_many(head);
7249                 list_for_each_entry(dev, head, unreg_list)
7250                         net_set_todo(dev);
7251                 list_del(head);
7252         }
7253 }
7254 EXPORT_SYMBOL(unregister_netdevice_many);
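
/* Sketch of batched teardown: queue several devices on a local list with
 * unregister_netdevice_queue() and tear them all down in one RTNL-held
 * pass.  Selecting devices by dev->group is just an example criterion.
 */
static void example_unregister_group(struct net *net, int group)
{
        struct net_device *dev;
        LIST_HEAD(kill_list);

        rtnl_lock();
        for_each_netdev(net, dev)
                if (dev->group == group)
                        unregister_netdevice_queue(dev, &kill_list);
        unregister_netdevice_many(&kill_list);
        rtnl_unlock();
}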
7255
7256 /**
7257  *      unregister_netdev - remove device from the kernel
7258  *      @dev: device
7259  *
7260  *      This function shuts down a device interface and removes it
7261  *      from the kernel tables.
7262  *
7263  *      This is just a wrapper for unregister_netdevice that takes
7264  *      the rtnl semaphore.  In general you want to use this and not
7265  *      unregister_netdevice.
7266  */
7267 void unregister_netdev(struct net_device *dev)
7268 {
7269         rtnl_lock();
7270         unregister_netdevice(dev);
7271         rtnl_unlock();
7272 }
7273 EXPORT_SYMBOL(unregister_netdev);
7274
7275 /**
7276  *      dev_change_net_namespace - move device to a different network namespace
7277  *      @dev: device
7278  *      @net: network namespace
7279  *      @pat: If not NULL name pattern to try if the current device name
7280  *            is already taken in the destination network namespace.
7281  *
7282  *      This function shuts down a device interface and moves it
7283  *      to a new network namespace. On success 0 is returned, on
7284  *      a failure a negative errno code is returned.
7285  *
7286  *      Callers must hold the rtnl semaphore.
7287  */
7288
7289 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7290 {
7291         int err;
7292
7293         ASSERT_RTNL();
7294
7295         /* Don't allow namespace local devices to be moved. */
7296         err = -EINVAL;
7297         if (dev->features & NETIF_F_NETNS_LOCAL)
7298                 goto out;
7299
7300         /* Ensure the device has been registered */
7301         if (dev->reg_state != NETREG_REGISTERED)
7302                 goto out;
7303
7304         /* Get out if there is nothing to do */
7305         err = 0;
7306         if (net_eq(dev_net(dev), net))
7307                 goto out;
7308
7309         /* Pick the destination device name, and ensure
7310          * we can use it in the destination network namespace.
7311          */
7312         err = -EEXIST;
7313         if (__dev_get_by_name(net, dev->name)) {
7314                 /* We get here if we can't use the current device name */
7315                 if (!pat)
7316                         goto out;
7317                 if (dev_get_valid_name(net, dev, pat) < 0)
7318                         goto out;
7319         }
7320
7321         /*
7322          * And now a mini version of register_netdevice and unregister_netdevice.
7323          */
7324
7325         /* If device is running close it first. */
7326         dev_close(dev);
7327
7328         /* And unlink it from device chain */
7329         err = -ENODEV;
7330         unlist_netdevice(dev);
7331
7332         synchronize_net();
7333
7334         /* Shutdown queueing discipline. */
7335         dev_shutdown(dev);
7336
7337         /* Notify protocols that we are about to destroy
7338            this device. They should clean up all their state.
7339
7340            Note that dev->reg_state stays at NETREG_REGISTERED.
7341            This is wanted because this way 8021q and macvlan know
7342            the device is just moving and can keep their slaves up.
7343         */
7344         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7345         rcu_barrier();
7346         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7347         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7348
7349         /*
7350          *      Flush the unicast and multicast chains
7351          */
7352         dev_uc_flush(dev);
7353         dev_mc_flush(dev);
7354
7355         /* Send a netdev-removed uevent to the old namespace */
7356         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7357         netdev_adjacent_del_links(dev);
7358
7359         /* Actually switch the network namespace */
7360         dev_net_set(dev, net);
7361
7362         /* If there is an ifindex conflict assign a new one */
7363         if (__dev_get_by_index(net, dev->ifindex))
7364                 dev->ifindex = dev_new_index(net);
7365
7366         /* Send a netdev-add uevent to the new namespace */
7367         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7368         netdev_adjacent_add_links(dev);
7369
7370         /* Fixup kobjects */
7371         err = device_rename(&dev->dev, dev->name);
7372         WARN_ON(err);
7373
7374         /* Add the device back in the hashes */
7375         list_netdevice(dev);
7376
7377         /* Notify protocols, that a new device appeared. */
7378         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7379
7380         /*
7381          *      Prevent userspace races by waiting until the network
7382          *      device is fully set up before sending notifications.
7383          */
7384         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7385
7386         synchronize_net();
7387         err = 0;
7388 out:
7389         return err;
7390 }
7391 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
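
/* Usage sketch: move a device into another namespace under RTNL, falling
 * back to a "moved%d" name template if its current name is already taken
 * there.  Both the function name and the pattern string are illustrative.
 */
static int example_move_to_netns(struct net_device *dev, struct net *net)
{
        int err;

        rtnl_lock();
        err = dev_change_net_namespace(dev, net, "moved%d");
        rtnl_unlock();

        return err;
}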
7392
7393 static int dev_cpu_callback(struct notifier_block *nfb,
7394                             unsigned long action,
7395                             void *ocpu)
7396 {
7397         struct sk_buff **list_skb;
7398         struct sk_buff *skb;
7399         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7400         struct softnet_data *sd, *oldsd;
7401
7402         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7403                 return NOTIFY_OK;
7404
7405         local_irq_disable();
7406         cpu = smp_processor_id();
7407         sd = &per_cpu(softnet_data, cpu);
7408         oldsd = &per_cpu(softnet_data, oldcpu);
7409
7410         /* Find end of our completion_queue. */
7411         list_skb = &sd->completion_queue;
7412         while (*list_skb)
7413                 list_skb = &(*list_skb)->next;
7414         /* Append completion queue from offline CPU. */
7415         *list_skb = oldsd->completion_queue;
7416         oldsd->completion_queue = NULL;
7417
7418         /* Append output queue from offline CPU. */
7419         if (oldsd->output_queue) {
7420                 *sd->output_queue_tailp = oldsd->output_queue;
7421                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7422                 oldsd->output_queue = NULL;
7423                 oldsd->output_queue_tailp = &oldsd->output_queue;
7424         }
7425         /* Append NAPI poll list from offline CPU, with one exception:
7426          * process_backlog() must be called by the cpu owning the percpu backlog.
7427          * We properly handle process_queue & input_pkt_queue later.
7428          */
7429         while (!list_empty(&oldsd->poll_list)) {
7430                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7431                                                             struct napi_struct,
7432                                                             poll_list);
7433
7434                 list_del_init(&napi->poll_list);
7435                 if (napi->poll == process_backlog)
7436                         napi->state = 0;
7437                 else
7438                         ____napi_schedule(sd, napi);
7439         }
7440
7441         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7442         local_irq_enable();
7443
7444         /* Process offline CPU's input_pkt_queue */
7445         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7446                 netif_rx_ni(skb);
7447                 input_queue_head_incr(oldsd);
7448         }
7449         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7450                 netif_rx_ni(skb);
7451                 input_queue_head_incr(oldsd);
7452         }
7453
7454         return NOTIFY_OK;
7455 }
7456
7457
7458 /**
7459  *      netdev_increment_features - increment feature set by one
7460  *      @all: current feature set
7461  *      @one: new feature set
7462  *      @mask: mask feature set
7463  *
7464  *      Computes a new feature set after adding a device with feature set
7465  *      @one to the master device with current feature set @all.  Will not
7466  *      enable anything that is off in @mask. Returns the new feature set.
7467  */
7468 netdev_features_t netdev_increment_features(netdev_features_t all,
7469         netdev_features_t one, netdev_features_t mask)
7470 {
7471         if (mask & NETIF_F_GEN_CSUM)
7472                 mask |= NETIF_F_ALL_CSUM;
7473         mask |= NETIF_F_VLAN_CHALLENGED;
7474
7475         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7476         all &= one | ~NETIF_F_ALL_FOR_ALL;
7477
7478         /* If one device supports hw checksumming, set for all. */
7479         if (all & NETIF_F_GEN_CSUM)
7480                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7481
7482         return all;
7483 }
7484 EXPORT_SYMBOL(netdev_increment_features);
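
/* Sketch of the intended aggregation loop, loosely modelled on how a
 * bonding-style master might recompute its feature set from its lower
 * devices.  Starting from NETIF_F_ALL_FOR_ALL and masking with
 * master->hw_features are assumptions for the example.
 */
static netdev_features_t example_master_features(struct net_device *master)
{
        struct net_device *lower;
        struct list_head *iter;
        netdev_features_t features = NETIF_F_ALL_FOR_ALL;

        netdev_for_each_lower_dev(master, lower, iter)
                features = netdev_increment_features(features,
                                                     lower->features,
                                                     master->hw_features);
        return features;
}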
7485
7486 static struct hlist_head * __net_init netdev_create_hash(void)
7487 {
7488         int i;
7489         struct hlist_head *hash;
7490
7491         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7492         if (hash != NULL)
7493                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7494                         INIT_HLIST_HEAD(&hash[i]);
7495
7496         return hash;
7497 }
7498
7499 /* Initialize per network namespace state */
7500 static int __net_init netdev_init(struct net *net)
7501 {
7502         if (net != &init_net)
7503                 INIT_LIST_HEAD(&net->dev_base_head);
7504
7505         net->dev_name_head = netdev_create_hash();
7506         if (net->dev_name_head == NULL)
7507                 goto err_name;
7508
7509         net->dev_index_head = netdev_create_hash();
7510         if (net->dev_index_head == NULL)
7511                 goto err_idx;
7512
7513         return 0;
7514
7515 err_idx:
7516         kfree(net->dev_name_head);
7517 err_name:
7518         return -ENOMEM;
7519 }
7520
7521 /**
7522  *      netdev_drivername - network driver for the device
7523  *      @dev: network device
7524  *
7525  *      Determine network driver for device.
7526  */
7527 const char *netdev_drivername(const struct net_device *dev)
7528 {
7529         const struct device_driver *driver;
7530         const struct device *parent;
7531         const char *empty = "";
7532
7533         parent = dev->dev.parent;
7534         if (!parent)
7535                 return empty;
7536
7537         driver = parent->driver;
7538         if (driver && driver->name)
7539                 return driver->name;
7540         return empty;
7541 }
7542
7543 static void __netdev_printk(const char *level, const struct net_device *dev,
7544                             struct va_format *vaf)
7545 {
7546         if (dev && dev->dev.parent) {
7547                 dev_printk_emit(level[1] - '0',
7548                                 dev->dev.parent,
7549                                 "%s %s %s%s: %pV",
7550                                 dev_driver_string(dev->dev.parent),
7551                                 dev_name(dev->dev.parent),
7552                                 netdev_name(dev), netdev_reg_state(dev),
7553                                 vaf);
7554         } else if (dev) {
7555                 printk("%s%s%s: %pV",
7556                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7557         } else {
7558                 printk("%s(NULL net_device): %pV", level, vaf);
7559         }
7560 }
7561
7562 void netdev_printk(const char *level, const struct net_device *dev,
7563                    const char *format, ...)
7564 {
7565         struct va_format vaf;
7566         va_list args;
7567
7568         va_start(args, format);
7569
7570         vaf.fmt = format;
7571         vaf.va = &args;
7572
7573         __netdev_printk(level, dev, &vaf);
7574
7575         va_end(args);
7576 }
7577 EXPORT_SYMBOL(netdev_printk);
7578
7579 #define define_netdev_printk_level(func, level)                 \
7580 void func(const struct net_device *dev, const char *fmt, ...)   \
7581 {                                                               \
7582         struct va_format vaf;                                   \
7583         va_list args;                                           \
7584                                                                 \
7585         va_start(args, fmt);                                    \
7586                                                                 \
7587         vaf.fmt = fmt;                                          \
7588         vaf.va = &args;                                         \
7589                                                                 \
7590         __netdev_printk(level, dev, &vaf);                      \
7591                                                                 \
7592         va_end(args);                                           \
7593 }                                                               \
7594 EXPORT_SYMBOL(func);
7595
7596 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7597 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7598 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7599 define_netdev_printk_level(netdev_err, KERN_ERR);
7600 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7601 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7602 define_netdev_printk_level(netdev_info, KERN_INFO);
7603
7604 static void __net_exit netdev_exit(struct net *net)
7605 {
7606         kfree(net->dev_name_head);
7607         kfree(net->dev_index_head);
7608 }
7609
7610 static struct pernet_operations __net_initdata netdev_net_ops = {
7611         .init = netdev_init,
7612         .exit = netdev_exit,
7613 };
7614
7615 static void __net_exit default_device_exit(struct net *net)
7616 {
7617         struct net_device *dev, *aux;
7618         /*
7619          * Push all migratable network devices back to the
7620          * initial network namespace
7621          */
7622         rtnl_lock();
7623         for_each_netdev_safe(net, dev, aux) {
7624                 int err;
7625                 char fb_name[IFNAMSIZ];
7626
7627                 /* Ignore unmovable devices (e.g. loopback) */
7628                 if (dev->features & NETIF_F_NETNS_LOCAL)
7629                         continue;
7630
7631                 /* Leave virtual devices for the generic cleanup */
7632                 if (dev->rtnl_link_ops)
7633                         continue;
7634
7635                 /* Push remaining network devices to init_net */
7636                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7637                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7638                 if (err) {
7639                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7640                                  __func__, dev->name, err);
7641                         BUG();
7642                 }
7643         }
7644         rtnl_unlock();
7645 }
7646
7647 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7648 {
7649         /* Return with the rtnl_lock held when there are no network
7650          * devices unregistering in any network namespace in net_list.
7651          */
7652         struct net *net;
7653         bool unregistering;
7654         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7655
7656         add_wait_queue(&netdev_unregistering_wq, &wait);
7657         for (;;) {
7658                 unregistering = false;
7659                 rtnl_lock();
7660                 list_for_each_entry(net, net_list, exit_list) {
7661                         if (net->dev_unreg_count > 0) {
7662                                 unregistering = true;
7663                                 break;
7664                         }
7665                 }
7666                 if (!unregistering)
7667                         break;
7668                 __rtnl_unlock();
7669
7670                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7671         }
7672         remove_wait_queue(&netdev_unregistering_wq, &wait);
7673 }
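/*
 * Illustrative sketch (not part of dev.c): the generic wait_woken() idiom
 * used by rtnl_lock_unregistering() above - register on the wait queue
 * first, then re-check the condition in a loop so a wakeup that races with
 * the check is not lost.  "foo_waitq" and "foo_done" are hypothetical, and
 * locking around the condition is omitted for brevity; the block is kept
 * under "#if 0" so it is never built.
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(foo_waitq);
static bool foo_done;

static void foo_wait_until_done(void)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&foo_waitq, &wait);
	while (!foo_done)
		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	remove_wait_queue(&foo_waitq, &wait);
}
#endif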
7674
7675 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7676 {
7677         /* At exit all network devices must be removed from a network
7678          * namespace.  Do this in the reverse order of registration.
7679          * Do this across as many network namespaces as possible to
7680          * improve batching efficiency.
7681          */
7682         struct net_device *dev;
7683         struct net *net;
7684         LIST_HEAD(dev_kill_list);
7685
7686         /* To prevent network device cleanup code from dereferencing
7687          * loopback devices or network devices that have been freed,
7688          * wait here for all pending unregistrations to complete
7689          * before unregistering the loopback device and allowing the
7690          * network namespace to be freed.
7691          *
7692          * The netdev todo list containing all network device
7693          * unregistrations that happen in default_device_exit_batch
7694          * will run in the rtnl_unlock() at the end of
7695          * default_device_exit_batch.
7696          */
7697         rtnl_lock_unregistering(net_list);
7698         list_for_each_entry(net, net_list, exit_list) {
7699                 for_each_netdev_reverse(net, dev) {
7700                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7701                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7702                         else
7703                                 unregister_netdevice_queue(dev, &dev_kill_list);
7704                 }
7705         }
7706         unregister_netdevice_many(&dev_kill_list);
7707         rtnl_unlock();
7708 }
7709
7710 static struct pernet_operations __net_initdata default_device_ops = {
7711         .exit = default_device_exit,
7712         .exit_batch = default_device_exit_batch,
7713 };
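/*
 * Illustrative sketch (not part of dev.c): the pernet_operations pattern
 * used by netdev_net_ops and default_device_ops above.  A subsystem
 * registers one init/exit pair and the core then calls them for every
 * network namespace that is created or destroyed.  The "foo" names are
 * hypothetical; the block is kept under "#if 0" so it is never built.
 */
#if 0
static int __net_init foo_net_init(struct net *net)
{
	/* Allocate and attach per-namespace state; 0 means success. */
	return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
	/* Release whatever foo_net_init() set up for this namespace. */
}

static struct pernet_operations foo_net_ops = {
	.init = foo_net_init,
	.exit = foo_net_exit,
};

static int __init foo_subsys_init(void)
{
	return register_pernet_subsys(&foo_net_ops);
}
#endif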
7714
7715 /*
7716  *      Initialize the DEV module. At boot time this walks the device list and
7717  *      unhooks any devices that fail to initialise (normally hardware not
7718  *      present) and leaves us with a valid list of present and active devices.
7719  *
7720  */
7721
7722 /*
7723  *       This is called single threaded during boot, so no need
7724  *       to take the rtnl semaphore.
7725  */
7726 static int __init net_dev_init(void)
7727 {
7728         int i, rc = -ENOMEM;
7729
7730         BUG_ON(!dev_boot_phase);
7731
7732         if (dev_proc_init())
7733                 goto out;
7734
7735         if (netdev_kobject_init())
7736                 goto out;
7737
7738         INIT_LIST_HEAD(&ptype_all);
7739         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7740                 INIT_LIST_HEAD(&ptype_base[i]);
7741
7742         INIT_LIST_HEAD(&offload_base);
7743
7744         if (register_pernet_subsys(&netdev_net_ops))
7745                 goto out;
7746
7747         /*
7748          *      Initialise the packet receive queues.
7749          */
7750
7751         for_each_possible_cpu(i) {
7752                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7753
7754                 skb_queue_head_init(&sd->input_pkt_queue);
7755                 skb_queue_head_init(&sd->process_queue);
7756                 INIT_LIST_HEAD(&sd->poll_list);
7757                 sd->output_queue_tailp = &sd->output_queue;
7758 #ifdef CONFIG_RPS
7759                 sd->csd.func = rps_trigger_softirq;
7760                 sd->csd.info = sd;
7761                 sd->cpu = i;
7762 #endif
7763
7764                 sd->backlog.poll = process_backlog;
7765                 sd->backlog.weight = weight_p;
7766         }
7767
7768         dev_boot_phase = 0;
7769
7770         /* The loopback device is special: if any other network device
7771          * is present in a network namespace, the loopback device must
7772          * be present. Since we now dynamically allocate and free the
7773          * loopback device, ensure this invariant is maintained by
7774          * keeping the loopback device as the first device on the
7775          * list of network devices, so that the loopback device is
7776          * the first device that appears and the last network device
7777          * that disappears.
7778          */
7779         if (register_pernet_device(&loopback_net_ops))
7780                 goto out;
7781
7782         if (register_pernet_device(&default_device_ops))
7783                 goto out;
7784
7785         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7786         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7787
7788         hotcpu_notifier(dev_cpu_callback, 0);
7789         dst_subsys_init();
7790         rc = 0;
7791 out:
7792         return rc;
7793 }
7794
7795 subsys_initcall(net_dev_init);
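/*
 * Illustrative sketch (not part of dev.c): the initcall registration pattern
 * used above.  Setup code that must run once, single threaded, during boot
 * and before built-in device drivers initialize can hook itself in with
 * subsys_initcall() just like net_dev_init().  "foo_core_init" is
 * hypothetical; the block is kept under "#if 0" so it is never built.
 */
#if 0
static int __init foo_core_init(void)
{
	/* Runs after core initcalls and before device/module initcalls. */
	return 0;
}
subsys_initcall(foo_core_init);
#endif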